summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- libs/minikin/Hyphenator.cpp 17
-rw-r--r-- libs/minikin/LineBreaker.cpp 22
2 files changed, 33 insertions, 6 deletions
diff --git a/libs/minikin/Hyphenator.cpp b/libs/minikin/Hyphenator.cpp
index c50b386..3eb151b 100644
--- a/libs/minikin/Hyphenator.cpp
+++ b/libs/minikin/Hyphenator.cpp
@@ -16,9 +16,9 @@
#include <vector>
#include <memory>
-#include <cctype>
#include <algorithm>
#include <string>
+#include <unicode/uchar.h>
// HACK: for reading pattern file
#include <fcntl.h>
@@ -95,8 +95,19 @@ void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t
hyphenateSoft(result, word, len);
return;
}
- // TODO: use locale-sensitive case folding from ICU.
- c = tolower(c);
+ // TODO: This uses ICU's simple character-to-character lowercasing, which ignores
+ // the locale and ignores cases where lowercasing a character yields more than
+ // one character. It should be fixed to consider the locale (so that it works
+ // correctly for Turkish and Azerbaijani), and to support one-to-many and
+ // many-to-many case conversions (including non-BMP cases).
+ if (c < 0x00C0) { // U+00C0 is the lowest uppercase non-ASCII character
+ // Convert uppercase ASCII to lowercase ASCII, but keep other characters as-is
+ if (0x0041 <= c && c <= 0x005A) {
+ c += 0x0020;
+ }
+ } else {
+ c = u_tolower(c);
+ }
}
auto search = node->succ.find(c);
if (search != node->succ.end()) {
diff --git a/libs/minikin/LineBreaker.cpp b/libs/minikin/LineBreaker.cpp
index bf8be26..72e5c18 100644
--- a/libs/minikin/LineBreaker.cpp
+++ b/libs/minikin/LineBreaker.cpp
@@ -29,9 +29,7 @@ using std::vector;
namespace android {
const int CHAR_TAB = 0x0009;
-const uint16_t CHAR_HYPHEN_MINUS = 0x002D;
const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
-const uint16_t CHAR_HYPHEN = 0x2010;
// Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these
// constants are larger than any reasonable actual width score.
@@ -107,6 +105,24 @@ static bool isLineEndSpace(uint16_t c) {
c == 0x205F || c == 0x3000;
}
+// Returns true if c is one of the hyphen-like code points listed below, i.e. a
+// character that behaves like U+2010 HYPHEN in line breaking and usage: a line
+// break is allowed immediately after it, but a word containing it should not be
+// automatically hyphenated. The set is curated: it was built by manually
+// inspecting every character whose Unicode line breaking property is BA or HY
+// and keeping the ones that are hyphens. c is 16 bits wide (values <= 0xFFFF).
+static bool isLineBreakingHyphen(uint16_t c) {
+ return (c == 0x002D || // HYPHEN-MINUS
+ c == 0x058A || // ARMENIAN HYPHEN
+ c == 0x05BE || // HEBREW PUNCTUATION MAQAF
+ c == 0x1400 || // CANADIAN SYLLABICS HYPHEN
+ c == 0x2010 || // HYPHEN
+ c == 0x2013 || // EN DASH
+ c == 0x2027 || // HYPHENATION POINT
+ c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN
+ c == 0x2E40); // DOUBLE HYPHEN
+}
+
// Ordinarily, this method measures the text in the range given. However, when paint
// is nullptr, it assumes the widths have already been calculated and stored in the
// width buffer.
@@ -163,7 +179,7 @@ float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typefa
if (c != CHAR_SOFT_HYPHEN) {
// TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so
// we can pass the whole word down to Hyphenator like the soft hyphen case.
- bool wordEndsInHyphen = (c == CHAR_HYPHEN_MINUS || c == CHAR_HYPHEN);
+ bool wordEndsInHyphen = isLineBreakingHyphen(c);
if (paint != nullptr && mHyphenator != nullptr &&
mHyphenationFrequency != kHyphenationFrequency_None &&
!wordEndsInHyphen && !temporarilySkipHyphenation &&