diff options
-rw-r--r-- | libs/minikin/Hyphenator.cpp | 17 |
-rw-r--r-- | libs/minikin/LineBreaker.cpp | 22 |
2 files changed, 33 insertions, 6 deletions
diff --git a/libs/minikin/Hyphenator.cpp b/libs/minikin/Hyphenator.cpp index c50b386..3eb151b 100644 --- a/libs/minikin/Hyphenator.cpp +++ b/libs/minikin/Hyphenator.cpp @@ -16,9 +16,9 @@ #include <vector> #include <memory> -#include <cctype> #include <algorithm> #include <string> +#include <unicode/uchar.h> // HACK: for reading pattern file #include <fcntl.h> @@ -95,8 +95,19 @@ void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t hyphenateSoft(result, word, len); return; } - // TODO: use locale-sensitive case folding from ICU. - c = tolower(c); + // TODO: This uses ICU's simple character to character lowercasing, which ignores + // the locale, and ignores cases when lowercasing a character results in more than + // one character. It should be fixed to consider the locale (in order for it to work + // correctly for Turkish and Azerbaijani), as well as support one-to-many, and + // many-to-many case conversions (including non-BMP cases). + if (c < 0x00C0) { // U+00C0 is the lowest uppercase non-ASCII character + // Convert uppercase ASCII to lowercase ASCII, but keep other characters as-is + if (0x0041 <= c && c <= 0x005A) { + c += 0x0020; + } + } else { + c = u_tolower(c); + } } auto search = node->succ.find(c); if (search != node->succ.end()) { diff --git a/libs/minikin/LineBreaker.cpp b/libs/minikin/LineBreaker.cpp index bf8be26..72e5c18 100644 --- a/libs/minikin/LineBreaker.cpp +++ b/libs/minikin/LineBreaker.cpp @@ -29,9 +29,7 @@ using std::vector; namespace android { const int CHAR_TAB = 0x0009; -const uint16_t CHAR_HYPHEN_MINUS = 0x002D; const uint16_t CHAR_SOFT_HYPHEN = 0x00AD; -const uint16_t CHAR_HYPHEN = 0x2010; // Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these // constants are larger than any reasonable actual width score. 
@@ -107,6 +105,24 @@ static bool isLineEndSpace(uint16_t c) { c == 0x205F || c == 0x3000; } +// This function determines whether a character is like U+2010 HYPHEN in +// line breaking and usage: a character immediately after which line breaks +// are allowed, but words containing it should not be automatically +// hyphenated. This is a curated set, created by manually inspecting all +// the characters that have the Unicode line breaking property of BA or HY +// and seeing which ones are hyphens. +static bool isLineBreakingHyphen(uint16_t c) { + return (c == 0x002D || // HYPHEN-MINUS + c == 0x058A || // ARMENIAN HYPHEN + c == 0x05BE || // HEBREW PUNCTUATION MAQAF + c == 0x1400 || // CANADIAN SYLLABICS HYPHEN + c == 0x2010 || // HYPHEN + c == 0x2013 || // EN DASH + c == 0x2027 || // HYPHENATION POINT + c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN + c == 0x2E40); // DOUBLE HYPHEN +} + // Ordinarily, this method measures the text in the range given. However, when paint // is nullptr, it assumes the widths have already been calculated and stored in the // width buffer. @@ -163,7 +179,7 @@ float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typefa if (c != CHAR_SOFT_HYPHEN) { // TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so // we can pass the whole word down to Hyphenator like the soft hyphen case. - bool wordEndsInHyphen = (c == CHAR_HYPHEN_MINUS || c == CHAR_HYPHEN); + bool wordEndsInHyphen = isLineBreakingHyphen(c); if (paint != nullptr && mHyphenator != nullptr && mHyphenationFrequency != kHyphenationFrequency_None && !wordEndsInHyphen && !temporarilySkipHyphenation && |