2 files changed, 33 insertions, 6 deletions
diff --git a/libs/minikin/Hyphenator.cpp b/libs/minikin/Hyphenator.cpp
index c50b386..3eb151b 100644
--- a/libs/minikin/Hyphenator.cpp
+++ b/libs/minikin/Hyphenator.cpp
@@ -16,9 +16,9 @@
 
 #include <vector>
 #include <memory>
-#include <cctype>
 #include <algorithm>
 #include <string>
+#include <unicode/uchar.h>
 
 // HACK: for reading pattern file
 #include <fcntl.h>
@@ -95,8 +95,19 @@ void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t
                     hyphenateSoft(result, word, len);
                     return;
                 }
-                // TODO: use locale-sensitive case folding from ICU.
-                c = tolower(c);
+                // TODO: This uses ICU's simple character to character lowercasing, which ignores
+                // the locale, and ignores cases when lowercasing a character results in more than
+                // one character. It should be fixed to consider the locale (in order for it to work
+                // correctly for Turkish and Azerbaijani), as well as support one-to-many, and
+                // many-to-many case conversions (including non-BMP cases).
+                if (c < 0x00C0) { // U+00C0 is the lowest uppercase non-ASCII character
+                    // Convert uppercase ASCII to lowercase ASCII, but keep other characters as-is
+                    if (0x0041 <= c && c <= 0x005A) {
+                        c += 0x0020;
+                    }
+                } else {
+                    c = u_tolower(c);
+                }
             }
             auto search = node->succ.find(c);
             if (search != node->succ.end()) {
diff --git a/libs/minikin/LineBreaker.cpp b/libs/minikin/LineBreaker.cpp
index bf8be26..72e5c18 100644
--- a/libs/minikin/LineBreaker.cpp
+++ b/libs/minikin/LineBreaker.cpp
@@ -29,9 +29,7 @@ using std::vector;
 namespace android {
 
 const int CHAR_TAB = 0x0009;
-const uint16_t CHAR_HYPHEN_MINUS = 0x002D;
 const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
-const uint16_t CHAR_HYPHEN = 0x2010;
 
 // Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these
 // constants are larger than any reasonable actual width score.
@@ -107,6 +105,24 @@ static bool isLineEndSpace(uint16_t c) {
             c == 0x205F || c == 0x3000;
 }
 
+// Returns true if |c| is one of the characters that behave like U+2010 HYPHEN
+// in line breaking and usage: a line break is allowed immediately after the
+// character, but a word containing it should not be automatically hyphenated.
+// The set is curated by hand: it was built by inspecting every character with
+// the Unicode line breaking property BA or HY and keeping those that are
+// hyphens. |c| is a single 16-bit value, so only U+0000..U+FFFF can match.
+static bool isLineBreakingHyphen(uint16_t c) {
+    return (c == 0x002D || // HYPHEN-MINUS
+            c == 0x058A || // ARMENIAN HYPHEN
+            c == 0x05BE || // HEBREW PUNCTUATION MAQAF
+            c == 0x1400 || // CANADIAN SYLLABICS HYPHEN
+            c == 0x2010 || // HYPHEN
+            c == 0x2013 || // EN DASH
+            c == 0x2027 || // HYPHENATION POINT
+            c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN
+            c == 0x2E40);  // DOUBLE HYPHEN
+}
+
 // Ordinarily, this method measures the text in the range given. However, when paint
 // is nullptr, it assumes the widths have already been calculated and stored in the
 // width buffer.
@@ -163,7 +179,7 @@ float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typefa
             if (c != CHAR_SOFT_HYPHEN) {
                 // TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so
                 // we can pass the whole word down to Hyphenator like the soft hyphen case.
-                bool wordEndsInHyphen = (c == CHAR_HYPHEN_MINUS || c == CHAR_HYPHEN);
+                bool wordEndsInHyphen = isLineBreakingHyphen(c);
                 if (paint != nullptr && mHyphenator != nullptr &&
                         mHyphenationFrequency != kHyphenationFrequency_None &&
                         !wordEndsInHyphen && !temporarilySkipHyphenation &&