summaryrefslogtreecommitdiffstats
path: root/libs
diff options
context:
space:
mode:
authorRaph Levien <raph@google.com>2015-03-18 23:04:28 -0700
committerRaph Levien <raph@google.com>2015-03-30 09:15:53 -0700
commitdaf6a6bdbf2ff1f66496d6200cb253e2f50759d5 (patch)
tree20b32ff4bddb7fd70b72a734fea4d9ac5458dd51 /libs
parent01f526614431e3a0a6e1a48039e00b8a9b7d6fbf (diff)
downloadandroid_frameworks_minikin-daf6a6bdbf2ff1f66496d6200cb253e2f50759d5.tar.gz
android_frameworks_minikin-daf6a6bdbf2ff1f66496d6200cb253e2f50759d5.tar.bz2
android_frameworks_minikin-daf6a6bdbf2ff1f66496d6200cb253e2f50759d5.zip
Add hyphenation to line breaking
This patch adds hyphenation using the Liang hyphenation algorithm, similar to TeX. It also improves the optimized line breaker so that it works correctly and efficiently even when the line width is not constant (there is a specialization for constant width, which is probably worthwhile, but performance TODOs remain). Still to be done: * hyphenator has many shortcuts, only tested with English * interaction between punctuation and hyphenation is problematic Change-Id: I2d94a1668ebc536398b7c43fcf486333eeb7c6aa
Diffstat (limited to 'libs')
-rw-r--r--libs/minikin/Android.mk1
-rw-r--r--libs/minikin/Hyphenator.cpp152
-rw-r--r--libs/minikin/LineBreaker.cpp196
3 files changed, 313 insertions, 36 deletions
diff --git a/libs/minikin/Android.mk b/libs/minikin/Android.mk
index 5406824..34b535c 100644
--- a/libs/minikin/Android.mk
+++ b/libs/minikin/Android.mk
@@ -22,6 +22,7 @@ LOCAL_SRC_FILES := \
FontCollection.cpp \
FontFamily.cpp \
GraphemeBreak.cpp \
+ Hyphenator.cpp \
Layout.cpp \
LineBreaker.cpp \
MinikinInternal.cpp \
diff --git a/libs/minikin/Hyphenator.cpp b/libs/minikin/Hyphenator.cpp
new file mode 100644
index 0000000..c50b386
--- /dev/null
+++ b/libs/minikin/Hyphenator.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vector>
+#include <memory>
+#include <cctype>
+#include <algorithm>
+#include <string>
+
+// HACK: for reading pattern file
+#include <fcntl.h>
+
+#define LOG_TAG "Minikin"
+#include "utils/Log.h"
+
+#include "minikin/Hyphenator.h"
+
+using std::vector;
+
+namespace android {
+
+static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
+
+void Hyphenator::addPattern(const uint16_t* pattern, size_t size) {
+ vector<uint16_t> word;
+ vector<uint8_t> result;
+
+ // start by parsing the Liang-format pattern into a word and a result vector, the
+ // vector right-aligned but without leading zeros. Examples:
+ // a1bc2d -> abcd [1, 0, 2, 0]
+ // abc1 -> abc [1]
+ // 1a2b3c4d5 -> abcd [1, 2, 3, 4, 5]
+ bool lastWasLetter = false;
+ bool haveSeenNumber = false;
+ for (size_t i = 0; i < size; i++) {
+ uint16_t c = pattern[i];
+ if (isdigit(c)) {
+ result.push_back(c - '0');
+ lastWasLetter = false;
+ haveSeenNumber = true;
+ } else {
+ word.push_back(c);
+ if (lastWasLetter && haveSeenNumber) {
+ result.push_back(0);
+ }
+ lastWasLetter = true;
+ }
+ }
+ if (lastWasLetter) {
+ result.push_back(0);
+ }
+ Trie* t = &root;
+ for (size_t i = 0; i < word.size(); i++) {
+ t = &t->succ[word[i]];
+ }
+ t->result = result;
+}
+
+// If any soft hyphen is present in the word, use soft hyphens to decide hyphenation,
+// as recommended in UAX #14 (Use of Soft Hyphen)
+void Hyphenator::hyphenateSoft(vector<uint8_t>* result, const uint16_t* word, size_t len) {
+ (*result)[0] = 0;
+ for (size_t i = 1; i < len; i++) {
+ (*result)[i] = word[i - 1] == CHAR_SOFT_HYPHEN;
+ }
+}
+
+void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t len) {
+ result->clear();
+ result->resize(len);
+ if (len < MIN_PREFIX + MIN_SUFFIX) return;
+ size_t maxOffset = len - MIN_SUFFIX + 1;
+ for (size_t i = 0; i < len + 1; i++) {
+ const Trie* node = &root;
+ for (size_t j = i; j < len + 2; j++) {
+ uint16_t c;
+ if (j == 0 || j == len + 1) {
+ c = '.'; // word boundary character in pattern data files
+ } else {
+ c = word[j - 1];
+ if (c == CHAR_SOFT_HYPHEN) {
+ hyphenateSoft(result, word, len);
+ return;
+ }
+ // TODO: use locale-sensitive case folding from ICU.
+ c = tolower(c);
+ }
+ auto search = node->succ.find(c);
+ if (search != node->succ.end()) {
+ node = &search->second;
+ } else {
+ break;
+ }
+ if (!node->result.empty()) {
+ int resultLen = node->result.size();
+ int offset = j + 1 - resultLen;
+ int start = std::max(MIN_PREFIX - offset, 0);
+ int end = std::min(resultLen, (int)maxOffset - offset);
+ // TODO performance: this inner loop can profitably be optimized
+ for (int k = start; k < end; k++) {
+ (*result)[offset + k] = std::max((*result)[offset + k], node->result[k]);
+ }
+#if 0
+ // debug printing of matched patterns
+ std::string dbg;
+ for (size_t k = i; k <= j + 1; k++) {
+ int off = k - j - 2 + resultLen;
+ if (off >= 0 && node->result[off] != 0) {
+ dbg.push_back((char)('0' + node->result[off]));
+ }
+ if (k < j + 1) {
+ uint16_t c = (k == 0 || k == len + 1) ? '.' : word[k - 1];
+ dbg.push_back((char)c);
+ }
+ }
+ ALOGD("%d:%d %s", i, j, dbg.c_str());
+#endif
+ }
+ }
+ }
+ // Since the above calculation does not modify values outside
+ // [MIN_PREFIX, len - MIN_SUFFIX], they are left as 0.
+ for (size_t i = MIN_PREFIX; i < maxOffset; i++) {
+ (*result)[i] &= 1;
+ }
+}
+
+Hyphenator* Hyphenator::load(const uint16_t *patternData, size_t size) {
+ Hyphenator* result = new Hyphenator;
+ for (size_t i = 0; i < size; i++) {
+ size_t end = i;
+ while (patternData[end] != '\n') end++;
+ result->addPattern(patternData + i, end - i);
+ i = end;
+ }
+ return result;
+}
+
+} // namespace android
diff --git a/libs/minikin/LineBreaker.cpp b/libs/minikin/LineBreaker.cpp
index 99d7f69..f7f3fb3 100644
--- a/libs/minikin/LineBreaker.cpp
+++ b/libs/minikin/LineBreaker.cpp
@@ -29,6 +29,7 @@ using std::vector;
namespace android {
const int CHAR_TAB = 0x0009;
+const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
// Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these
// constants are larger than any reasonable actual width score.
@@ -40,6 +41,16 @@ const float SCORE_DESPERATE = 1e10f;
// to avoid allocation.
const size_t MAX_TEXT_BUF_RETAIN = 32678;
+void LineBreaker::setLocale(const icu::Locale& locale, Hyphenator* hyphenator) {
+ delete mBreakIterator;
+ UErrorCode status = U_ZERO_ERROR;
+ mBreakIterator = icu::BreakIterator::createLineInstance(locale, status);
+ // TODO: check status
+
+ // TODO: load actual resource dependent on locale; letting Minikin do it is a hack
+ mHyphenator = hyphenator;
+}
+
void LineBreaker::setText() {
UErrorCode status = U_ZERO_ERROR;
utext_openUChars(&mUText, mTextBuf.data(), mTextBuf.size(), &status);
@@ -49,7 +60,7 @@ void LineBreaker::setText() {
// handle initial break here because addStyleRun may never be called
mBreakIterator->next();
mCandidates.clear();
- Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0};
+ Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0};
mCandidates.push_back(cand);
// reset greedy breaker state
@@ -64,7 +75,6 @@ void LineBreaker::setText() {
}
void LineBreaker::setLineWidths(float firstWidth, int firstWidthLineCount, float restWidth) {
- ALOGD("width %f", firstWidth);
mLineWidths.setWidths(firstWidth, firstWidthLineCount, restWidth);
}
@@ -81,22 +91,29 @@ static bool isLineEndSpace(uint16_t c) {
// width buffer.
// This method finds the candidate word breaks (using the ICU break iterator) and sends them
// to addCandidate.
-float LineBreaker::addStyleRun(const MinikinPaint* paint, const FontCollection* typeface,
+float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typeface,
FontStyle style, size_t start, size_t end, bool isRtl) {
Layout layout; // performance TODO: move layout to self object to reduce allocation cost?
float width = 0.0f;
int bidiFlags = isRtl ? kBidi_Force_RTL : kBidi_Force_LTR;
+ float hyphenPenalty = 0.0;
if (paint != nullptr) {
layout.setFontCollection(typeface);
layout.doLayout(mTextBuf.data(), start, end - start, mTextBuf.size(), bidiFlags, style,
*paint);
layout.getAdvances(mCharWidths.data() + start);
width = layout.getAdvance();
+
+ // a heuristic that seems to perform well
+ hyphenPenalty = 0.5 * paint->size * paint->scaleX * mLineWidths.getLineWidth(0);
}
- ParaWidth postBreak = mWidth;
size_t current = (size_t)mBreakIterator->current();
+ size_t wordEnd = start;
+ size_t lastBreak = start;
+ ParaWidth lastBreakWidth = mWidth;
+ ParaWidth postBreak = mWidth;
for (size_t i = start; i < end; i++) {
uint16_t c = mTextBuf[i];
if (c == CHAR_TAB) {
@@ -110,14 +127,48 @@ float LineBreaker::addStyleRun(const MinikinPaint* paint, const FontCollection*
mWidth += mCharWidths[i];
if (!isLineEndSpace(c)) {
postBreak = mWidth;
+ wordEnd = i + 1;
}
}
if (i + 1 == current) {
- // TODO: hyphenation goes here
-
- // Skip break for zero-width characters.
- if (current == mTextBuf.size() || mCharWidths[current] > 0) {
- addWordBreak(current, mWidth, postBreak, 0);
+ // Override ICU's treatment of soft hyphen as a break opportunity, because we want it
+ // to be a hyphen break, with penalty and drawing behavior.
+ if (c != CHAR_SOFT_HYPHEN) {
+ if (paint != nullptr && mHyphenator != nullptr && wordEnd > lastBreak) {
+ mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[lastBreak], wordEnd - lastBreak);
+ #if VERBOSE_DEBUG
+ std::string hyphenatedString;
+ for (size_t j = lastBreak; j < wordEnd; j++) {
+ if (mHyphBuf[j - lastBreak]) hyphenatedString.push_back('-');
+ // Note: only works with ASCII, should do UTF-8 conversion here
+ hyphenatedString.push_back(buffer()[j]);
+ }
+ ALOGD("hyphenated string: %s", hyphenatedString.c_str());
+ #endif
+
+ // measure hyphenated substrings
+ for (size_t j = lastBreak; j < wordEnd; j++) {
+ uint8_t hyph = mHyphBuf[j - lastBreak];
+ if (hyph) {
+ paint->hyphenEdit = hyph;
+ layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak,
+ mTextBuf.size(), bidiFlags, style, *paint);
+ ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance();
+ paint->hyphenEdit = 0;
+ layout.doLayout(mTextBuf.data(), j, wordEnd - j,
+ mTextBuf.size(), bidiFlags, style, *paint);
+ ParaWidth hyphPreBreak = postBreak - layout.getAdvance();
+ addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph);
+ }
+ }
+ }
+
+ // Skip break for zero-width characters.
+ if (current == mTextBuf.size() || mCharWidths[current] > 0) {
+ addWordBreak(current, mWidth, postBreak, 0.0, 0);
+ }
+ lastBreak = current;
+ lastBreakWidth = mWidth;
}
current = (size_t)mBreakIterator->next();
}
@@ -129,7 +180,7 @@ float LineBreaker::addStyleRun(const MinikinPaint* paint, const FontCollection*
// add a word break (possibly for a hyphenated fragment), and add desperate breaks if
// needed (ie when word exceeds current line width)
void LineBreaker::addWordBreak(size_t offset, ParaWidth preBreak, ParaWidth postBreak,
- float penalty) {
+ float penalty, uint8_t hyph) {
Candidate cand;
ParaWidth width = mCandidates.back().preBreak;
if (postBreak - width > currentLineWidth()) {
@@ -145,6 +196,7 @@ void LineBreaker::addWordBreak(size_t offset, ParaWidth preBreak, ParaWidth post
cand.preBreak = width;
cand.postBreak = width;
cand.penalty = SCORE_DESPERATE;
+ cand.hyphenEdit = 0;
#if VERBOSE_DEBUG
ALOGD("desperate cand: %d %g:%g",
mCandidates.size(), cand.postBreak, cand.preBreak);
@@ -159,12 +211,24 @@ void LineBreaker::addWordBreak(size_t offset, ParaWidth preBreak, ParaWidth post
cand.preBreak = preBreak;
cand.postBreak = postBreak;
cand.penalty = penalty;
+ cand.hyphenEdit = hyph;
#if VERBOSE_DEBUG
ALOGD("cand: %d %g:%g", mCandidates.size(), cand.postBreak, cand.preBreak);
#endif
addCandidate(cand);
}
+// TODO: for justified text, refine with shrink/stretch
+float LineBreaker::computeScore(float delta, bool atEnd) {
+ if (delta < 0) {
+ return SCORE_OVERFULL;
+ } else if (atEnd && mStrategy != kBreakStrategy_Balanced) {
+ return 0.0;
+ } else {
+ return delta * delta;
+ }
+}
+
// TODO performance: could avoid populating mCandidates if greedy only
void LineBreaker::addCandidate(Candidate cand) {
size_t candIndex = mCandidates.size();
@@ -174,10 +238,8 @@ void LineBreaker::addCandidate(Candidate cand) {
if (mBestBreak == mLastBreak) {
mBestBreak = candIndex;
}
- mBreaks.push_back(mCandidates[mBestBreak].offset);
- mWidths.push_back(mCandidates[mBestBreak].postBreak - mPreBreak);
- mFlags.push_back(mFirstTabIndex < mBreaks.back());
- mFirstTabIndex = INT_MAX;
+ pushBreak(mCandidates[mBestBreak].offset, mCandidates[mBestBreak].postBreak - mPreBreak,
+ mCandidates[mBestBreak].hyphenEdit);
mBestScore = SCORE_INFTY;
#if VERBOSE_DEBUG
ALOGD("break: %d %g", mBreaks.back(), mWidths.back());
@@ -191,6 +253,15 @@ void LineBreaker::addCandidate(Candidate cand) {
}
}
+void LineBreaker::pushBreak(int offset, float width, uint8_t hyph) {
+ mBreaks.push_back(offset);
+ mWidths.push_back(width);
+ int flags = (mFirstTabIndex < mBreaks.back()) << kTab_Shift;
+ flags |= hyph;
+ mFlags.push_back(flags);
+ mFirstTabIndex = INT_MAX;
+}
+
void LineBreaker::addReplacement(size_t start, size_t end, float width) {
mCharWidths[start] = width;
std::fill(&mCharWidths[start + 1], &mCharWidths[end], 0.0f);
@@ -205,27 +276,87 @@ void LineBreaker::computeBreaksGreedy() {
// All breaks but the last have been added in addCandidate already.
size_t nCand = mCandidates.size();
if (nCand == 1 || mLastBreak != nCand - 1) {
- mBreaks.push_back(mCandidates[nCand - 1].offset);
- mWidths.push_back(mCandidates[nCand - 1].postBreak - mPreBreak);
- mFlags.push_back(mFirstTabIndex < mBreaks.back());
- // don't need to update mFirstTabIndex or mBestScore, because we're done
+ pushBreak(mCandidates[nCand - 1].offset, mCandidates[nCand - 1].postBreak - mPreBreak, 0);
+ // don't need to update mBestScore, because we're done
#if VERBOSE_DEBUG
ALOGD("final break: %d %g", mBreaks.back(), mWidths.back());
#endif
}
}
-void LineBreaker::computeBreaksOpt() {
+// Follow "prev" links in mCandidates array, and copy to result arrays.
+void LineBreaker::finishBreaksOptimal() {
// clear existing greedy break result
mBreaks.clear();
mWidths.clear();
mFlags.clear();
+ size_t nCand = mCandidates.size();
+ size_t prev;
+ for (size_t i = nCand - 1; i > 0; i = prev) {
+ prev = mCandidates[i].prev;
+ mBreaks.push_back(mCandidates[i].offset);
+ mWidths.push_back(mCandidates[i].postBreak - mCandidates[prev].preBreak);
+ mFlags.push_back(mCandidates[i].hyphenEdit);
+ }
+ std::reverse(mBreaks.begin(), mBreaks.end());
+ std::reverse(mWidths.begin(), mWidths.end());
+ std::reverse(mFlags.begin(), mFlags.end());
+}
+
+void LineBreaker::computeBreaksOptimal() {
+ size_t active = 0;
+ size_t nCand = mCandidates.size();
+ for (size_t i = 1; i < nCand; i++) {
+ bool atEnd = i == nCand - 1;
+ float best = SCORE_INFTY;
+ size_t bestPrev = 0;
+
+ size_t lineNumberLast = mCandidates[active].lineNumber;
+ float width = mLineWidths.getLineWidth(lineNumberLast);
+ ParaWidth leftEdge = mCandidates[i].postBreak - width;
+ float bestHope = 0;
+
+ for (size_t j = active; j < i; j++) {
+ size_t lineNumber = mCandidates[j].lineNumber;
+ if (lineNumber != lineNumberLast) {
+ float widthNew = mLineWidths.getLineWidth(lineNumber);
+ if (widthNew != width) {
+ leftEdge = mCandidates[i].postBreak - width;
+ bestHope = 0;
+ width = widthNew;
+ }
+ lineNumberLast = lineNumber;
+ }
+ float jScore = mCandidates[j].score;
+ if (jScore + bestHope >= best) continue;
+ float delta = mCandidates[j].preBreak - leftEdge;
+
+ float widthScore = computeScore(delta, atEnd);
+ if (delta < 0) {
+ active = j + 1;
+ } else {
+ bestHope = widthScore;
+ }
+
+ float score = jScore + widthScore;
+ if (score <= best) {
+ best = score;
+ bestPrev = j;
+ }
+ }
+ mCandidates[i].score = best + mCandidates[i].penalty;
+ mCandidates[i].prev = bestPrev;
+ mCandidates[i].lineNumber = mCandidates[bestPrev].lineNumber + 1;
+ }
+ finishBreaksOptimal();
+}
+
+void LineBreaker::computeBreaksOptimalRect() {
size_t active = 0;
size_t nCand = mCandidates.size();
float width = mLineWidths.getLineWidth(0);
- // TODO: actually support non-constant width
for (size_t i = 1; i < nCand; i++) {
- bool stretchIsFree = mStrategy != kBreakStrategy_Balanced && i == nCand - 1;
+ bool atEnd = i == nCand - 1;
float best = SCORE_INFTY;
size_t bestPrev = 0;
@@ -235,17 +366,15 @@ void LineBreaker::computeBreaksOpt() {
ParaWidth leftEdge = mCandidates[i].postBreak - width;
for (size_t j = active; j < i; j++) {
+ // TODO performance: can break if bestHope >= best; worth it?
float jScore = mCandidates[j].score;
if (jScore + bestHope >= best) continue;
float delta = mCandidates[j].preBreak - leftEdge;
- // TODO: for justified text, refine with shrink/stretch
- float widthScore;
+ float widthScore = computeScore(delta, atEnd);
if (delta < 0) {
- widthScore = SCORE_OVERFULL;
active = j + 1;
} else {
- widthScore = stretchIsFree ? 0 : delta * delta;
bestHope = widthScore;
}
@@ -258,23 +387,16 @@ void LineBreaker::computeBreaksOpt() {
mCandidates[i].score = best + mCandidates[i].penalty;
mCandidates[i].prev = bestPrev;
}
- size_t prev;
- for (size_t i = nCand - 1; i > 0; i = prev) {
- prev = mCandidates[i].prev;
- mBreaks.push_back(mCandidates[i].offset);
- mWidths.push_back(mCandidates[i].postBreak - mCandidates[prev].preBreak);
- mFlags.push_back(0);
- }
- std::reverse(mBreaks.begin(), mBreaks.end());
- std::reverse(mWidths.begin(), mWidths.end());
- std::reverse(mFlags.begin(), mFlags.end());
+ finishBreaksOptimal();
}
size_t LineBreaker::computeBreaks() {
if (mStrategy == kBreakStrategy_Greedy) {
computeBreaksGreedy();
+ } else if (mLineWidths.isConstant()) {
+ computeBreaksOptimalRect();
} else {
- computeBreaksOpt();
+ computeBreaksOptimal();
}
return mBreaks.size();
}
@@ -290,6 +412,8 @@ void LineBreaker::finish() {
mTextBuf.shrink_to_fit();
mCharWidths.clear();
mCharWidths.shrink_to_fit();
+ mHyphBuf.clear();
+ mHyphBuf.shrink_to_fit();
mCandidates.shrink_to_fit();
mBreaks.shrink_to_fit();
mWidths.shrink_to_fit();