Block offensive words in multi-word suggestions

If the user has chosen to block offensive words and types "aaaxbb", where "aaa" is an offensive word and "bb" is not, we should not suggest "aaa bb".

Bug: 11031090
Change-Id: Ie23b8dd5d347bc26b1c046c3f5e8dfbc259bf528
author: Adrian Velicu <adrianv@google.com> 2014-10-21 22:11:23 +0900
committer: Adrian Velicu <adrianv@google.com> 2014-10-31 15:58:50 +0900
commit: 10416241f7badaedfbafd9858deda9dca496bd08 (patch)
tree: 61daf1b2421678f940cda0a5b9a1b03527799ba8 /native
parent: 61d43e5c941e7a76d614ffbe756137c02a34cdc1 (diff)
download: android_packages_inputmethods_LatinIME-10416241f7badaedfbafd9858deda9dca496bd08.tar.gz
android_packages_inputmethods_LatinIME-10416241f7badaedfbafd9858deda9dca496bd08.tar.bz2
android_packages_inputmethods_LatinIME-10416241f7badaedfbafd9858deda9dca496bd08.zip
3 files changed, 65 insertions, 13 deletions
diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp
index 3283f6deb..23103b9f7 100644
--- a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp
+++ b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp
@@ -76,6 +76,52 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
             weightOfLangModelVsSpatialModelToOutputSuggestions, outSuggestionResults);
 }
 
+/* static */ bool SuggestionsOutputUtils::shouldBlockWord(
+        const SuggestOptions *const suggestOptions, const DicNode *const terminalDicNode,
+        const WordAttributes wordAttributes, const bool isLastWord) {
+    const bool currentWordExactMatch =
+            ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
+    // When we have to block offensive words, non-exact matched offensive words should not be
+    // output.
+    const bool shouldBlockOffensiveWords = suggestOptions->blockOffensiveWords();
+
+    const bool isBlockedOffensiveWord = shouldBlockOffensiveWords &&
+            wordAttributes.isPossiblyOffensive();
+
+    // This function is called in two situations:
+    //
+    // 1) At the end of a search, in which case terminalDicNode will point to the last DicNode
+    //    of the search, and isLastWord will be true.
+    //                    "fuck"
+    //                        |
+    //                        \ terminalDicNode (isLastWord=true, currentWordExactMatch=true)
+    //    In this case, if the current word is an exact match, we will always let the word
+    //    through, even if the user is blocking offensive words (it's exactly what they typed!)
+    //
+    // 2) In the middle of the search, when we hit a terminal node, to decide whether or not
+    //    to start a new search at root, to try to match the rest of the input. In this case,
+    //    terminalDicNode will point to the terminal node we just hit, and isLastWord will be
+    //    false.
+    //                    "fuckvthis"
+    //                        |
+    //                        \ terminalDicNode (isLastWord=false, currentWordExactMatch=true)
+    //
+    // In this case, we should NOT allow the match through (correcting "fuckvthis" to "fuck this"
+    // when offensive words are blocked would be a bad idea).
+    //
+    // In the case of a multi-word correction where the offensive word is typed last (eg.
+    // for the input "allfuck"), this function will be called with isLastWord==true, but
+    // currentWordExactMatch==false. So we are OK in this case as well.
+    //                    "allfuck"
+    //                           |
+    //                           \ terminalDicNode (isLastWord=true, currentWordExactMatch=false)
+    if (isLastWord && currentWordExactMatch) {
+        return false;
+    } else {
+        return isBlockedOffensiveWord;
+    }
+}
+
 /* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode(
         const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
         const DicNode *const terminalDicNode, const float weightOfLangModelVsSpatialModel,
@@ -98,24 +144,16 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
     const bool isExactMatchWithIntentionalOmission =
             ErrorTypeUtils::isExactMatchWithIntentionalOmission(
                     terminalDicNode->getContainedErrorTypes());
-    const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();
-    // Heuristic: We exclude probability=0 first-char-uppercase words from exact match.
-    // (e.g. "AMD" and "and")
-    const bool isSafeExactMatch = isExactMatch
-            && !(wordAttributes.isPossiblyOffensive() && isFirstCharUppercase);
     const int outputTypeFlags =
             (wordAttributes.isPossiblyOffensive() ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
-            | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
+            | ((isExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
             | (isExactMatchWithIntentionalOmission ?
                     Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0);
-
     // Entries that are blacklisted or do not represent a word should not be output.
     const bool isValidWord = !(wordAttributes.isBlacklisted() || wordAttributes.isNotAWord());
-    // When we have to block offensive words, non-exact matched offensive words should not be
-    // output.
-    const bool blockOffensiveWords = traverseSession->getSuggestOptions()->blockOffensiveWords();
-    const bool isBlockedOffensiveWord = blockOffensiveWords && wordAttributes.isPossiblyOffensive()
-            && !isSafeExactMatch;
+
+    const bool shouldBlockThisWord = shouldBlockWord(traverseSession->getSuggestOptions(),
+            terminalDicNode, wordAttributes, true /* isLastWord */);
 
     // Increase output score of top typing suggestion to ensure autocorrection.
     // TODO: Better integration with java side autocorrection logic.
@@ -127,7 +165,7 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
 
     // Don't output invalid or blocked offensive words. However, we still need to submit their
     // shortcuts if any.
-    if (isValidWord && !isBlockedOffensiveWord) {
+    if (isValidWord && !shouldBlockThisWord) {
         int codePoints[MAX_WORD_LENGTH];
         terminalDicNode->outputResult(codePoints);
         const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ?
diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.h b/native/jni/src/suggest/core/result/suggestions_output_utils.h
index bf8497828..eca1f78b2 100644
--- a/native/jni/src/suggest/core/result/suggestions_output_utils.h
+++ b/native/jni/src/suggest/core/result/suggestions_output_utils.h
@@ -18,6 +18,7 @@
 #define LATINIME_SUGGESTIONS_OUTPUT_UTILS
 
 #include "defines.h"
+#include "suggest/core/dictionary/word_attributes.h"
 
 namespace latinime {
 
@@ -25,11 +26,19 @@ class BinaryDictionaryShortcutIterator;
 class DicNode;
 class DicTraverseSession;
 class Scoring;
+class SuggestOptions;
 class SuggestionResults;
 
 class SuggestionsOutputUtils {
  public:
     /**
+     * Returns true if we should block the incoming word, in the context of the user's
+     * preferences to include or not include possibly offensive words.
+     */
+    static bool shouldBlockWord(const SuggestOptions *const suggestOptions,
+            const DicNode *const terminalDicNode, const WordAttributes wordAttributes,
+            const bool isLastWord);
+    /**
      * Outputs the final list of suggestions (i.e., terminal nodes).
      */
     static void outputSuggestions(const Scoring *const scoringPolicy,
diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp
index 68a36454e..c372d668b 100644
--- a/native/jni/src/suggest/core/suggest.cpp
+++ b/native/jni/src/suggest/core/suggest.cpp
@@ -416,6 +416,11 @@ void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode
             traverseSession->getDictionaryStructurePolicy()->getWordAttributesInContext(
                     dicNode->getPrevWordIds(), dicNode->getWordId(),
                     traverseSession->getMultiBigramMap());
+    if (SuggestionsOutputUtils::shouldBlockWord(traverseSession->getSuggestOptions(),
+            dicNode, wordAttributes, false /* isLastWord */)) {
+        return;
+    }
+
     if (!TRAVERSAL->isGoodToTraverseNextWord(dicNode, wordAttributes.getProbability())) {
         return;
     }
author	Adrian Velicu <adrianv@google.com>	2014-10-21 22:11:23 +0900
committer	Adrian Velicu <adrianv@google.com>	2014-10-31 15:58:50 +0900
commit	10416241f7badaedfbafd9858deda9dca496bd08 (patch)
tree	61daf1b2421678f940cda0a5b9a1b03527799ba8 /native
parent	61d43e5c941e7a76d614ffbe756137c02a34cdc1 (diff)
download	android_packages_inputmethods_LatinIME-10416241f7badaedfbafd9858deda9dca496bd08.tar.gz android_packages_inputmethods_LatinIME-10416241f7badaedfbafd9858deda9dca496bd08.tar.bz2 android_packages_inputmethods_LatinIME-10416241f7badaedfbafd9858deda9dca496bd08.zip