summary | refs | log | tree | commit | diff | stats
path: root/tools
diff options
context:
space:
mode:
author: Jean Chalard <jchalard@google.com> 2014-10-22 17:22:22 +0900
committer: Jean Chalard <jchalard@google.com> 2014-10-22 17:28:33 +0900
commit: 90aa229f01f2a14ae5b4542e065d27d000dafb82 (patch)
tree: cc893439fdb5f4853eee708feac26a65a21afb64 /tools
parent: 1249395563d43c818e12038231ec89dcbcdc5cd0 (diff)
download: android_packages_inputmethods_LatinIME-90aa229f01f2a14ae5b4542e065d27d000dafb82.tar.gz
android_packages_inputmethods_LatinIME-90aa229f01f2a14ae5b4542e065d27d000dafb82.tar.bz2
android_packages_inputmethods_LatinIME-90aa229f01f2a14ae5b4542e065d27d000dafb82.zip
Remove XML input/output from dicttool.
This hasn't been used for a while. It's deprecated. Let's kill it.

Change-Id: Ib1c491fa14b6406f6f77f2b0869f4db1810eb078
Diffstat (limited to 'tools')
-rw-r--r-- tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java 15
-rw-r--r-- tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java 116
-rw-r--r-- tools/dicttool/src/com/android/inputmethod/latin/dicttool/Makedict.java 5
-rw-r--r-- tools/dicttool/src/com/android/inputmethod/latin/dicttool/XmlDictInputOutput.java 380
4 files changed, 13 insertions, 503 deletions
diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java
index 3ef03f4bd..4c7187fcd 100644
--- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java
+++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java
@@ -22,8 +22,6 @@ import com.android.inputmethod.latin.makedict.DictDecoder;
import com.android.inputmethod.latin.makedict.FusionDictionary;
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
-import org.xml.sax.SAXException;
-
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
@@ -36,8 +34,6 @@ import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.ArrayList;
-import javax.xml.parsers.ParserConfigurationException;
-
/**
* Class grouping utilities for offline dictionary making.
*
@@ -177,14 +173,6 @@ public final class BinaryDictOffdeviceUtils {
System.out.println("Size : " + file.length() + " bytes");
}
try {
- if (XmlDictInputOutput.isXmlUnigramDictionary(filename)) {
- if (report) {
- System.out.println("Format : XML unigram list");
- }
- return XmlDictInputOutput.readDictionaryXml(
- new BufferedInputStream(new FileInputStream(file)),
- null /* shortcuts */, null /* bigrams */);
- }
final DecoderChainSpec decodedSpec = getRawDictionaryOrNull(file);
if (null == decodedSpec) {
throw new RuntimeException("Does not seem to be a dictionary file " + filename);
@@ -209,8 +197,7 @@ public final class BinaryDictOffdeviceUtils {
System.out.println("Uncompressed size : " + decodedSpec.mFile.length());
}
return dictDecoder.readDictionaryBinary(false /* deleteDictIfBroken */);
- } catch (final IOException | SAXException | ParserConfigurationException |
- UnsupportedFormatException e) {
+ } catch (final IOException | UnsupportedFormatException e) {
throw new RuntimeException("Can't read file " + filename, e);
}
}
diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java
index 2925fdc34..e04751ddc 100644
--- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java
+++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java
@@ -27,8 +27,6 @@ import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
import com.android.inputmethod.latin.makedict.Ver2DictEncoder;
import com.android.inputmethod.latin.makedict.Ver4DictEncoder;
-import org.xml.sax.SAXException;
-
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
@@ -41,8 +39,6 @@ import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.LinkedList;
-import javax.xml.parsers.ParserConfigurationException;
-
/**
* Main class/method for DictionaryMaker.
*/
@@ -52,10 +48,7 @@ public class DictionaryMaker {
private static final String OPTION_VERSION_2 = "-2";
private static final String OPTION_VERSION_4 = "-4";
private static final String OPTION_INPUT_SOURCE = "-s";
- private static final String OPTION_INPUT_BIGRAM_XML = "-b";
- private static final String OPTION_INPUT_SHORTCUT_XML = "-c";
private static final String OPTION_OUTPUT_BINARY = "-d";
- private static final String OPTION_OUTPUT_XML = "-x";
private static final String OPTION_OUTPUT_COMBINED = "-o";
private static final String OPTION_HELP = "-h";
private static final String OPTION_CODE_POINT_TABLE = "-t";
@@ -63,11 +56,7 @@ public class DictionaryMaker {
private static final String OPTION_CODE_POINT_TABLE_ON = "on";
public final String mInputBinary;
public final String mInputCombined;
- public final String mInputUnigramXml;
- public final String mInputShortcutXml;
- public final String mInputBigramXml;
public final String mOutputBinary;
- public final String mOutputXml;
public final String mOutputCombined;
public final int mOutputBinaryFormatVersion;
public final int mCodePointTableMode;
@@ -76,39 +65,20 @@ public class DictionaryMaker {
checkHasExactlyOneInput();
checkHasAtLeastOneOutput();
checkNotSameFile(mInputBinary, mOutputBinary);
- checkNotSameFile(mInputBinary, mOutputXml);
checkNotSameFile(mInputCombined, mOutputBinary);
- checkNotSameFile(mInputCombined, mOutputXml);
- checkNotSameFile(mInputUnigramXml, mOutputBinary);
- checkNotSameFile(mInputUnigramXml, mOutputXml);
- checkNotSameFile(mInputUnigramXml, mOutputCombined);
- checkNotSameFile(mInputShortcutXml, mOutputBinary);
- checkNotSameFile(mInputShortcutXml, mOutputXml);
- checkNotSameFile(mInputShortcutXml, mOutputCombined);
- checkNotSameFile(mInputBigramXml, mOutputBinary);
- checkNotSameFile(mInputBigramXml, mOutputXml);
- checkNotSameFile(mInputBigramXml, mOutputCombined);
- checkNotSameFile(mOutputBinary, mOutputXml);
checkNotSameFile(mOutputBinary, mOutputCombined);
- checkNotSameFile(mOutputXml, mOutputCombined);
}
private void checkHasExactlyOneInput() {
- if (null == mInputUnigramXml && null == mInputBinary && null == mInputCombined) {
+ if (null == mInputBinary && null == mInputCombined) {
throw new RuntimeException("No input file specified");
- } else if ((null != mInputUnigramXml && null != mInputBinary)
- || (null != mInputUnigramXml && null != mInputCombined)
- || (null != mInputBinary && null != mInputCombined)) {
+ } else if (null != mInputBinary && null != mInputCombined) {
throw new RuntimeException("Several input files specified");
- } else if ((null != mInputBinary || null != mInputCombined)
- && (null != mInputBigramXml || null != mInputShortcutXml)) {
- throw new RuntimeException("Separate bigrams/shortcut files are only supported"
- + " with XML input (other formats include bigrams and shortcuts already)");
}
}
private void checkHasAtLeastOneOutput() {
- if (null == mOutputBinary && null == mOutputXml && null == mOutputCombined) {
+ if (null == mOutputBinary && null == mOutputCombined) {
throw new RuntimeException("No output specified");
}
}
@@ -131,16 +101,14 @@ public class DictionaryMaker {
public static String getHelp() {
return "Usage: makedict "
- + "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts_and_whitelist.xml>] "
+ "| [-s <combined format input]"
- + "| [-s <binary input>] [-d <binary output>] [-x <xml output>] "
+ + "| [-s <binary input>] [-d <binary output>]"
+ " [-o <combined output>] [-t <code point table switch: on/off/auto>]"
+ "[-2] [-3] [-4]\n"
+ "\n"
+ " Converts a source dictionary file to one or several outputs.\n"
- + " Source can be an XML file, with an optional XML bigrams file, or a\n"
- + " binary dictionary file.\n"
- + " Binary version 2 (Jelly Bean), 3, 4, XML and\n"
+ + " Source can be a binary dictionary file or a combined format file.\n"
+ + " Binary version 2 (Jelly Bean), 3, 4, and\n"
+ " combined format outputs are supported.";
}
@@ -151,11 +119,7 @@ public class DictionaryMaker {
}
String inputBinary = null;
String inputCombined = null;
- String inputUnigramXml = null;
- String inputShortcutXml = null;
- String inputBigramXml = null;
String outputBinary = null;
- String outputXml = null;
String outputCombined = null;
int outputBinaryFormatVersion = FormatSpec.VERSION201; // the default version is 201.
// Don't use code point table by default.
@@ -180,9 +144,7 @@ public class DictionaryMaker {
String argValue = args.get(0);
args.remove(0);
if (OPTION_INPUT_SOURCE.equals(arg)) {
- if (XmlDictInputOutput.isXmlUnigramDictionary(argValue)) {
- inputUnigramXml = argValue;
- } else if (CombinedInputOutput.isCombinedDictionary(argValue)) {
+ if (CombinedInputOutput.isCombinedDictionary(argValue)) {
inputCombined = argValue;
} else if (BinaryDictDecoderUtils.isBinaryDictionary(argValue)) {
inputBinary = argValue;
@@ -190,14 +152,8 @@ public class DictionaryMaker {
throw new IllegalArgumentException(
"Unknown format for file " + argValue);
}
- } else if (OPTION_INPUT_SHORTCUT_XML.equals(arg)) {
- inputShortcutXml = argValue;
- } else if (OPTION_INPUT_BIGRAM_XML.equals(arg)) {
- inputBigramXml = argValue;
} else if (OPTION_OUTPUT_BINARY.equals(arg)) {
outputBinary = argValue;
- } else if (OPTION_OUTPUT_XML.equals(arg)) {
- outputXml = argValue;
} else if (OPTION_OUTPUT_COMBINED.equals(arg)) {
outputCombined = argValue;
} else if (OPTION_CODE_POINT_TABLE.equals(arg)) {
@@ -214,13 +170,13 @@ public class DictionaryMaker {
}
}
} else {
- if (null == inputBinary && null == inputUnigramXml) {
+ if (null == inputBinary) {
if (BinaryDictDecoderUtils.isBinaryDictionary(arg)) {
inputBinary = arg;
} else if (CombinedInputOutput.isCombinedDictionary(arg)) {
inputCombined = arg;
} else {
- inputUnigramXml = arg;
+ throw new IllegalArgumentException("Unknown format for file " + arg);
}
} else if (null == outputBinary) {
outputBinary = arg;
@@ -232,11 +188,7 @@ public class DictionaryMaker {
mInputBinary = inputBinary;
mInputCombined = inputCombined;
- mInputUnigramXml = inputUnigramXml;
- mInputShortcutXml = inputShortcutXml;
- mInputBigramXml = inputBigramXml;
mOutputBinary = outputBinary;
- mOutputXml = outputXml;
mOutputCombined = outputCombined;
mOutputBinaryFormatVersion = outputBinaryFormatVersion;
mCodePointTableMode = codePointTableMode;
@@ -245,8 +197,7 @@ public class DictionaryMaker {
}
public static void main(String[] args)
- throws FileNotFoundException, ParserConfigurationException, SAXException, IOException,
- UnsupportedFormatException {
+ throws FileNotFoundException, IOException, UnsupportedFormatException {
final Arguments parsedArgs = new Arguments(args);
FusionDictionary dictionary = readInputFromParsedArgs(parsedArgs);
writeOutputToParsedArgs(parsedArgs, dictionary);
@@ -259,14 +210,11 @@ public class DictionaryMaker {
* @return the read dictionary.
*/
private static FusionDictionary readInputFromParsedArgs(final Arguments args)
- throws IOException, UnsupportedFormatException, ParserConfigurationException,
- SAXException, FileNotFoundException {
+ throws IOException, UnsupportedFormatException, FileNotFoundException {
if (null != args.mInputBinary) {
return readBinaryFile(args.mInputBinary);
} else if (null != args.mInputCombined) {
return readCombinedFile(args.mInputCombined);
- } else if (null != args.mInputUnigramXml) {
- return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml);
} else {
throw new RuntimeException("No input file specified");
}
@@ -314,30 +262,6 @@ public class DictionaryMaker {
}
/**
- * Read a dictionary from a unigram XML file, and optionally a bigram XML file.
- *
- * @param unigramXmlFilename the name of the unigram XML file. May not be null.
- * @param shortcutXmlFilename the name of the shortcut/whitelist XML file, or null if none.
- * @param bigramXmlFilename the name of the bigram XML file. Pass null if there are no bigrams.
- * @return the read dictionary.
- * @throws FileNotFoundException if one of the files can't be found
- * @throws SAXException if one or more of the XML files is not well-formed
- * @throws IOException if one the input files can't be read
- * @throws ParserConfigurationException if the system can't create a SAX parser
- */
- private static FusionDictionary readXmlFile(final String unigramXmlFilename,
- final String shortcutXmlFilename, final String bigramXmlFilename)
- throws FileNotFoundException, SAXException, IOException, ParserConfigurationException {
- try (
- final BufferedInputStream unigrams = getBufferedFileInputStream(unigramXmlFilename);
- final BufferedInputStream shortcuts = getBufferedFileInputStream(shortcutXmlFilename);
- final BufferedInputStream bigrams = getBufferedFileInputStream(bigramXmlFilename);
- ) {
- return XmlDictInputOutput.readDictionaryXml(unigrams, shortcuts, bigrams);
- }
- }
-
- /**
* Invoke the right output method according to args.
*
* This will write the passed dictionary to the file(s) passed in the command line arguments.
@@ -353,9 +277,6 @@ public class DictionaryMaker {
writeBinaryDictionary(args.mOutputBinary, dict, args.mOutputBinaryFormatVersion,
args.mCodePointTableMode);
}
- if (null != args.mOutputXml) {
- writeXmlDictionary(args.mOutputXml, dict);
- }
if (null != args.mOutputCombined) {
writeCombinedDictionary(args.mOutputCombined, dict);
}
@@ -387,21 +308,6 @@ public class DictionaryMaker {
}
/**
- * Write the dictionary in XML format to the specified filename.
- *
- * @param outputFilename the name of the file to write to.
- * @param dict the dictionary to write.
- * @throws FileNotFoundException if the output file can't be created.
- * @throws IOException if the output file can't be written to.
- */
- private static void writeXmlDictionary(final String outputFilename,
- final FusionDictionary dict) throws FileNotFoundException, IOException {
- try (final BufferedWriter writer = new BufferedWriter(new FileWriter(outputFilename))) {
- XmlDictInputOutput.writeDictionaryXml(writer, dict);
- }
- }
-
- /**
* Write the dictionary in the combined format to the specified filename.
*
* @param outputFilename the name of the file to write to.
diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Makedict.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Makedict.java
index 808e1d4c8..0b1fb88bc 100644
--- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Makedict.java
+++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Makedict.java
@@ -20,8 +20,6 @@ import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
import java.io.FileNotFoundException;
import java.io.IOException;
-import javax.xml.parsers.ParserConfigurationException;
-import org.xml.sax.SAXException;
public class Makedict extends Dicttool.Command {
public static final String COMMAND = "makedict";
@@ -35,8 +33,7 @@ public class Makedict extends Dicttool.Command {
}
@Override
- public void run() throws FileNotFoundException, IOException, ParserConfigurationException,
- SAXException, UnsupportedFormatException {
+ public void run() throws FileNotFoundException, IOException, UnsupportedFormatException {
DictionaryMaker.main(mArgs);
}
}
diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/XmlDictInputOutput.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/XmlDictInputOutput.java
deleted file mode 100644
index 7f3337949..000000000
--- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/XmlDictInputOutput.java
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * Copyright (C) 2011 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.android.inputmethod.latin.dicttool;
-
-import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
-import com.android.inputmethod.latin.makedict.FusionDictionary;
-import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
-import com.android.inputmethod.latin.makedict.ProbabilityInfo;
-import com.android.inputmethod.latin.makedict.WeightedString;
-import com.android.inputmethod.latin.makedict.WordProperty;
-
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-import java.io.BufferedInputStream;
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.TreeSet;
-
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-
-/**
- * Reads and writes XML files for a FusionDictionary.
- *
- * All functions in this class are static.
- */
-public class XmlDictInputOutput {
-
- private static final String ROOT_TAG = "wordlist";
- private static final String WORD_TAG = "w";
- private static final String BIGRAM_TAG = "bigram";
- private static final String SHORTCUT_TAG = "shortcut";
- private static final String PROBABILITY_ATTR = "f";
- private static final String WORD_ATTR = "word";
- private static final String NOT_A_WORD_ATTR = "not_a_word";
-
- /**
- * SAX handler for a unigram XML file.
- */
- static private class UnigramHandler extends DefaultHandler {
- // Parser states
- private static final int START = 1;
- private static final int WORD = 2;
- private static final int UNKNOWN = 3;
- private static final int SHORTCUT_ONLY_WORD_PROBABILITY = 1;
-
- FusionDictionary mDictionary;
- int mState; // the state of the parser
- int mFreq; // the currently read freq
- String mWord; // the current word
- final HashMap<String, ArrayList<WeightedString>> mShortcutsMap;
-
- /**
- * Create the handler.
- *
- * @param shortcuts the shortcuts as a map. This may be empty, but may not be null.
- */
- public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts) {
- mDictionary = null;
- mShortcutsMap = shortcuts;
- mWord = "";
- mState = START;
- mFreq = 0;
- }
-
- public FusionDictionary getFinalDictionary() {
- final FusionDictionary dict = mDictionary;
- for (final String shortcutOnly : mShortcutsMap.keySet()) {
- if (dict.hasWord(shortcutOnly)) continue;
- dict.add(shortcutOnly, new ProbabilityInfo(SHORTCUT_ONLY_WORD_PROBABILITY),
- mShortcutsMap.get(shortcutOnly), true /* isNotAWord */,
- false /* isPossiblyOffensive */);
- }
- mDictionary = null;
- mShortcutsMap.clear();
- mWord = "";
- mState = START;
- mFreq = 0;
- return dict;
- }
-
- @Override
- public void startElement(String uri, String localName, String qName, Attributes attrs) {
- if (WORD_TAG.equals(localName)) {
- mState = WORD;
- mWord = "";
- for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
- final String attrName = attrs.getLocalName(attrIndex);
- if (PROBABILITY_ATTR.equals(attrName)) {
- mFreq = Integer.parseInt(attrs.getValue(attrIndex));
- }
- }
- } else if (ROOT_TAG.equals(localName)) {
- final HashMap<String, String> attributes = new HashMap<>();
- for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
- final String attrName = attrs.getLocalName(attrIndex);
- attributes.put(attrName, attrs.getValue(attrIndex));
- }
- mDictionary = new FusionDictionary(new PtNodeArray(),
- new DictionaryOptions(attributes));
- } else {
- mState = UNKNOWN;
- }
- }
-
- @Override
- public void characters(char[] ch, int start, int length) {
- if (WORD == mState) {
- // The XML parser is free to return text in arbitrary chunks one after the
- // other. In particular, this happens in some implementations when it finds
- // an escape code like "&amp;".
- mWord += String.copyValueOf(ch, start, length);
- }
- }
-
- @Override
- public void endElement(String uri, String localName, String qName) {
- if (WORD == mState) {
- mDictionary.add(mWord, new ProbabilityInfo(mFreq), mShortcutsMap.get(mWord),
- false /* isNotAWord */, false /* isPossiblyOffensive */);
- mState = START;
- }
- }
- }
-
- static private class AssociativeListHandler extends DefaultHandler {
- private final String SRC_TAG;
- private final String SRC_ATTRIBUTE;
- private final String DST_TAG;
- private final String DST_ATTRIBUTE;
- private final String DST_FREQ;
-
- // In this version of the XML file, the bigram frequency is given as an int 0..XML_MAX
- private final static int XML_MAX = 256;
- // In memory and in the binary dictionary the bigram frequency is 0..MEMORY_MAX
- private final static int MEMORY_MAX = 256;
- private final static int XML_TO_MEMORY_RATIO = XML_MAX / MEMORY_MAX;
-
- private String mSrc;
- private final HashMap<String, ArrayList<WeightedString>> mAssocMap;
-
- public AssociativeListHandler(final String srcTag, final String srcAttribute,
- final String dstTag, final String dstAttribute, final String dstFreq) {
- SRC_TAG = srcTag;
- SRC_ATTRIBUTE = srcAttribute;
- DST_TAG = dstTag;
- DST_ATTRIBUTE = dstAttribute;
- DST_FREQ = dstFreq;
- mSrc = null;
- mAssocMap = new HashMap<>();
- }
-
- @Override
- public void startElement(String uri, String localName, String qName, Attributes attrs) {
- if (SRC_TAG.equals(localName)) {
- mSrc = attrs.getValue(uri, SRC_ATTRIBUTE);
- } else if (DST_TAG.equals(localName)) {
- String dst = attrs.getValue(uri, DST_ATTRIBUTE);
- int freq = getValueFromFreqString(attrs.getValue(uri, DST_FREQ));
- WeightedString bigram = new WeightedString(dst, freq / XML_TO_MEMORY_RATIO);
- ArrayList<WeightedString> bigramList = mAssocMap.get(mSrc);
- if (null == bigramList) bigramList = new ArrayList<>();
- bigramList.add(bigram);
- mAssocMap.put(mSrc, bigramList);
- }
- }
-
- protected int getValueFromFreqString(final String freqString) {
- return Integer.parseInt(freqString);
- }
-
- // This may return an empty map, but will never return null.
- public HashMap<String, ArrayList<WeightedString>> getAssocMap() {
- return mAssocMap;
- }
- }
-
- /**
- * SAX handler for a bigram XML file.
- */
- static private class BigramHandler extends AssociativeListHandler {
- private final static String BIGRAM_W1_TAG = "bi";
- private final static String BIGRAM_W2_TAG = "w";
- private final static String BIGRAM_W1_ATTRIBUTE = "w1";
- private final static String BIGRAM_W2_ATTRIBUTE = "w2";
- private final static String BIGRAM_FREQ_ATTRIBUTE = "p";
-
- public BigramHandler() {
- super(BIGRAM_W1_TAG, BIGRAM_W1_ATTRIBUTE, BIGRAM_W2_TAG, BIGRAM_W2_ATTRIBUTE,
- BIGRAM_FREQ_ATTRIBUTE);
- }
-
- // As per getAssocMap(), this never returns null.
- public HashMap<String, ArrayList<WeightedString>> getBigramMap() {
- return getAssocMap();
- }
- }
-
- /**
- * SAX handler for a shortcut & whitelist XML file.
- */
- static private class ShortcutAndWhitelistHandler extends AssociativeListHandler {
- private final static String ENTRY_TAG = "entry";
- private final static String ENTRY_ATTRIBUTE = "shortcut";
- private final static String TARGET_TAG = "target";
- private final static String REPLACEMENT_ATTRIBUTE = "replacement";
- private final static String TARGET_PRIORITY_ATTRIBUTE = "priority";
- private final static String WHITELIST_MARKER = "whitelist";
- private final static int WHITELIST_FREQ_VALUE = 15;
- private final static int MIN_FREQ = 0;
- private final static int MAX_FREQ = 14;
-
- public ShortcutAndWhitelistHandler() {
- super(ENTRY_TAG, ENTRY_ATTRIBUTE, TARGET_TAG, REPLACEMENT_ATTRIBUTE,
- TARGET_PRIORITY_ATTRIBUTE);
- }
-
- @Override
- protected int getValueFromFreqString(final String freqString) {
- if (WHITELIST_MARKER.equals(freqString)) {
- return WHITELIST_FREQ_VALUE;
- }
- final int intValue = super.getValueFromFreqString(freqString);
- if (intValue < MIN_FREQ || intValue > MAX_FREQ) {
- throw new RuntimeException("Shortcut freq out of range. Accepted range is "
- + MIN_FREQ + ".." + MAX_FREQ);
- }
- return intValue;
- }
-
- // As per getAssocMap(), this never returns null.
- public HashMap<String, ArrayList<WeightedString>> getShortcutAndWhitelistMap() {
- return getAssocMap();
- }
- }
-
- /**
- * Basic test to find out whether the file is in the unigram XML format or not.
- *
- * Concretely this only tests the header line.
- *
- * @param filename The name of the file to test.
- * @return true if the file is in the unigram XML format, false otherwise
- */
- public static boolean isXmlUnigramDictionary(final String filename) {
- try (final BufferedReader reader = new BufferedReader(
- new InputStreamReader(new FileInputStream(filename), "UTF-8"))) {
- final String firstLine = reader.readLine();
- return firstLine.matches("^\\s*<wordlist .*>\\s*$");
- } catch (final IOException e) {
- return false;
- }
- }
-
- /**
- * Reads a dictionary from an XML file.
- *
- * This is the public method that will parse an XML file and return the corresponding memory
- * representation.
- *
- * @param unigrams the file to read the data from.
- * @param shortcuts the file to read the shortcuts & whitelist from, or null.
- * @param bigrams the file to read the bigrams from, or null.
- * @return the in-memory representation of the dictionary.
- */
- public static FusionDictionary readDictionaryXml(final BufferedInputStream unigrams,
- final BufferedInputStream shortcuts, final BufferedInputStream bigrams)
- throws SAXException, IOException, ParserConfigurationException {
- final SAXParserFactory factory = SAXParserFactory.newInstance();
- factory.setNamespaceAware(true);
- final SAXParser parser = factory.newSAXParser();
- final BigramHandler bigramHandler = new BigramHandler();
- if (null != bigrams) parser.parse(bigrams, bigramHandler);
-
- final ShortcutAndWhitelistHandler shortcutAndWhitelistHandler =
- new ShortcutAndWhitelistHandler();
- if (null != shortcuts) parser.parse(shortcuts, shortcutAndWhitelistHandler);
-
- final UnigramHandler unigramHandler =
- new UnigramHandler(shortcutAndWhitelistHandler.getShortcutAndWhitelistMap());
- parser.parse(unigrams, unigramHandler);
- final FusionDictionary dict = unigramHandler.getFinalDictionary();
- final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap();
- for (final String firstWord : bigramMap.keySet()) {
- if (!dict.hasWord(firstWord)) continue;
- final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord);
- for (final WeightedString bigram : bigramList) {
- if (!dict.hasWord(bigram.mWord)) continue;
- dict.setBigram(firstWord, bigram.mWord, bigram.mProbabilityInfo);
- }
- }
- return dict;
- }
-
- /**
- * Reads a dictionary in the first, legacy XML format
- *
- * This method reads data from the parser and creates a new FusionDictionary with it.
- * The format parsed by this method is the format used before Ice Cream Sandwich,
- * which has no support for bigrams or shortcuts/whitelist.
- * It is important to note that this method expects the parser to have already eaten
- * the first, all-encompassing tag.
- *
- * @param xpp the parser to read the data from.
- * @return the parsed dictionary.
- */
-
- /**
- * Writes a dictionary to an XML file.
- *
- * The output format is the "second" format, which supports bigrams and shortcuts/whitelist.
- *
- * @param destination a destination stream to write to.
- * @param dict the dictionary to write.
- */
- public static void writeDictionaryXml(final BufferedWriter destination,
- final FusionDictionary dict) throws IOException {
- final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<>();
- for (WordProperty wordProperty : dict) {
- wordPropertiesInDict.add(wordProperty);
- }
- // TODO: use an XMLSerializer if this gets big
- destination.write("<wordlist format=\"2\"");
- for (final String key : dict.mOptions.mAttributes.keySet()) {
- final String value = dict.mOptions.mAttributes.get(key);
- destination.write(" " + key + "=\"" + value + "\"");
- }
- destination.write(">\n");
- destination.write("<!-- Warning: there is no code to read this format yet. -->\n");
- for (WordProperty wordProperty : wordPropertiesInDict) {
- destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + wordProperty.mWord
- + "\" " + PROBABILITY_ATTR + "=\"" + wordProperty.getProbability()
- + (wordProperty.mIsNotAWord ? "\" " + NOT_A_WORD_ATTR + "=\"true" : "")
- + "\">");
- if (wordProperty.mHasShortcuts) {
- destination.write("\n");
- for (WeightedString target : wordProperty.mShortcutTargets) {
- destination.write(" <" + SHORTCUT_TAG + " " + PROBABILITY_ATTR + "=\""
- + target.getProbability() + "\">" + target.mWord + "</" + SHORTCUT_TAG
- + ">\n");
- }
- destination.write(" ");
- }
- if (wordProperty.mHasNgrams) {
- destination.write("\n");
- for (WeightedString bigram : wordProperty.getBigrams()) {
- destination.write(" <" + BIGRAM_TAG + " " + PROBABILITY_ATTR + "=\""
- + bigram.getProbability() + "\">" + bigram.mWord
- + "</" + BIGRAM_TAG + ">\n");
- }
- destination.write(" ");
- }
- destination.write("</" + WORD_TAG + ">\n");
- }
- destination.write("</wordlist>\n");
- destination.close();
- }
-}