diff options
Diffstat (limited to 'src/com/android/emailcommon/mail/Snippet.java')
-rw-r--r-- | src/com/android/emailcommon/mail/Snippet.java | 487 |
1 files changed, 487 insertions, 0 deletions
diff --git a/src/com/android/emailcommon/mail/Snippet.java b/src/com/android/emailcommon/mail/Snippet.java new file mode 100644 index 000000000..38f982e5d --- /dev/null +++ b/src/com/android/emailcommon/mail/Snippet.java @@ -0,0 +1,487 @@ +/* + * Copyright (C) 2010 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.emailcommon.mail; + +import android.text.TextUtils; + +import java.util.HashMap; +import java.util.Map; + +/** + * Class to generate a short 'snippet' from either plain text or html text + * + * If the sync protocol can get plain text, that's great, but we'll still strip out extraneous + * whitespace. If it's HTML, we'll 1) strip out tags, 2) turn entities into the appropriate + * characters, and 3) strip out extraneous whitespace, all in one pass + * + * Why not use an existing class? The best answer is performance; yet another answer is + * correctness (e.g. Html.textFromHtml simply doesn't generate well-stripped text). But performance + * is key; we frequently sync text that is 10K or (much) longer, yet we really only care about a + * small amount of text for the snippet. So it's critically important that we just stop when we've + * gotten enough; existing methods that exist will go through the entire incoming string, at great + * (and useless) expense. + */ +public class Snippet { + // This is how many chars we'll allow in a snippet + private static final int MAX_PLAIN_TEXT_SCAN_LENGTH = 200; + // For some reason, isWhitespace() returns false with the following... + /*package*/ static final char NON_BREAKING_SPACE_CHARACTER = (char)160; + + // Tags whose content must be stripped as well + static final String[] STRIP_TAGS = + new String[] {"title", "script", "style", "applet", "head"}; + // The number of characters we peel off for testing against STRIP_TAGS + static final int STRIP_TAG_LENGTH = 6; + + static final Map<String, Character> ESCAPE_STRINGS; + static { + // HTML character entity references as defined in HTML 4 + // see http://www.w3.org/TR/REC-html40/sgml/entities.html + ESCAPE_STRINGS = new HashMap<String, Character>(252); + + ESCAPE_STRINGS.put(" ", '\u00A0'); + ESCAPE_STRINGS.put("¡", '\u00A1'); + ESCAPE_STRINGS.put("¢", '\u00A2'); + ESCAPE_STRINGS.put("£", '\u00A3'); + ESCAPE_STRINGS.put("¤", '\u00A4'); + ESCAPE_STRINGS.put("¥", '\u00A5'); + ESCAPE_STRINGS.put("¦", '\u00A6'); + ESCAPE_STRINGS.put("§", '\u00A7'); + ESCAPE_STRINGS.put("¨", '\u00A8'); + ESCAPE_STRINGS.put("©", '\u00A9'); + ESCAPE_STRINGS.put("ª", '\u00AA'); + ESCAPE_STRINGS.put("«", '\u00AB'); + ESCAPE_STRINGS.put("¬", '\u00AC'); + ESCAPE_STRINGS.put("­", '\u00AD'); + ESCAPE_STRINGS.put("®", '\u00AE'); + ESCAPE_STRINGS.put("¯", '\u00AF'); + ESCAPE_STRINGS.put("°", '\u00B0'); + ESCAPE_STRINGS.put("±", '\u00B1'); + ESCAPE_STRINGS.put("²", '\u00B2'); + ESCAPE_STRINGS.put("³", '\u00B3'); + ESCAPE_STRINGS.put("´", '\u00B4'); + ESCAPE_STRINGS.put("µ", '\u00B5'); + ESCAPE_STRINGS.put("¶", '\u00B6'); + ESCAPE_STRINGS.put("·", '\u00B7'); + ESCAPE_STRINGS.put("¸", '\u00B8'); + ESCAPE_STRINGS.put("¹", '\u00B9'); + ESCAPE_STRINGS.put("º", '\u00BA'); + ESCAPE_STRINGS.put("»", '\u00BB'); + ESCAPE_STRINGS.put("¼", '\u00BC'); + ESCAPE_STRINGS.put("½", '\u00BD'); + ESCAPE_STRINGS.put("¾", '\u00BE'); + ESCAPE_STRINGS.put("¿", '\u00BF'); + ESCAPE_STRINGS.put("À", '\u00C0'); + ESCAPE_STRINGS.put("Á", '\u00C1'); + ESCAPE_STRINGS.put("Â", '\u00C2'); + ESCAPE_STRINGS.put("Ã", '\u00C3'); + ESCAPE_STRINGS.put("Ä", '\u00C4'); + ESCAPE_STRINGS.put("Å", '\u00C5'); + ESCAPE_STRINGS.put("Æ", '\u00C6'); + ESCAPE_STRINGS.put("Ç", '\u00C7'); + ESCAPE_STRINGS.put("È", '\u00C8'); + ESCAPE_STRINGS.put("É", '\u00C9'); + ESCAPE_STRINGS.put("Ê", '\u00CA'); + ESCAPE_STRINGS.put("Ë", '\u00CB'); + ESCAPE_STRINGS.put("Ì", '\u00CC'); + ESCAPE_STRINGS.put("Í", '\u00CD'); + ESCAPE_STRINGS.put("Î", '\u00CE'); + ESCAPE_STRINGS.put("Ï", '\u00CF'); + ESCAPE_STRINGS.put("Ð", '\u00D0'); + ESCAPE_STRINGS.put("Ñ", '\u00D1'); + ESCAPE_STRINGS.put("Ò", '\u00D2'); + ESCAPE_STRINGS.put("Ó", '\u00D3'); + ESCAPE_STRINGS.put("Ô", '\u00D4'); + ESCAPE_STRINGS.put("Õ", '\u00D5'); + ESCAPE_STRINGS.put("Ö", '\u00D6'); + ESCAPE_STRINGS.put("×", '\u00D7'); + ESCAPE_STRINGS.put("Ø", '\u00D8'); + ESCAPE_STRINGS.put("Ù", '\u00D9'); + ESCAPE_STRINGS.put("Ú", '\u00DA'); + ESCAPE_STRINGS.put("Û", '\u00DB'); + ESCAPE_STRINGS.put("Ü", '\u00DC'); + ESCAPE_STRINGS.put("Ý", '\u00DD'); + ESCAPE_STRINGS.put("Þ", '\u00DE'); + ESCAPE_STRINGS.put("ß", '\u00DF'); + ESCAPE_STRINGS.put("à", '\u00E0'); + ESCAPE_STRINGS.put("á", '\u00E1'); + ESCAPE_STRINGS.put("â", '\u00E2'); + ESCAPE_STRINGS.put("ã", '\u00E3'); + ESCAPE_STRINGS.put("ä", '\u00E4'); + ESCAPE_STRINGS.put("å", '\u00E5'); + ESCAPE_STRINGS.put("æ", '\u00E6'); + ESCAPE_STRINGS.put("ç", '\u00E7'); + ESCAPE_STRINGS.put("è", '\u00E8'); + ESCAPE_STRINGS.put("é", '\u00E9'); + ESCAPE_STRINGS.put("ê", '\u00EA'); + ESCAPE_STRINGS.put("ë", '\u00EB'); + ESCAPE_STRINGS.put("ì", '\u00EC'); + ESCAPE_STRINGS.put("í", '\u00ED'); + ESCAPE_STRINGS.put("î", '\u00EE'); + ESCAPE_STRINGS.put("ï", '\u00EF'); + ESCAPE_STRINGS.put("ð", '\u00F0'); + ESCAPE_STRINGS.put("ñ", '\u00F1'); + ESCAPE_STRINGS.put("ò", '\u00F2'); + ESCAPE_STRINGS.put("ó", '\u00F3'); + ESCAPE_STRINGS.put("ô", '\u00F4'); + ESCAPE_STRINGS.put("õ", '\u00F5'); + ESCAPE_STRINGS.put("ö", '\u00F6'); + ESCAPE_STRINGS.put("÷", '\u00F7'); + ESCAPE_STRINGS.put("ø", '\u00F8'); + ESCAPE_STRINGS.put("ù", '\u00F9'); + ESCAPE_STRINGS.put("ú", '\u00FA'); + ESCAPE_STRINGS.put("û", '\u00FB'); + ESCAPE_STRINGS.put("ü", '\u00FC'); + ESCAPE_STRINGS.put("ý", '\u00FD'); + ESCAPE_STRINGS.put("þ", '\u00FE'); + ESCAPE_STRINGS.put("ÿ", '\u00FF'); + ESCAPE_STRINGS.put("&fnof", '\u0192'); + ESCAPE_STRINGS.put("&Alpha", '\u0391'); + ESCAPE_STRINGS.put("&Beta", '\u0392'); + ESCAPE_STRINGS.put("&Gamma", '\u0393'); + ESCAPE_STRINGS.put("&Delta", '\u0394'); + ESCAPE_STRINGS.put("&Epsilon", '\u0395'); + ESCAPE_STRINGS.put("&Zeta", '\u0396'); + ESCAPE_STRINGS.put("&Eta", '\u0397'); + ESCAPE_STRINGS.put("&Theta", '\u0398'); + ESCAPE_STRINGS.put("&Iota", '\u0399'); + ESCAPE_STRINGS.put("&Kappa", '\u039A'); + ESCAPE_STRINGS.put("&Lambda", '\u039B'); + ESCAPE_STRINGS.put("&Mu", '\u039C'); + ESCAPE_STRINGS.put("&Nu", '\u039D'); + ESCAPE_STRINGS.put("&Xi", '\u039E'); + ESCAPE_STRINGS.put("&Omicron", '\u039F'); + ESCAPE_STRINGS.put("&Pi", '\u03A0'); + ESCAPE_STRINGS.put("&Rho", '\u03A1'); + ESCAPE_STRINGS.put("&Sigma", '\u03A3'); + ESCAPE_STRINGS.put("&Tau", '\u03A4'); + ESCAPE_STRINGS.put("&Upsilon", '\u03A5'); + ESCAPE_STRINGS.put("&Phi", '\u03A6'); + ESCAPE_STRINGS.put("&Chi", '\u03A7'); + ESCAPE_STRINGS.put("&Psi", '\u03A8'); + ESCAPE_STRINGS.put("&Omega", '\u03A9'); + ESCAPE_STRINGS.put("&alpha", '\u03B1'); + ESCAPE_STRINGS.put("&beta", '\u03B2'); + ESCAPE_STRINGS.put("&gamma", '\u03B3'); + ESCAPE_STRINGS.put("&delta", '\u03B4'); + ESCAPE_STRINGS.put("&epsilon", '\u03B5'); + ESCAPE_STRINGS.put("&zeta", '\u03B6'); + ESCAPE_STRINGS.put("&eta", '\u03B7'); + ESCAPE_STRINGS.put("&theta", '\u03B8'); + ESCAPE_STRINGS.put("&iota", '\u03B9'); + ESCAPE_STRINGS.put("&kappa", '\u03BA'); + ESCAPE_STRINGS.put("&lambda", '\u03BB'); + ESCAPE_STRINGS.put("&mu", '\u03BC'); + ESCAPE_STRINGS.put("&nu", '\u03BD'); + ESCAPE_STRINGS.put("&xi", '\u03BE'); + ESCAPE_STRINGS.put("&omicron", '\u03BF'); + ESCAPE_STRINGS.put("&pi", '\u03C0'); + ESCAPE_STRINGS.put("&rho", '\u03C1'); + ESCAPE_STRINGS.put("&sigmaf", '\u03C2'); + ESCAPE_STRINGS.put("&sigma", '\u03C3'); + ESCAPE_STRINGS.put("&tau", '\u03C4'); + ESCAPE_STRINGS.put("&upsilon", '\u03C5'); + ESCAPE_STRINGS.put("&phi", '\u03C6'); + ESCAPE_STRINGS.put("&chi", '\u03C7'); + ESCAPE_STRINGS.put("&psi", '\u03C8'); + ESCAPE_STRINGS.put("&omega", '\u03C9'); + ESCAPE_STRINGS.put("&thetasym", '\u03D1'); + ESCAPE_STRINGS.put("&upsih", '\u03D2'); + ESCAPE_STRINGS.put("&piv", '\u03D6'); + ESCAPE_STRINGS.put("&bull", '\u2022'); + ESCAPE_STRINGS.put("&hellip", '\u2026'); + ESCAPE_STRINGS.put("&prime", '\u2032'); + ESCAPE_STRINGS.put("&Prime", '\u2033'); + ESCAPE_STRINGS.put("&oline", '\u203E'); + ESCAPE_STRINGS.put("&frasl", '\u2044'); + ESCAPE_STRINGS.put("&weierp", '\u2118'); + ESCAPE_STRINGS.put("&image", '\u2111'); + ESCAPE_STRINGS.put("&real", '\u211C'); + ESCAPE_STRINGS.put("&trade", '\u2122'); + ESCAPE_STRINGS.put("&alefsym", '\u2135'); + ESCAPE_STRINGS.put("&larr", '\u2190'); + ESCAPE_STRINGS.put("&uarr", '\u2191'); + ESCAPE_STRINGS.put("&rarr", '\u2192'); + ESCAPE_STRINGS.put("&darr", '\u2193'); + ESCAPE_STRINGS.put("&harr", '\u2194'); + ESCAPE_STRINGS.put("&crarr", '\u21B5'); + ESCAPE_STRINGS.put("&lArr", '\u21D0'); + ESCAPE_STRINGS.put("&uArr", '\u21D1'); + ESCAPE_STRINGS.put("&rArr", '\u21D2'); + ESCAPE_STRINGS.put("&dArr", '\u21D3'); + ESCAPE_STRINGS.put("&hArr", '\u21D4'); + ESCAPE_STRINGS.put("&forall", '\u2200'); + ESCAPE_STRINGS.put("&part", '\u2202'); + ESCAPE_STRINGS.put("&exist", '\u2203'); + ESCAPE_STRINGS.put("&empty", '\u2205'); + ESCAPE_STRINGS.put("&nabla", '\u2207'); + ESCAPE_STRINGS.put("&isin", '\u2208'); + ESCAPE_STRINGS.put("¬in", '\u2209'); + ESCAPE_STRINGS.put("&ni", '\u220B'); + ESCAPE_STRINGS.put("&prod", '\u220F'); + ESCAPE_STRINGS.put("&sum", '\u2211'); + ESCAPE_STRINGS.put("&minus", '\u2212'); + ESCAPE_STRINGS.put("&lowast", '\u2217'); + ESCAPE_STRINGS.put("&radic", '\u221A'); + ESCAPE_STRINGS.put("&prop", '\u221D'); + ESCAPE_STRINGS.put("&infin", '\u221E'); + ESCAPE_STRINGS.put("&ang", '\u2220'); + ESCAPE_STRINGS.put("&and", '\u2227'); + ESCAPE_STRINGS.put("&or", '\u2228'); + ESCAPE_STRINGS.put("&cap", '\u2229'); + ESCAPE_STRINGS.put("&cup", '\u222A'); + ESCAPE_STRINGS.put("&int", '\u222B'); + ESCAPE_STRINGS.put("&there4", '\u2234'); + ESCAPE_STRINGS.put("&sim", '\u223C'); + ESCAPE_STRINGS.put("&cong", '\u2245'); + ESCAPE_STRINGS.put("&asymp", '\u2248'); + ESCAPE_STRINGS.put("&ne", '\u2260'); + ESCAPE_STRINGS.put("&equiv", '\u2261'); + ESCAPE_STRINGS.put("&le", '\u2264'); + ESCAPE_STRINGS.put("&ge", '\u2265'); + ESCAPE_STRINGS.put("&sub", '\u2282'); + ESCAPE_STRINGS.put("&sup", '\u2283'); + ESCAPE_STRINGS.put("&nsub", '\u2284'); + ESCAPE_STRINGS.put("&sube", '\u2286'); + ESCAPE_STRINGS.put("&supe", '\u2287'); + ESCAPE_STRINGS.put("&oplus", '\u2295'); + ESCAPE_STRINGS.put("&otimes", '\u2297'); + ESCAPE_STRINGS.put("&perp", '\u22A5'); + ESCAPE_STRINGS.put("&sdot", '\u22C5'); + ESCAPE_STRINGS.put("&lceil", '\u2308'); + ESCAPE_STRINGS.put("&rceil", '\u2309'); + ESCAPE_STRINGS.put("&lfloor", '\u230A'); + ESCAPE_STRINGS.put("&rfloor", '\u230B'); + ESCAPE_STRINGS.put("&lang", '\u2329'); + ESCAPE_STRINGS.put("&rang", '\u232A'); + ESCAPE_STRINGS.put("&loz", '\u25CA'); + ESCAPE_STRINGS.put("&spades", '\u2660'); + ESCAPE_STRINGS.put("&clubs", '\u2663'); + ESCAPE_STRINGS.put("&hearts", '\u2665'); + ESCAPE_STRINGS.put("&diams", '\u2666'); + ESCAPE_STRINGS.put(""", '\u0022'); + ESCAPE_STRINGS.put("&", '\u0026'); + ESCAPE_STRINGS.put("<", '\u003C'); + ESCAPE_STRINGS.put(">", '\u003E'); + ESCAPE_STRINGS.put("&OElig", '\u0152'); + ESCAPE_STRINGS.put("&oelig", '\u0153'); + ESCAPE_STRINGS.put("&Scaron", '\u0160'); + ESCAPE_STRINGS.put("&scaron", '\u0161'); + ESCAPE_STRINGS.put("&Yuml", '\u0178'); + ESCAPE_STRINGS.put("&circ", '\u02C6'); + ESCAPE_STRINGS.put("&tilde", '\u02DC'); + ESCAPE_STRINGS.put("&ensp", '\u2002'); + ESCAPE_STRINGS.put("&emsp", '\u2003'); + ESCAPE_STRINGS.put("&thinsp", '\u2009'); + ESCAPE_STRINGS.put("&zwnj", '\u200C'); + ESCAPE_STRINGS.put("&zwj", '\u200D'); + ESCAPE_STRINGS.put("&lrm", '\u200E'); + ESCAPE_STRINGS.put("&rlm", '\u200F'); + ESCAPE_STRINGS.put("&ndash", '\u2013'); + ESCAPE_STRINGS.put("&mdash", '\u2014'); + ESCAPE_STRINGS.put("&lsquo", '\u2018'); + ESCAPE_STRINGS.put("&rsquo", '\u2019'); + ESCAPE_STRINGS.put("&sbquo", '\u201A'); + ESCAPE_STRINGS.put("&ldquo", '\u201C'); + ESCAPE_STRINGS.put("&rdquo", '\u201D'); + ESCAPE_STRINGS.put("&bdquo", '\u201E'); + ESCAPE_STRINGS.put("&dagger", '\u2020'); + ESCAPE_STRINGS.put("&Dagger", '\u2021'); + ESCAPE_STRINGS.put("&permil", '\u2030'); + ESCAPE_STRINGS.put("&lsaquo", '\u2039'); + ESCAPE_STRINGS.put("&rsaquo", '\u203A'); + ESCAPE_STRINGS.put("&euro", '\u20AC'); + } + + public static String fromHtmlText(String text) { + return fromText(text, true); + } + + public static String fromPlainText(String text) { + return fromText(text, false); + } + + /** + * Find the end of this tag; there are two alternatives: <tag .../> or <tag ...> ... </tag> + * @param htmlText some HTML text + * @param tag the HTML tag + * @param startPos the start position in the HTML text where the tag starts + * @return the position just before the end of the tag or -1 if not found + */ + /*package*/ static int findTagEnd(String htmlText, String tag, int startPos) { + if (tag.endsWith(" ")) { + tag = tag.substring(0, tag.length() - 1); + } + int length = htmlText.length(); + char prevChar = 0; + for (int i = startPos; i < length; i++) { + char c = htmlText.charAt(i); + if (c == '>') { + if (prevChar == '/') { + return i - 1; + } + break; + } + prevChar = c; + } + // We didn't find /> at the end of the tag so find </tag> + return htmlText.indexOf("/" + tag, startPos); + } + + public static String fromText(String text, boolean stripHtml) { + // Handle null and empty string + if (TextUtils.isEmpty(text)) return ""; + + final int length = text.length(); + // Use char[] instead of StringBuilder purely for performance; fewer method calls, etc. + char[] buffer = new char[MAX_PLAIN_TEXT_SCAN_LENGTH]; + // skipCount is an array of a single int; that int is set inside stripHtmlEntity and is + // used to determine how many characters can be "skipped" due to the transformation of the + // entity to a single character. When Java allows multiple return values, we can make this + // much cleaner :-) + int[] skipCount = new int[1]; + int bufferCount = 0; + // Start with space as last character to avoid leading whitespace + char last = ' '; + // Indicates whether we're in the middle of an HTML tag + boolean inTag = false; + + // Walk through the text until we're done with the input OR we've got a large enough snippet + for (int i = 0; i < length && bufferCount < MAX_PLAIN_TEXT_SCAN_LENGTH; i++) { + char c = text.charAt(i); + if (stripHtml && !inTag && (c == '<')) { + // Find tags to strip; they will begin with <! or !- or </ or <letter + if (i < (length - 1)) { + char peek = text.charAt(i + 1); + if (peek == '!' || peek == '-' || peek == '/' || Character.isLetter(peek)) { + inTag = true; + // Strip content of title, script, style and applet tags + if (i < (length - (STRIP_TAG_LENGTH + 2))) { + String tag = text.substring(i + 1, i + STRIP_TAG_LENGTH + 1); + String tagLowerCase = tag.toLowerCase(); + boolean stripContent = false; + for (String stripTag: STRIP_TAGS) { + if (tagLowerCase.startsWith(stripTag)) { + stripContent = true; + tag = tag.substring(0, stripTag.length()); + break; + } + } + if (stripContent) { + // Look for the end of this tag + int endTagPosition = findTagEnd(text, tag, i); + if (endTagPosition < 0) { + break; + } else { + i = endTagPosition; + } + } + } + } + } + } else if (stripHtml && inTag && (c == '>')) { + // Terminate stripping here + inTag = false; + continue; + } + + if (inTag) { + // We just skip by everything while we're in a tag + continue; + } else if (stripHtml && (c == '&')) { + // Handle a possible HTML entity here + // We always get back a character to use; we also get back a "skip count", + // indicating how many characters were eaten from the entity + c = stripHtmlEntity(text, i, skipCount); + i += skipCount[0]; + } + + if (Character.isWhitespace(c) || (c == NON_BREAKING_SPACE_CHARACTER)) { + // The idea is to find the content in the message, not the whitespace, so we'll + // turn any combination of contiguous whitespace into a single space + if (last == ' ') { + continue; + } else { + // Make every whitespace character a simple space + c = ' '; + } + } else if ((c == '-' || c == '=') && (last == c)) { + // Lots of messages (especially digests) have whole lines of --- or === + // We'll get rid of those duplicates here + continue; + } + + // After all that, maybe we've got a character for our snippet + buffer[bufferCount++] = c; + last = c; + } + + // Lose trailing space and return our snippet + if ((bufferCount > 0) && (last == ' ')) { + bufferCount--; + } + return new String(buffer, 0, bufferCount); + } + + static /*package*/ char stripHtmlEntity(String text, int pos, int[] skipCount) { + int length = text.length(); + // Ugly, but we store our skip count in this array; we can't use a static here, because + // multiple threads might be calling in + skipCount[0] = 0; + // All entities are <= 8 characters long, so that's how far we'll look for one (+ & and ;) + int end = pos + 10; + String entity = null; + // Isolate the entity + for (int i = pos; (i < length) && (i < end); i++) { + if (text.charAt(i) == ';') { + entity = text.substring(pos, i); + break; + } + } + if (entity == null) { + // This wasn't really an HTML entity + return '&'; + } else { + // Skip count is the length of the entity + Character mapping = ESCAPE_STRINGS.get(entity); + int entityLength = entity.length(); + if (mapping != null) { + skipCount[0] = entityLength; + return mapping; + } else if ((entityLength > 2) && (entity.charAt(1) == '#')) { + // &#nn; means ascii nn (decimal) and &#xnn means ascii nn (hex) + char c = '?'; + try { + int i; + if ((entity.charAt(2) == 'x') && (entityLength > 3)) { + i = Integer.parseInt(entity.substring(3), 16); + } else { + i = Integer.parseInt(entity.substring(2)); + } + c = (char)i; + } catch (NumberFormatException e) { + // We'll just return the ? in this case + } + skipCount[0] = entityLength; + return c; + } + } + // Worst case, we return the original start character, ampersand + return '&'; + } + +} |