1 files changed, 287 insertions, 0 deletions
diff --git a/scripts/generator/cpp/tokenize.py b/scripts/generator/cpp/tokenize.py
new file mode 100755
index 0000000..28c3345
--- /dev/null
+++ b/scripts/generator/cpp/tokenize.py
@@ -0,0 +1,287 @@
+#!/usr/bin/env python
+#
+# Copyright 2007 Neal Norwitz
+# Portions Copyright 2007 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenize C++ source code."""
+
+__author__ = 'nnorwitz@google.com (Neal Norwitz)'
+
+
+try:
+    # Python 3.x
+    import builtins
+except ImportError:
+    # Python 2.x
+    import __builtin__ as builtins
+
+
+import sys
+
+from cpp import utils
+
+
+if not hasattr(builtins, 'set'):
+    # Nominal support for Python 2.3.
+    from sets import Set as set
+
+
+# Add $ as a valid identifier char since so much code uses it.
+_letters = 'abcdefghijklmnopqrstuvwxyz'
+VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
+HEX_DIGITS = set('0123456789abcdefABCDEF')
+INT_OR_FLOAT_DIGITS = set('01234567890eE-+')
+
+
+# C++0x string preffixes.
+_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
+
+
+# Token types.
+UNKNOWN = 'UNKNOWN'
+SYNTAX = 'SYNTAX'
+CONSTANT = 'CONSTANT'
+NAME = 'NAME'
+PREPROCESSOR = 'PREPROCESSOR'
+
+# Where the token originated from.  This can be used for backtracking.
+# It is always set to WHENCE_STREAM in this code.
+WHENCE_STREAM, WHENCE_QUEUE = range(2)
+
+
+class Token(object):
+    """Data container to represent a C++ token.
+
+    Tokens can be identifiers, syntax char(s), constants, or
+    pre-processor directives.
+
+    start contains the index of the first char of the token in the source
+    end contains the index of the last char of the token in the source
+    """
+
+    def __init__(self, token_type, name, start, end):
+        self.token_type = token_type
+        self.name = name
+        self.start = start
+        self.end = end
+        self.whence = WHENCE_STREAM
+
+    def __str__(self):
+        if not utils.DEBUG:
+            return 'Token(%r)' % self.name
+        return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)
+
+    __repr__ = __str__
+
+
+def _GetString(source, start, i):
+    i = source.find('"', i+1)
+    while source[i-1] == '\\':
+        # Count the trailing backslashes.
+        backslash_count = 1
+        j = i - 2
+        while source[j] == '\\':
+            backslash_count += 1
+            j -= 1
+        # When trailing backslashes are even, they escape each other.
+        if (backslash_count % 2) == 0:
+            break
+        i = source.find('"', i+1)
+    return i + 1
+
+
+def _GetChar(source, start, i):
+    # NOTE(nnorwitz): may not be quite correct, should be good enough.
+    i = source.find("'", i+1)
+    while source[i-1] == '\\':
+        # Need to special case '\\'.
+        if (i - 2) > start and source[i-2] == '\\':
+            break
+        i = source.find("'", i+1)
+    # Try to handle unterminated single quotes (in a #if 0 block).
+    if i < 0:
+        i = start
+    return i + 1
+
+
+def GetTokens(source):
+    """Returns a sequence of Tokens.
+
+    Args:
+      source: string of C++ source code.
+
+    Yields:
+      Token that represents the next token in the source.
+    """
+    # Cache various valid character sets for speed.
+    valid_identifier_chars = VALID_IDENTIFIER_CHARS
+    hex_digits = HEX_DIGITS
+    int_or_float_digits = INT_OR_FLOAT_DIGITS
+    int_or_float_digits2 = int_or_float_digits | set('.')
+
+    # Only ignore errors while in a #if 0 block.
+    ignore_errors = False
+    count_ifs = 0
+
+    i = 0
+    end = len(source)
+    while i < end:
+        # Skip whitespace.
+        while i < end and source[i].isspace():
+            i += 1
+        if i >= end:
+            return
+
+        token_type = UNKNOWN
+        start = i
+        c = source[i]
+        if c.isalpha() or c == '_':              # Find a string token.
+            token_type = NAME
+            while source[i] in valid_identifier_chars:
+                i += 1
+            # String and character constants can look like a name if
+            # they are something like L"".
+            if (source[i] == "'" and (i - start) == 1 and
+                source[start:i] in 'uUL'):
+                # u, U, and L are valid C++0x character preffixes.
+                token_type = CONSTANT
+                i = _GetChar(source, start, i)
+            elif source[i] == "'" and source[start:i] in _STR_PREFIXES:
+                token_type = CONSTANT
+                i = _GetString(source, start, i)
+        elif c == '/' and source[i+1] == '/':    # Find // comments.
+            i = source.find('\n', i)
+            if i == -1:  # Handle EOF.
+                i = end
+            continue
+        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
+            i = source.find('*/', i) + 2
+            continue
+        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
+            token_type = SYNTAX
+            i += 1
+            new_ch = source[i]
+            if new_ch == c:
+                i += 1
+            elif c == '-' and new_ch == '>':
+                i += 1
+            elif new_ch == '=':
+                i += 1
+        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
+            token_type = SYNTAX
+            i += 1
+            if c == '.' and source[i].isdigit():
+                token_type = CONSTANT
+                i += 1
+                while source[i] in int_or_float_digits:
+                    i += 1
+                # Handle float suffixes.
+                for suffix in ('l', 'f'):
+                    if suffix == source[i:i+1].lower():
+                        i += 1
+                        break
+        elif c.isdigit():                        # Find integer.
+            token_type = CONSTANT
+            if c == '0' and source[i+1] in 'xX':
+                # Handle hex digits.
+                i += 2
+                while source[i] in hex_digits:
+                    i += 1
+            else:
+                while source[i] in int_or_float_digits2:
+                    i += 1
+            # Handle integer (and float) suffixes.
+            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
+                size = len(suffix)
+                if suffix == source[i:i+size].lower():
+                    i += size
+                    break
+        elif c == '"':                           # Find string.
+            token_type = CONSTANT
+            i = _GetString(source, start, i)
+        elif c == "'":                           # Find char.
+            token_type = CONSTANT
+            i = _GetChar(source, start, i)
+        elif c == '#':                           # Find pre-processor command.
+            token_type = PREPROCESSOR
+            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
+            if got_if:
+                count_ifs += 1
+            elif source[i:i+6] == '#endif':
+                count_ifs -= 1
+                if count_ifs == 0:
+                    ignore_errors = False
+
+            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
+            while 1:
+                i1 = source.find('\n', i)
+                i2 = source.find('//', i)
+                i3 = source.find('/*', i)
+                i4 = source.find('"', i)
+                # NOTE(nnorwitz): doesn't handle comments in #define macros.
+                # Get the first important symbol (newline, comment, EOF/end).
+                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])
+
+                # Handle #include "dir//foo.h" properly.
+                if source[i] == '"':
+                    i = source.find('"', i+1) + 1
+                    assert i > 0
+                    continue
+                # Keep going if end of the line and the line ends with \.
+                if not (i == i1 and source[i-1] == '\\'):
+                    if got_if:
+                        condition = source[start+4:i].lstrip()
+                        if (condition.startswith('0') or
+                            condition.startswith('(0)')):
+                            ignore_errors = True
+                    break
+                i += 1
+        elif c == '\\':                          # Handle \ in code.
+            # This is different from the pre-processor \ handling.
+            i += 1
+            continue
+        elif ignore_errors:
+            # The tokenizer seems to be in pretty good shape.  This
+            # raise is conditionally disabled so that bogus code
+            # in an #if 0 block can be handled.  Since we will ignore
+            # it anyways, this is probably fine.  So disable the
+            # exception and  return the bogus char.
+            i += 1
+        else:
+            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
+                             ('?', i, c, source[i-10:i+10]))
+            raise RuntimeError('unexpected token')
+
+        if i <= 0:
+            print('Invalid index, exiting now.')
+            return
+        yield Token(token_type, source[start:i], start, i)
+
+
+if __name__ == '__main__':
+    def main(argv):
+        """Driver mostly for testing purposes."""
+        for filename in argv[1:]:
+            source = utils.ReadFile(filename)
+            if source is None:
+                continue
+
+            for token in GetTokens(source):
+                print('%-12s: %s' % (token.token_type, token.name))
+                # print('\r%6.2f%%' % (100.0 * index / token.end),)
+            sys.stdout.write('\n')
+
+
+    main(sys.argv)