aboutsummaryrefslogtreecommitdiffstats
path: root/scripts/generator/cpp/tokenize.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/generator/cpp/tokenize.py')
-rwxr-xr-xscripts/generator/cpp/tokenize.py287
1 files changed, 287 insertions, 0 deletions
diff --git a/scripts/generator/cpp/tokenize.py b/scripts/generator/cpp/tokenize.py
new file mode 100755
index 0000000..28c3345
--- /dev/null
+++ b/scripts/generator/cpp/tokenize.py
@@ -0,0 +1,287 @@
+#!/usr/bin/env python
+#
+# Copyright 2007 Neal Norwitz
+# Portions Copyright 2007 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenize C++ source code."""
+
+__author__ = 'nnorwitz@google.com (Neal Norwitz)'
+
+
+try:
+ # Python 3.x
+ import builtins
+except ImportError:
+ # Python 2.x
+ import __builtin__ as builtins
+
+
+import sys
+
+from cpp import utils
+
+
+if not hasattr(builtins, 'set'):
+ # Nominal support for Python 2.3.
+ from sets import Set as set
+
+
+# Add $ as a valid identifier char since so much code uses it.
+_letters = 'abcdefghijklmnopqrstuvwxyz'
+VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
+HEX_DIGITS = set('0123456789abcdefABCDEF')
+INT_OR_FLOAT_DIGITS = set('01234567890eE-+')
+
+
+# C++0x string preffixes.
+_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
+
+
+# Token types.
+UNKNOWN = 'UNKNOWN'
+SYNTAX = 'SYNTAX'
+CONSTANT = 'CONSTANT'
+NAME = 'NAME'
+PREPROCESSOR = 'PREPROCESSOR'
+
+# Where the token originated from. This can be used for backtracking.
+# It is always set to WHENCE_STREAM in this code.
+WHENCE_STREAM, WHENCE_QUEUE = range(2)
+
+
+class Token(object):
+ """Data container to represent a C++ token.
+
+ Tokens can be identifiers, syntax char(s), constants, or
+ pre-processor directives.
+
+ start contains the index of the first char of the token in the source
+ end contains the index of the last char of the token in the source
+ """
+
+ def __init__(self, token_type, name, start, end):
+ self.token_type = token_type
+ self.name = name
+ self.start = start
+ self.end = end
+ self.whence = WHENCE_STREAM
+
+ def __str__(self):
+ if not utils.DEBUG:
+ return 'Token(%r)' % self.name
+ return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)
+
+ __repr__ = __str__
+
+
+def _GetString(source, start, i):
+ i = source.find('"', i+1)
+ while source[i-1] == '\\':
+ # Count the trailing backslashes.
+ backslash_count = 1
+ j = i - 2
+ while source[j] == '\\':
+ backslash_count += 1
+ j -= 1
+ # When trailing backslashes are even, they escape each other.
+ if (backslash_count % 2) == 0:
+ break
+ i = source.find('"', i+1)
+ return i + 1
+
+
+def _GetChar(source, start, i):
+ # NOTE(nnorwitz): may not be quite correct, should be good enough.
+ i = source.find("'", i+1)
+ while source[i-1] == '\\':
+ # Need to special case '\\'.
+ if (i - 2) > start and source[i-2] == '\\':
+ break
+ i = source.find("'", i+1)
+ # Try to handle unterminated single quotes (in a #if 0 block).
+ if i < 0:
+ i = start
+ return i + 1
+
+
+def GetTokens(source):
+ """Returns a sequence of Tokens.
+
+ Args:
+ source: string of C++ source code.
+
+ Yields:
+ Token that represents the next token in the source.
+ """
+ # Cache various valid character sets for speed.
+ valid_identifier_chars = VALID_IDENTIFIER_CHARS
+ hex_digits = HEX_DIGITS
+ int_or_float_digits = INT_OR_FLOAT_DIGITS
+ int_or_float_digits2 = int_or_float_digits | set('.')
+
+ # Only ignore errors while in a #if 0 block.
+ ignore_errors = False
+ count_ifs = 0
+
+ i = 0
+ end = len(source)
+ while i < end:
+ # Skip whitespace.
+ while i < end and source[i].isspace():
+ i += 1
+ if i >= end:
+ return
+
+ token_type = UNKNOWN
+ start = i
+ c = source[i]
+ if c.isalpha() or c == '_': # Find a string token.
+ token_type = NAME
+ while source[i] in valid_identifier_chars:
+ i += 1
+ # String and character constants can look like a name if
+ # they are something like L"".
+ if (source[i] == "'" and (i - start) == 1 and
+ source[start:i] in 'uUL'):
+ # u, U, and L are valid C++0x character preffixes.
+ token_type = CONSTANT
+ i = _GetChar(source, start, i)
+ elif source[i] == "'" and source[start:i] in _STR_PREFIXES:
+ token_type = CONSTANT
+ i = _GetString(source, start, i)
+ elif c == '/' and source[i+1] == '/': # Find // comments.
+ i = source.find('\n', i)
+ if i == -1: # Handle EOF.
+ i = end
+ continue
+ elif c == '/' and source[i+1] == '*': # Find /* comments. */
+ i = source.find('*/', i) + 2
+ continue
+ elif c in ':+-<>&|*=': # : or :: (plus other chars).
+ token_type = SYNTAX
+ i += 1
+ new_ch = source[i]
+ if new_ch == c:
+ i += 1
+ elif c == '-' and new_ch == '>':
+ i += 1
+ elif new_ch == '=':
+ i += 1
+ elif c in '()[]{}~!?^%;/.,': # Handle single char tokens.
+ token_type = SYNTAX
+ i += 1
+ if c == '.' and source[i].isdigit():
+ token_type = CONSTANT
+ i += 1
+ while source[i] in int_or_float_digits:
+ i += 1
+ # Handle float suffixes.
+ for suffix in ('l', 'f'):
+ if suffix == source[i:i+1].lower():
+ i += 1
+ break
+ elif c.isdigit(): # Find integer.
+ token_type = CONSTANT
+ if c == '0' and source[i+1] in 'xX':
+ # Handle hex digits.
+ i += 2
+ while source[i] in hex_digits:
+ i += 1
+ else:
+ while source[i] in int_or_float_digits2:
+ i += 1
+ # Handle integer (and float) suffixes.
+ for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
+ size = len(suffix)
+ if suffix == source[i:i+size].lower():
+ i += size
+ break
+ elif c == '"': # Find string.
+ token_type = CONSTANT
+ i = _GetString(source, start, i)
+ elif c == "'": # Find char.
+ token_type = CONSTANT
+ i = _GetChar(source, start, i)
+ elif c == '#': # Find pre-processor command.
+ token_type = PREPROCESSOR
+ got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
+ if got_if:
+ count_ifs += 1
+ elif source[i:i+6] == '#endif':
+ count_ifs -= 1
+ if count_ifs == 0:
+ ignore_errors = False
+
+ # TODO(nnorwitz): handle preprocessor statements (\ continuations).
+ while 1:
+ i1 = source.find('\n', i)
+ i2 = source.find('//', i)
+ i3 = source.find('/*', i)
+ i4 = source.find('"', i)
+ # NOTE(nnorwitz): doesn't handle comments in #define macros.
+ # Get the first important symbol (newline, comment, EOF/end).
+ i = min([x for x in (i1, i2, i3, i4, end) if x != -1])
+
+ # Handle #include "dir//foo.h" properly.
+ if source[i] == '"':
+ i = source.find('"', i+1) + 1
+ assert i > 0
+ continue
+ # Keep going if end of the line and the line ends with \.
+ if not (i == i1 and source[i-1] == '\\'):
+ if got_if:
+ condition = source[start+4:i].lstrip()
+ if (condition.startswith('0') or
+ condition.startswith('(0)')):
+ ignore_errors = True
+ break
+ i += 1
+ elif c == '\\': # Handle \ in code.
+ # This is different from the pre-processor \ handling.
+ i += 1
+ continue
+ elif ignore_errors:
+ # The tokenizer seems to be in pretty good shape. This
+ # raise is conditionally disabled so that bogus code
+ # in an #if 0 block can be handled. Since we will ignore
+ # it anyways, this is probably fine. So disable the
+ # exception and return the bogus char.
+ i += 1
+ else:
+ sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
+ ('?', i, c, source[i-10:i+10]))
+ raise RuntimeError('unexpected token')
+
+ if i <= 0:
+ print('Invalid index, exiting now.')
+ return
+ yield Token(token_type, source[start:i], start, i)
+
+
+if __name__ == '__main__':
+ def main(argv):
+ """Driver mostly for testing purposes."""
+ for filename in argv[1:]:
+ source = utils.ReadFile(filename)
+ if source is None:
+ continue
+
+ for token in GetTokens(source):
+ print('%-12s: %s' % (token.token_type, token.name))
+ # print('\r%6.2f%%' % (100.0 * index / token.end),)
+ sys.stdout.write('\n')
+
+
+ main(sys.argv)