diff options
Diffstat (limited to 'src/scanner.h')
-rw-r--r-- | src/scanner.h | 353 |
1 files changed, 172 insertions, 181 deletions
diff --git a/src/scanner.h b/src/scanner.h index 045e7d27..16c3a427 100644 --- a/src/scanner.h +++ b/src/scanner.h @@ -41,26 +41,6 @@ namespace v8 { namespace internal { - -// General collection of (multi-)bit-flags that can be passed to scanners and -// parsers to signify their (initial) mode of operation. -enum ParsingFlags { - kNoParsingFlags = 0, - // Embed LanguageMode values in parsing flags, i.e., equivalent to: - // CLASSIC_MODE = 0, - // STRICT_MODE, - // EXTENDED_MODE, - kLanguageModeMask = 0x03, - kAllowLazy = 0x04, - kAllowNativesSyntax = 0x08, - kAllowModules = 0x10 -}; - -STATIC_ASSERT((kLanguageModeMask & CLASSIC_MODE) == CLASSIC_MODE); -STATIC_ASSERT((kLanguageModeMask & STRICT_MODE) == STRICT_MODE); -STATIC_ASSERT((kLanguageModeMask & EXTENDED_MODE) == EXTENDED_MODE); - - // Returns the value (0 .. 15) of a hexadecimal character c. // If c is not a legal hexadecimal character, returns a value < 0. inline int HexValue(uc32 c) { @@ -73,17 +53,15 @@ inline int HexValue(uc32 c) { // --------------------------------------------------------------------- -// Buffered stream of UTF-16 code units, using an internal UTF-16 buffer. -// A code unit is a 16 bit value representing either a 16 bit code point -// or one part of a surrogate pair that make a single 21 bit code point. +// Buffered stream of characters, using an internal UC16 buffer. -class Utf16CharacterStream { +class UC16CharacterStream { public: - Utf16CharacterStream() : pos_(0) { } - virtual ~Utf16CharacterStream() { } + UC16CharacterStream() : pos_(0) { } + virtual ~UC16CharacterStream() { } - // Returns and advances past the next UTF-16 code unit in the input - // stream. If there are no more code units, it returns a negative + // Returns and advances past the next UC16 character in the input + // stream. If there are no more characters, it returns a negative // value. inline uc32 Advance() { if (buffer_cursor_ < buffer_end_ || ReadBlock()) { @@ -92,47 +70,47 @@ class Utf16CharacterStream { } // Note: currently the following increment is necessary to avoid a // parser problem! The scanner treats the final kEndOfInput as - // a code unit with a position, and does math relative to that + // a character with a position, and does math relative to that // position. pos_++; return kEndOfInput; } - // Return the current position in the code unit stream. + // Return the current position in the character stream. // Starts at zero. inline unsigned pos() const { return pos_; } - // Skips forward past the next code_unit_count UTF-16 code units + // Skips forward past the next character_count UC16 characters // in the input, or until the end of input if that comes sooner. - // Returns the number of code units actually skipped. If less - // than code_unit_count, - inline unsigned SeekForward(unsigned code_unit_count) { + // Returns the number of characters actually skipped. If less + // than character_count, + inline unsigned SeekForward(unsigned character_count) { unsigned buffered_chars = static_cast<unsigned>(buffer_end_ - buffer_cursor_); - if (code_unit_count <= buffered_chars) { - buffer_cursor_ += code_unit_count; - pos_ += code_unit_count; - return code_unit_count; + if (character_count <= buffered_chars) { + buffer_cursor_ += character_count; + pos_ += character_count; + return character_count; } - return SlowSeekForward(code_unit_count); + return SlowSeekForward(character_count); } - // Pushes back the most recently read UTF-16 code unit (or negative + // Pushes back the most recently read UC16 character (or negative // value if at end of input), i.e., the value returned by the most recent // call to Advance. // Must not be used right after calling SeekForward. - virtual void PushBack(int32_t code_unit) = 0; + virtual void PushBack(int32_t character) = 0; protected: static const uc32 kEndOfInput = -1; - // Ensures that the buffer_cursor_ points to the code_unit at + // Ensures that the buffer_cursor_ points to the character at // position pos_ of the input, if possible. If the position // is at or after the end of the input, return false. If there - // are more code_units available, return true. + // are more characters available, return true. virtual bool ReadBlock() = 0; - virtual unsigned SlowSeekForward(unsigned code_unit_count) = 0; + virtual unsigned SlowSeekForward(unsigned character_count) = 0; const uc16* buffer_cursor_; const uc16* buffer_end_; @@ -180,24 +158,23 @@ class LiteralBuffer { } } - INLINE(void AddChar(uint32_t code_unit)) { + inline void AddChar(uc16 character) { if (position_ >= backing_store_.length()) ExpandBuffer(); if (is_ascii_) { - if (code_unit < kMaxAsciiCharCodeU) { - backing_store_[position_] = static_cast<byte>(code_unit); + if (character < kMaxAsciiCharCodeU) { + backing_store_[position_] = static_cast<byte>(character); position_ += kASCIISize; return; } - ConvertToUtf16(); + ConvertToUC16(); } - ASSERT(code_unit < 0x10000u); - *reinterpret_cast<uc16*>(&backing_store_[position_]) = code_unit; + *reinterpret_cast<uc16*>(&backing_store_[position_]) = character; position_ += kUC16Size; } bool is_ascii() { return is_ascii_; } - Vector<const uc16> utf16_literal() { + Vector<const uc16> uc16_literal() { ASSERT(!is_ascii_); ASSERT((position_ & 0x1) == 0); return Vector<const uc16>( @@ -239,13 +216,13 @@ class LiteralBuffer { backing_store_ = new_store; } - void ConvertToUtf16() { + void ConvertToUC16() { ASSERT(is_ascii_); Vector<byte> new_store; int new_content_size = position_ * kUC16Size; if (new_content_size >= backing_store_.length()) { - // Ensure room for all currently read code units as UC16 as well - // as the code unit about to be stored. + // Ensure room for all currently read characters as UC16 as well + // as the character about to be stored. new_store = Vector<byte>::New(NewCapacity(new_content_size)); } else { new_store = backing_store_; @@ -272,32 +249,35 @@ class LiteralBuffer { // ---------------------------------------------------------------------------- -// JavaScript Scanner. +// Scanner base-class. +// Generic functionality used by both JSON and JavaScript scanners. class Scanner { public: - // Scoped helper for literal recording. Automatically drops the literal - // if aborting the scanning before it's complete. + // -1 is outside of the range of any real source code. + static const int kNoOctalLocation = -1; + + typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; + class LiteralScope { public: - explicit LiteralScope(Scanner* self) - : scanner_(self), complete_(false) { - scanner_->StartLiteral(); - } - ~LiteralScope() { - if (!complete_) scanner_->DropLiteral(); - } - void Complete() { - scanner_->TerminateLiteral(); - complete_ = true; - } + explicit LiteralScope(Scanner* self); + ~LiteralScope(); + void Complete(); private: Scanner* scanner_; bool complete_; }; - // Representation of an interval of source positions. + explicit Scanner(UnicodeCache* scanner_contants); + + // Returns the current token again. + Token::Value current_token() { return current_.token; } + + // One token look-ahead (past the token returned by Next()). + Token::Value peek() const { return next_.token; } + struct Location { Location(int b, int e) : beg_pos(b), end_pos(e) { } Location() : beg_pos(0), end_pos(0) { } @@ -312,39 +292,28 @@ class Scanner { int end_pos; }; - // -1 is outside of the range of any real source code. - static const int kNoOctalLocation = -1; - - typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; - - explicit Scanner(UnicodeCache* scanner_contants); - - void Initialize(Utf16CharacterStream* source); - - // Returns the next token and advances input. - Token::Value Next(); - // Returns the current token again. - Token::Value current_token() { return current_.token; } // Returns the location information for the current token - // (the token last returned by Next()). + // (the token returned by Next()). Location location() const { return current_.location; } + Location peek_location() const { return next_.location; } + // Returns the literal string, if any, for the current token (the - // token last returned by Next()). The string is 0-terminated. - // Literal strings are collected for identifiers, strings, and - // numbers. + // token returned by Next()). The string is 0-terminated and in + // UTF-8 format; they may contain 0-characters. Literal strings are + // collected for identifiers, strings, and numbers. // These functions only give the correct result if the literal // was scanned between calls to StartLiteral() and TerminateLiteral(). - Vector<const char> literal_ascii_string() { + bool is_literal_ascii() { ASSERT_NOT_NULL(current_.literal_chars); - return current_.literal_chars->ascii_literal(); + return current_.literal_chars->is_ascii(); } - Vector<const uc16> literal_utf16_string() { + Vector<const char> literal_ascii_string() { ASSERT_NOT_NULL(current_.literal_chars); - return current_.literal_chars->utf16_literal(); + return current_.literal_chars->ascii_literal(); } - bool is_literal_ascii() { + Vector<const uc16> literal_uc16_string() { ASSERT_NOT_NULL(current_.literal_chars); - return current_.literal_chars->is_ascii(); + return current_.literal_chars->uc16_literal(); } int literal_length() const { ASSERT_NOT_NULL(current_.literal_chars); @@ -361,26 +330,19 @@ class Scanner { return current_.literal_chars->length() != source_length; } - // Similar functions for the upcoming token. - - // One token look-ahead (past the token returned by Next()). - Token::Value peek() const { return next_.token; } - - Location peek_location() const { return next_.location; } - // Returns the literal string for the next token (the token that // would be returned if Next() were called). - Vector<const char> next_literal_ascii_string() { + bool is_next_literal_ascii() { ASSERT_NOT_NULL(next_.literal_chars); - return next_.literal_chars->ascii_literal(); + return next_.literal_chars->is_ascii(); } - Vector<const uc16> next_literal_utf16_string() { + Vector<const char> next_literal_ascii_string() { ASSERT_NOT_NULL(next_.literal_chars); - return next_.literal_chars->utf16_literal(); + return next_.literal_chars->ascii_literal(); } - bool is_next_literal_ascii() { + Vector<const uc16> next_literal_uc16_string() { ASSERT_NOT_NULL(next_.literal_chars); - return next_.literal_chars->is_ascii(); + return next_.literal_chars->uc16_literal(); } int next_literal_length() const { ASSERT_NOT_NULL(next_.literal_chars); @@ -391,52 +353,7 @@ class Scanner { static const int kCharacterLookaheadBufferSize = 1; - // Scans octal escape sequence. Also accepts "\0" decimal escape sequence. - uc32 ScanOctalEscape(uc32 c, int length); - - // Returns the location of the last seen octal literal. - Location octal_position() const { return octal_pos_; } - void clear_octal_position() { octal_pos_ = Location::invalid(); } - - // Seek forward to the given position. This operation does not - // work in general, for instance when there are pushed back - // characters, but works for seeking forward until simple delimiter - // tokens, which is what it is used for. - void SeekForward(int pos); - - bool HarmonyScoping() const { - return harmony_scoping_; - } - void SetHarmonyScoping(bool scoping) { - harmony_scoping_ = scoping; - } - bool HarmonyModules() const { - return harmony_modules_; - } - void SetHarmonyModules(bool modules) { - harmony_modules_ = modules; - } - - - // Returns true if there was a line terminator before the peek'ed token, - // possibly inside a multi-line comment. - bool HasAnyLineTerminatorBeforeNext() const { - return has_line_terminator_before_next_ || - has_multiline_comment_before_next_; - } - - // Scans the input as a regular expression pattern, previous - // character(s) must be /(=). Returns true if a pattern is scanned. - bool ScanRegExpPattern(bool seen_equal); - // Returns true if regexp flags are scanned (always since flags can - // be empty). - bool ScanRegExpFlags(); - - // Tells whether the buffer contains an identifier (no escapes). - // Used for checking if a property name is an identifier. - static bool IsIdentifier(unibrow::CharacterStream* buffer); - - private: + protected: // The current and look-ahead token. struct TokenDesc { Token::Value token; @@ -461,7 +378,7 @@ class Scanner { next_.literal_chars = free_buffer; } - INLINE(void AddLiteralChar(uc32 c)) { + inline void AddLiteralChar(uc32 c) { ASSERT_NOT_NULL(next_.literal_chars); next_.literal_chars->AddChar(c); } @@ -506,14 +423,107 @@ class Scanner { uc32 ScanHexNumber(int expected_length); - // Scans a single JavaScript token. - void Scan(); + // Return the current source position. + int source_pos() { + return source_->pos() - kCharacterLookaheadBufferSize; + } + + UnicodeCache* unicode_cache_; + + // Buffers collecting literal strings, numbers, etc. + LiteralBuffer literal_buffer1_; + LiteralBuffer literal_buffer2_; + + TokenDesc current_; // desc for current token (as returned by Next()) + TokenDesc next_; // desc for next token (one token look-ahead) + + // Input stream. Must be initialized to an UC16CharacterStream. + UC16CharacterStream* source_; + + // One Unicode character look-ahead; c0_ < 0 at the end of the input. + uc32 c0_; +}; + +// ---------------------------------------------------------------------------- +// JavaScriptScanner - base logic for JavaScript scanning. + +class JavaScriptScanner : public Scanner { + public: + // A LiteralScope that disables recording of some types of JavaScript + // literals. If the scanner is configured to not record the specific + // type of literal, the scope will not call StartLiteral. + class LiteralScope { + public: + explicit LiteralScope(JavaScriptScanner* self) + : scanner_(self), complete_(false) { + scanner_->StartLiteral(); + } + ~LiteralScope() { + if (!complete_) scanner_->DropLiteral(); + } + void Complete() { + scanner_->TerminateLiteral(); + complete_ = true; + } + + private: + JavaScriptScanner* scanner_; + bool complete_; + }; + + explicit JavaScriptScanner(UnicodeCache* scanner_contants); + + void Initialize(UC16CharacterStream* source); + // Returns the next token. + Token::Value Next(); + + // Returns true if there was a line terminator before the peek'ed token, + // possibly inside a multi-line comment. + bool HasAnyLineTerminatorBeforeNext() const { + return has_line_terminator_before_next_ || + has_multiline_comment_before_next_; + } + + // Scans the input as a regular expression pattern, previous + // character(s) must be /(=). Returns true if a pattern is scanned. + bool ScanRegExpPattern(bool seen_equal); + // Returns true if regexp flags are scanned (always since flags can + // be empty). + bool ScanRegExpFlags(); + + // Tells whether the buffer contains an identifier (no escapes). + // Used for checking if a property name is an identifier. + static bool IsIdentifier(unibrow::CharacterStream* buffer); + + // Scans octal escape sequence. Also accepts "\0" decimal escape sequence. + uc32 ScanOctalEscape(uc32 c, int length); + + // Returns the location of the last seen octal literal + Location octal_position() const { return octal_pos_; } + void clear_octal_position() { octal_pos_ = Location::invalid(); } + + // Seek forward to the given position. This operation does not + // work in general, for instance when there are pushed back + // characters, but works for seeking forward until simple delimiter + // tokens, which is what it is used for. + void SeekForward(int pos); + + bool HarmonyBlockScoping() const { + return harmony_block_scoping_; + } + void SetHarmonyBlockScoping(bool block_scoping) { + harmony_block_scoping_ = block_scoping; + } + + + protected: bool SkipWhiteSpace(); Token::Value SkipSingleLineComment(); Token::Value SkipMultiLineComment(); - // Scans a possible HTML comment -- begins with '<!'. - Token::Value ScanHtmlComment(); + + // Scans a single JavaScript token. + void Scan(); void ScanDecimalDigits(); Token::Value ScanNumber(bool seen_period); @@ -523,6 +533,9 @@ class Scanner { void ScanEscape(); Token::Value ScanString(); + // Scans a possible HTML comment -- begins with '<!'. + Token::Value ScanHtmlComment(); + // Decodes a unicode escape-sequence which is part of an identifier. // If the escape sequence cannot be decoded the result is kBadChar. uc32 ScanIdentifierUnicodeEscape(); @@ -531,30 +544,9 @@ class Scanner { // flags. bool ScanLiteralUnicodeEscape(); - // Return the current source position. - int source_pos() { - return source_->pos() - kCharacterLookaheadBufferSize; - } - - UnicodeCache* unicode_cache_; - - // Buffers collecting literal strings, numbers, etc. - LiteralBuffer literal_buffer1_; - LiteralBuffer literal_buffer2_; - - TokenDesc current_; // desc for current token (as returned by Next()) - TokenDesc next_; // desc for next token (one token look-ahead) - - // Input stream. Must be initialized to an Utf16CharacterStream. - Utf16CharacterStream* source_; - - // Start position of the octal literal last scanned. Location octal_pos_; - // One Unicode character look-ahead; c0_ < 0 at the end of the input. - uc32 c0_; - // Whether there is a line terminator whitespace character after // the current token, and before the next. Does not count newlines // inside multiline comments. @@ -562,10 +554,9 @@ class Scanner { // Whether there is a multi-line comment that contains a // line-terminator after the current token, and before the next. bool has_multiline_comment_before_next_; - // Whether we scan 'let' as a keyword for harmony block-scoped let bindings. - bool harmony_scoping_; - // Whether we scan 'module', 'import', 'export' as keywords. - bool harmony_modules_; + // Whether we scan 'let' as a keyword for harmony block scoped + // let bindings. + bool harmony_block_scoping_; }; } } // namespace v8::internal |