Class RegexLexer

java.lang.Object
com.oracle.truffle.regex.tregex.parser.RegexLexer
Direct Known Subclasses:
JavaRegexLexer, JSRegexLexer, OracleDBRegexLexer, PythonRegexLexer

public abstract class RegexLexer extends Object
  • Field Details

    • PREDEFINED_CHAR_CLASSES

      protected static final TBitSet PREDEFINED_CHAR_CLASSES
    • DEFAULT_WHITESPACE

      protected static final TBitSet DEFAULT_WHITESPACE
    • source

      public final RegexSource source
    • pattern

      protected final String pattern
      The source of the input pattern.
    • position

      protected int position
      The index of the next character in pattern to be parsed.
    • namedCaptureGroups

      protected Map<String,List<Integer>> namedCaptureGroups
    • compilationBuffer

      protected final CompilationBuffer compilationBuffer
  • Constructor Details

  • Method Details

    • getCompilationBuffer

      public CompilationBuffer getCompilationBuffer()
    • featureEnabledIgnoreCase

      protected abstract boolean featureEnabledIgnoreCase()
      Returns true if ignore-case mode is currently enabled.
    • featureEnabledAZPositionAssertions

      protected abstract boolean featureEnabledAZPositionAssertions()
      Returns true if \A and \Z position assertions are supported.
    • featureEnabledZLowerCaseAssertion

      protected abstract boolean featureEnabledZLowerCaseAssertion()
      Returns true if \z position assertion is supported.
    • featureEnabledBoundedQuantifierEmptyMin

      protected abstract boolean featureEnabledBoundedQuantifierEmptyMin()
      Returns true if empty minimum values in bounded quantifiers (e.g. {,1}) are allowed and treated as zero.
    • featureEnabledPossessiveQuantifiers

      protected abstract boolean featureEnabledPossessiveQuantifiers()
      Returns true if possessive quantifiers (+ suffix) are allowed.
    • featureEnabledCharClassFirstBracketIsLiteral

      protected abstract boolean featureEnabledCharClassFirstBracketIsLiteral()
      Returns true if the first character in a character class must be interpreted as part of the character set, even if it is the closing bracket ']'.
    • featureEnabledCCRangeWithPredefCharClass

      protected abstract boolean featureEnabledCCRangeWithPredefCharClass()
      Try to parse ranges with pre-defined inner character classes, e.g. [\w-a].
    • featureEnabledNestedCharClasses

      protected abstract boolean featureEnabledNestedCharClasses()
      Returns true if nested character classes are supported. This is required for featureEnabledPOSIXCharClasses().
    • featureEnabledPOSIXCharClasses

      protected abstract boolean featureEnabledPOSIXCharClasses()
      Returns true if POSIX character classes, character equivalence classes, and the POSIX Collating Element Operator are supported. Requires featureEnabledNestedCharClasses().
    • getPOSIXCharClass

      protected abstract CodePointSet getPOSIXCharClass(String name)
      Returns the POSIX character class associated to the given name.
    • validatePOSIXCollationElement

      protected abstract void validatePOSIXCollationElement(String sequence)
      Checks if the given string is a valid collation element.
    • validatePOSIXEquivalenceClass

      protected abstract void validatePOSIXEquivalenceClass(String sequence)
      Checks if the given string is a valid equivalence class.
    • featureEnabledForwardReferences

      protected abstract boolean featureEnabledForwardReferences()
      Returns true if forward references are allowed.
    • featureEnabledGroupComments

      protected abstract boolean featureEnabledGroupComments()
      Returns true if group comments (e.g. (# ... )) are supported.
    • featureEnabledLineComments

      protected abstract boolean featureEnabledLineComments()
      Returns true if line comments (e.g. # ... ) are supported.
    • featureEnabledIgnoreWhiteSpace

      protected abstract boolean featureEnabledIgnoreWhiteSpace()
      Returns true if white space in the pattern is ignored. This is relevant only if line comments are not supported.
    • getWhitespace

      protected abstract TBitSet getWhitespace()
      The set of codepoints to consider as whitespace in comments and "ignore white space" mode.
    • featureEnabledOctalEscapes

      protected abstract boolean featureEnabledOctalEscapes()
      Returns true if octal escapes (e.g. \012) are supported.
    • featureEnabledSpecialGroups

      protected abstract boolean featureEnabledSpecialGroups()
      Returns true if any constructs that alter a capture group's function, such as non-capturing groups (?:) or look-around assertions (?=), are supported. If this flag is false, groups starting with a question mark, i.e. (?...), do not have any special meaning.
    • featureEnabledUnicodePropertyEscapes

      protected abstract boolean featureEnabledUnicodePropertyEscapes()
      Returns true if unicode property escapes (e.g. \p{...}) are supported.
    • featureEnabledClassSetExpressions

      protected abstract boolean featureEnabledClassSetExpressions()
      Returns true if class set expressions (e.g. [[\w\q{abc|xyz}]--[a-cx-z]]) are supported.
    • caseFoldUnfold

      protected abstract void caseFoldUnfold(CodePointSetAccumulator charClass)
      Updates a character set by expanding it to the set of characters that case fold to the same characters as the characters currently in the set. This is done by case folding the set and then "unfolding" it by finding all inverse case fold mappings.
    • caseFoldClassSetAtom

      protected abstract ClassSetContents caseFoldClassSetAtom(ClassSetContents classSetContents)
      Case folds an atom in a class set expression. This maps the elements of the expression into their case folded variant.
    • complementClassSet

      protected abstract CodePointSet complementClassSet(CodePointSet codePointSet)
      Returns the complement of a class set element. In ECMAScript, this behavior can vary with the flags.
    • getDotCodePointSet

      protected abstract CodePointSet getDotCodePointSet()
      Returns the code point set represented by the dot operator.
    • getIdStart

      protected abstract CodePointSet getIdStart()
      Returns the set of all codepoints a group identifier may begin with.
    • getIdContinue

      protected abstract CodePointSet getIdContinue()
      Returns the set of all codepoints a group identifier may continue with.
    • getMaxBackReferenceDigits

      protected abstract int getMaxBackReferenceDigits()
      Returns the maximum number of digits to parse when parsing a back-reference.
    • isPredefCharClass

      protected boolean isPredefCharClass(char c)
      Returns true iff the given character is a predefined character class when preceded with a backslash (e.g. \d).
    • getPredefinedCharClass

      protected abstract CodePointSet getPredefinedCharClass(char c)
      Returns the CodePointSet associated with the given predefined character class (e.g. \d).

      Note that the CodePointSet returned by this function has already been case-folded and negated.

    • boundedQuantifierMaxValue

      protected abstract long boundedQuantifierMaxValue()
      The maximum value allowed while parsing bounded quantifiers. Larger values will cause a call to handleBoundedQuantifierOverflow(long, long).
    • handleBoundedQuantifierOutOfOrder

      protected abstract RegexSyntaxException handleBoundedQuantifierOutOfOrder()
      Handle {2,1}.
    • handleBoundedQuantifierEmptyOrMissingMin

      protected abstract Token handleBoundedQuantifierEmptyOrMissingMin()
      Handle missing } or minimum value in bounded quantifiers.
    • handleBoundedQuantifierInvalidCharacter

      protected abstract Token handleBoundedQuantifierInvalidCharacter()
      Handle non-digit characters in bounded quantifiers.
    • handleBoundedQuantifierOverflow

      protected abstract Token handleBoundedQuantifierOverflow(long min, long max)
      Handle integer overflows in quantifier bounds, e.g. {2147483649}. If this method returns a non-null value, it will be returned instead of the current quantifier.
    • handleBoundedQuantifierOverflowMin

      protected abstract Token handleBoundedQuantifierOverflowMin(long min, long max)
      Handle integer overflows in quantifier bounds, e.g. {2147483649}. If this method returns a non-null value, it will be returned instead of the current quantifier. This method is called when no explicit max value is present.
    • handleCCRangeOutOfOrder

      protected abstract RegexSyntaxException handleCCRangeOutOfOrder(int startPos)
      Handle out of order character class range elements, e.g. [b-a].
    • handleCCRangeWithPredefCharClass

      protected abstract void handleCCRangeWithPredefCharClass(int startPos, ClassSetContents firstAtom, ClassSetContents secondAtom)
      Handle non-codepoint character class range elements, e.g. [\w-a].
    • handleComplementOfStringSet

      protected abstract RegexSyntaxException handleComplementOfStringSet()
      Handle complement of class set expressions containing strings, e.g. [^\q{abc}] or \P{RGI_Emoji}.
    • handleGroupRedefinition

      protected abstract void handleGroupRedefinition(String name, int newId, int oldId)
    • handleIncompleteEscapeX

      protected abstract void handleIncompleteEscapeX()
      Handle incomplete hex escapes, e.g. \x1.
    • handleInvalidBackReference

      protected abstract Token handleInvalidBackReference(int reference)
      Handle group references to non-existent groups.
    • handleInvalidCharInCharClass

      protected abstract RegexSyntaxException handleInvalidCharInCharClass()
    • handleInvalidGroupBeginQ

      protected abstract RegexSyntaxException handleInvalidGroupBeginQ()
      Handle groups starting with (? and invalid next char.
    • handleMixedClassSetOperators

      protected abstract RegexSyntaxException handleMixedClassSetOperators(RegexLexer.ClassSetOperator leftOperator, RegexLexer.ClassSetOperator rightOperator)
      Handle class set expressions with mixed set operators in the same nested set.
    • handleMissingClassSetOperand

      protected abstract RegexSyntaxException handleMissingClassSetOperand(RegexLexer.ClassSetOperator operator)
      Handle missing operands in class set expressions, e.g. [\s&&] or [\w--].
    • handleOctalOutOfRange

      protected abstract void handleOctalOutOfRange()
      Handle octal values larger than 255.
    • handleRangeAsClassSetOperand

      protected abstract RegexSyntaxException handleRangeAsClassSetOperand(RegexLexer.ClassSetOperator operator)
      Handle character ranges as operands in class set expressions with operators other than union.
    • handleUnfinishedEscape

      protected abstract void handleUnfinishedEscape()
      Handle unfinished escape (e.g. \).
    • handleUnfinishedGroupComment

      protected abstract void handleUnfinishedGroupComment()
      Handle unfinished group comment (#...).
    • handleUnfinishedGroupQ

      protected abstract RegexSyntaxException handleUnfinishedGroupQ()
      Handle unfinished group with question mark (?.
    • handleUnfinishedRangeInClassSet

      protected abstract RegexSyntaxException handleUnfinishedRangeInClassSet()
      Handle unfinished range in class set expression [a-].
    • handleUnmatchedRightBrace

      protected abstract void handleUnmatchedRightBrace()
      Handle unmatched }.
    • handleUnmatchedLeftBracket

      protected abstract RegexSyntaxException handleUnmatchedLeftBracket()
      Handle unmatched [.
    • handleUnmatchedRightBracket

      protected abstract void handleUnmatchedRightBracket()
      Handle unmatched ].
    • checkClassSetCharacter

      protected abstract void checkClassSetCharacter(int codePoint) throws RegexSyntaxException
      Checks whether codepoint can appear as an unescaped literal class set character.
      Throws:
      RegexSyntaxException
    • parseCodePointInGroupName

      protected abstract int parseCodePointInGroupName() throws RegexSyntaxException
      Parse the next codepoint in a group name and return it.
      Throws:
      RegexSyntaxException
    • parseCustomEscape

      protected abstract Token parseCustomEscape(char c)
      Parse any escape sequence starting with \ and the argument c.
    • parseCustomEscapeChar

      protected abstract int parseCustomEscapeChar(char c, boolean inCharClass)
      Parse an escape character sequence (inside character class, or other escapes have already been tried) starting with \ and the argument c.
    • parseCustomEscapeCharFallback

      protected abstract int parseCustomEscapeCharFallback(int c, boolean inCharClass)
      Parse an escape character sequence (inside character class, or other escapes have already been tried) starting with \ and the code point c. This method is called after all other means of parsing the escape sequence have been exhausted.
    • parseCustomGroupBeginQ

      protected abstract Token parseCustomGroupBeginQ(char charAfterQuestionMark)
      Parse group starting with (?.
    • parseGroupLt

      protected abstract Token parseGroupLt()
      Parse group starting with (<.
    • findChars

      protected boolean findChars(char... chars)
    • advance

      protected void advance()
    • retreat

      protected void retreat()
    • hasNext

      public boolean hasNext()
    • next

      public Token next() throws RegexSyntaxException
      Throws:
      RegexSyntaxException
    • getLastTokenPosition

      public int getLastTokenPosition()
      Returns the last token's position in the pattern string.
    • getLastCharacterClassBeginPosition

      public int getLastCharacterClassBeginPosition()
    • getLastAtomPosition

      protected int getLastAtomPosition()
    • curChar

      protected char curChar()
    • consumeChar

      protected char consumeChar()
    • advance

      protected void advance(int len)
    • lookahead

      protected boolean lookahead(String match)
    • lookahead

      protected boolean lookahead(Predicate<Character> predicate, int length)
    • consumingLookahead

      protected boolean consumingLookahead(char character)
    • consumingLookahead

      protected boolean consumingLookahead(String match)
    • consumingLookahead

      protected boolean consumingLookahead(Predicate<Character> predicate, int length)
    • lookbehind

      protected boolean lookbehind(char c)
    • count

      protected int count(Predicate<Character> predicate)
    • countUpTo

      protected int countUpTo(Predicate<Character> predicate, int max)
    • countFrom

      protected int countFrom(Predicate<Character> predicate, int fromIndex)
    • count

      protected int count(Predicate<Character> predicate, int fromIndex, int toIndex)
    • atEnd

      protected boolean atEnd()
    • inCharacterClass

      public boolean inCharacterClass()
    • isCurCharClassInverted

      public boolean isCurCharClassInverted()
    • getNumberOfParsedGroups

      protected int getNumberOfParsedGroups()
      Get the number of capture groups parsed so far.
    • totalNumberOfCaptureGroups

      public int totalNumberOfCaptureGroups() throws RegexSyntaxException
      Throws:
      RegexSyntaxException
    • numberOfCaptureGroupsSoFar

      public int numberOfCaptureGroupsSoFar()
    • getNamedCaptureGroups

      public Map<String,List<Integer>> getNamedCaptureGroups() throws RegexSyntaxException
      Throws:
      RegexSyntaxException
    • hasNamedCaptureGroups

      protected boolean hasNamedCaptureGroups() throws RegexSyntaxException
      Checks whether this regular expression contains any named capture groups.

      This method is a way to check whether we are parsing the goal symbol Pattern[~U, +N] or Pattern[~U, ~N] (see the ECMAScript RegExp grammar).

      Throws:
      RegexSyntaxException
    • registerNamedCaptureGroup

      protected void registerNamedCaptureGroup(String name)
    • getSingleNamedGroupNumber

      protected int getSingleNamedGroupNumber(String name)
    • literalChar

      protected Token literalChar(int codePoint)
    • parseGroupName

      protected RegexLexer.ParseGroupNameResult parseGroupName(char terminator) throws RegexSyntaxException
      Parse a GroupName, i.e. <RegExpIdentifierName>, assuming that the opening < bracket was already read.
      Returns:
      the StringValue of the RegExpIdentifierName
      Throws:
      RegexSyntaxException
    • parseIntSaturated

      protected int parseIntSaturated(int firstDigit, int length, int returnOnOverflow)
    • parseIntSaturated

      protected long parseIntSaturated(int firstDigit, int length, int returnOnOverflow, long maxValue)
    • countDecimalDigits

      protected int countDecimalDigits()
    • parseCharClassAtomPredefCharClass

      protected ClassSetContents parseCharClassAtomPredefCharClass(char c) throws RegexSyntaxException
      Throws:
      RegexSyntaxException
    • parseCharClassAtomCodePoint

      protected int parseCharClassAtomCodePoint(char c) throws RegexSyntaxException
      Throws:
      RegexSyntaxException
    • parseClassSetExpression

      protected ClassSetContents parseClassSetExpression() throws RegexSyntaxException
      Throws:
      RegexSyntaxException
    • parseUnicodeCharacterProperty

      protected ClassSetContents parseUnicodeCharacterProperty(boolean invert) throws RegexSyntaxException
      Throws:
      RegexSyntaxException
    • finishSurrogatePair

      protected int finishSurrogatePair(char c)
    • parseOctal

      protected int parseOctal(int firstDigit, int maxDigits)
    • parseHex

      protected int parseHex(int minDigits, int maxDigits, int maxValue, Runnable handleTooFewDigits, Runnable handleValueTooLarge)
    • syntaxError

      public RegexSyntaxException syntaxError(String msg)
    • isDecimalDigit

      public static boolean isDecimalDigit(int c)
    • isOctalDigit

      public static boolean isOctalDigit(int c)
    • isHexDigit

      public static boolean isHexDigit(int c)
    • isAscii

      public static boolean isAscii(int c)