package syntax

import (
	"strings"
	"unicode"
	"unicode/utf8"
)

type token struct {
	kind tokenKind
	pos  Position
}

func (tok token) String() string {
	return tok.kind.String()
}

type tokenKind byte

//go:generate stringer -type=tokenKind -trimprefix=tok -linecomment=true
const (
	tokNone tokenKind = iota
	tokChar
	tokGroupFlags
	tokPosixClass
	tokConcat
	tokRepeat
	tokEscapeChar
	tokEscapeMeta
	tokEscapeOctal
	tokEscapeUni
	tokEscapeUniFull
	tokEscapeHex
	tokEscapeHexFull
	tokComment

	tokQ                        // \Q
	tokMinus                    // -
	tokLbracket                 // [
	tokLbracketCaret            // [^
	tokRbracket                 // ]
	tokDollar                   // $
	tokCaret                    // ^
	tokQuestion                 // ?
	tokDot                      // .
	tokPlus                     // +
	tokStar                     // *
	tokPipe                     // |
	tokLparen                   // (
	tokLparenName               // (?P<name>
	tokLparenNameAngle          // (?<name>
	tokLparenNameQuote          // (?'name'
	tokLparenFlags              // (?flags
	tokLparenAtomic             // (?>
	tokLparenPositiveLookahead  // (?=
	tokLparenPositiveLookbehind // (?<=
	tokLparenNegativeLookahead  // (?!
	tokLparenNegativeLookbehind // (?<!
	tokRparen                   // )
)

type lexer struct {
	tokens []token
	pos    int
	input  string
}

func (l *lexer) scan() {
	for l.pos < len(l.input) {
		ch := l.input[l.pos]
		if ch > unicode.MaxASCII {
			_, size := utf8.DecodeRuneInString(l.input[l.pos:])
			l.pushTok(tokChar, size)
			l.maybeInsertConcat()
			continue
		}
		switch ch {
		case '\\':
			l.scanEscape(false)
		case '.':
			l.pushTok(tokDot, 1)
		case '+':
			l.pushTok(tokPlus, 1)
		case '*':
			l.pushTok(tokStar, 1)
		case '^':
			l.pushTok(tokCaret, 1)
		case '$':
			l.pushTok(tokDollar, 1)
		case '?':
			l.pushTok(tokQuestion, 1)
		case ')':
			l.pushTok(tokRparen, 1)
		case '|':
			l.pushTok(tokPipe, 1)
		case '[':
			if l.byteAt(l.pos+1) == '^' {
				l.pushTok(tokLbracketCaret, 2)
			} else {
				l.pushTok(tokLbracket, 1)
			}
			l.scanCharClass()
		case '(':
			if l.byteAt(l.pos+1) == '?' {
				switch {
				case l.byteAt(l.pos+2) == '>':
					l.pushTok(tokLparenAtomic, len("(?>"))
				case l.byteAt(l.pos+2) == '=':
					l.pushTok(tokLparenPositiveLookahead, len("(?="))
				case l.byteAt(l.pos+2) == '!':
					l.pushTok(tokLparenNegativeLookahead, len("(?!"))
				case l.byteAt(l.pos+2) == '<' && l.byteAt(l.pos+3) == '=':
					l.pushTok(tokLparenPositiveLookbehind, len("(?<="))
				case l.byteAt(l.pos+2) == '<' && l.byteAt(l.pos+3) == '!':
					l.pushTok(tokLparenNegativeLookbehind, len("(?<!"))
				default:
					// Not a lookaround or atomic group: try named groups,
					// inline flags and comment groups before giving up.
					if !l.tryScanGroupName(l.pos + 2) {
						if !l.tryScanGroupFlags(l.pos + 2) {
							if !l.tryScanComment(l.pos + 2) {
								throwErrorf(l.pos, l.pos+1, "group token is incomplete")
							}
						}
					}
				}
			} else {
				l.pushTok(tokLparen, 1)
			}
		case '{':
			if j := l.repeatWidth(l.pos + 1); j >= 0 {
				l.pushTok(tokRepeat, len("{")+j)
			} else {
				l.pushTok(tokChar, 1)
			}
		default:
			l.pushTok(tokChar, 1)
		}
		l.maybeInsertConcat()
	}
}

func (l *lexer) scanCharClass() {
	l.maybeInsertConcat()

	// We need to handle the first `]` in a special way. See #3.
	if l.byteAt(l.pos) == ']' {
		l.pushTok(tokChar, 1)
	}

	for l.pos < len(l.input) {
		ch := l.input[l.pos]
		if ch > unicode.MaxASCII {
			_, size := utf8.DecodeRuneInString(l.input[l.pos:])
			l.pushTok(tokChar, size)
			continue
		}
		switch ch {
		case '\\':
			l.scanEscape(true)
		case '[':
			isPosixClass := false
			if l.byteAt(l.pos+1) == ':' {
				j := l.stringIndex(l.pos+2, ":]")
				if j >= 0 {
					isPosixClass = true
					l.pushTok(tokPosixClass, j+len("[::]"))
				}
			}
			if !isPosixClass {
				l.pushTok(tokChar, 1)
			}
		case '-':
			l.pushTok(tokMinus, 1)
		case ']':
			l.pushTok(tokRbracket, 1)
			return // Stop scanning in the char class context
		default:
			l.pushTok(tokChar, 1)
		}
	}
}

// scanEscape scans a `\...` sequence; insideCharClass selects which
// characters are treated as metacharacters.
func (l *lexer) scanEscape(insideCharClass bool) {
	s := l.input
	if l.pos+1 >= len(s) {
		throwErrorf(l.pos, l.pos+1, `unexpected end of pattern: trailing '\'`)
	}
	switch {
	case s[l.pos+1] == 'p' || s[l.pos+1] == 'P':
		if l.pos+2 >= len(s) {
			throwErrorf(l.pos, l.pos+2, "unexpected end of pattern: expected uni-class-short or '{'")
		}
		if s[l.pos+2] == '{' {
			j := strings.IndexByte(s[l.pos+2:], '}')
			if j < 0 {
				throwErrorf(l.pos, l.pos+2, "can't find closing '}'")
			}
			l.pushTok(tokEscapeUniFull, len(`\p{`)+j)
		} else {
			l.pushTok(tokEscapeUni, len(`\pL`))
		}
	case s[l.pos+1] == 'x':
		if l.pos+2 >= len(s) {
			throwErrorf(l.pos, l.pos+2, "unexpected end of pattern: expected hex-digit or '{'")
		}
		if s[l.pos+2] == '{' {
			j := strings.IndexByte(s[l.pos+2:], '}')
			if j < 0 {
				throwErrorf(l.pos, l.pos+2, "can't find closing '}'")
			}
			l.pushTok(tokEscapeHexFull, len(`\x{`)+j)
		} else {
			if isHexDigit(l.byteAt(l.pos + 3)) {
				l.pushTok(tokEscapeHex, len(`\xFF`))
			} else {
				l.pushTok(tokEscapeHex, len(`\xF`))
			}
		}
	case isOctalDigit(s[l.pos+1]):
		digits := 1
		if isOctalDigit(l.byteAt(l.pos + 2)) {
			if isOctalDigit(l.byteAt(l.pos + 3)) {
				digits = 3
			} else {
				digits = 2
			}
		}
		l.pushTok(tokEscapeOctal, len(`\`)+digits)
	case s[l.pos+1] == 'Q':
		size := len(s) - l.pos // Until the pattern ends
		j := l.stringIndex(l.pos+2, `\E`)
		if j >= 0 {
			size = j + len(`\Q\E`)
		}
		l.pushTok(tokQ, size)
	default:
		ch := l.byteAt(l.pos + 1)
		if ch > unicode.MaxASCII {
			_, size := utf8.DecodeRuneInString(l.input[l.pos+1:])
			l.pushTok(tokEscapeChar, len(`\`)+size)
			return
		}
		kind := tokEscapeChar
		if insideCharClass {
			if charClassMetachar[ch] {
				kind = tokEscapeMeta
			}
		} else {
			if reMetachar[ch] {
				kind = tokEscapeMeta
			}
		}
		l.pushTok(kind, 2)
	}
}

// maybeInsertConcat inserts an explicit concatenation token between the two
// most recently pushed tokens when they form a concatenation position.
func (l *lexer) maybeInsertConcat() {
	if l.isConcatPos() {
		last := len(l.tokens) - 1
		tok := l.tokens[last]
		l.tokens[last].kind = tokConcat
		l.tokens = append(l.tokens, tok)
	}
}

// Init resets the lexer state and scans the pattern s into tokens.
func (l *lexer) Init(s string) {
	l.pos = 0
	l.tokens = l.tokens[:0]
	l.input = s
	l.scan()
	l.pos = 0
}

func (l *lexer) tryScanGroupName(pos int) bool {
	tok := tokLparenName
	endCh := byte('>')
	offset := 1
	switch l.byteAt(pos) {
	case '\'':
		endCh = '\''
		tok = tokLparenNameQuote
	case '<':
		tok = tokLparenNameAngle
	case 'P':
		offset = 2
	default:
		return false
	}
	if pos+offset >= len(l.input) {
		return false
	}
	end := strings.IndexByte(l.input[pos+offset:], endCh)
	if end < 0 {
		return false
	}
	l.pushTok(tok, len("(?")+offset+end+1)
	return true
}

func (l *lexer) tryScanGroupFlags(pos int) bool {
	colonPos := strings.IndexByte(l.input[pos:], ':')
	parenPos := strings.IndexByte(l.input[pos:], ')')
	if parenPos < 0 {
		return false
	}
	end := parenPos
	if colonPos >= 0 && colonPos < parenPos {
		end = colonPos + len(":")
	}
	l.pushTok(tokLparenFlags, len("(?")+end)
	return true
}

func (l *lexer) tryScanComment(pos int) bool {
	if l.byteAt(pos) != '#' {
		return false
	}
	parenPos := strings.IndexByte(l.input[pos:], ')')
	if parenPos < 0 {
		return false
	}
len("(?")+parenPos+len(")")) return true } func (l *lexer) repeatWidth(pos int) int { j := pos for isDigit(l.byteAt(j)) { j++ } if j == pos { return -1 } if l.byteAt(j) == '}' { return (j + len("}")) - pos // {min} } if l.byteAt(j) != ',' { return -1 } j += len(",") for isDigit(l.byteAt(j)) { j++ } if l.byteAt(j) == '}' { return (j + len("}")) - pos // {min,} or {min,max} } return -1 } func (l *lexer) stringIndex(offset int, s string) int { if offset < len(l.input) { return strings.Index(l.input[offset:], s) } return -1 } func (l *lexer) byteAt(pos int) byte { if pos >= 0 && pos < len(l.input) { return l.input[pos] } return 0 } func (l *lexer) pushTok(kind tokenKind, size int) { l.tokens = append(l.tokens, token{ kind: kind, pos: Position{Begin: uint16(l.pos), End: uint16(l.pos + size)}, }) l.pos += size } func (l *lexer) isConcatPos() bool { if len(l.tokens) < 2 { return false } x := l.tokens[len(l.tokens)-2].kind if concatTable[x]&concatX != 0 { return false } y := l.tokens[len(l.tokens)-1].kind return concatTable[y]&concatY == 0 } const ( concatX byte = 1 << iota concatY ) var concatTable = [256]byte{ tokPipe: concatX | concatY, tokLparen: concatX, tokLparenFlags: concatX, tokLparenName: concatX, tokLparenNameAngle: concatX, tokLparenNameQuote: concatX, tokLparenAtomic: concatX, tokLbracket: concatX, tokLbracketCaret: concatX, tokLparenPositiveLookahead: concatX, tokLparenPositiveLookbehind: concatX, tokLparenNegativeLookahead: concatX, tokLparenNegativeLookbehind: concatX, tokRparen: concatY, tokRbracket: concatY, tokPlus: concatY, tokStar: concatY, tokQuestion: concatY, tokRepeat: concatY, }