aboutsummaryrefslogtreecommitdiffstats
path: root/vendor/github.com/quasilyte/regex/syntax/parser.go
diff options
context:
space:
mode:
authorDmitry Vyukov <dvyukov@google.com>2020-09-15 18:05:35 +0200
committerDmitry Vyukov <dvyukov@google.com>2020-09-15 19:34:30 +0200
commit712de1c63d9db97c81af68cd0dc4372c53d2e57a (patch)
treeae1761fec52c3ae4ddd003a4130ddbda8d0a2d69 /vendor/github.com/quasilyte/regex/syntax/parser.go
parent298a69c38dd5c8a9bbd7a022e88f4ddbcf885e16 (diff)
vendor/github.com/golangci/golangci-lint: update to v1.31
Diffstat (limited to 'vendor/github.com/quasilyte/regex/syntax/parser.go')
-rw-r--r--vendor/github.com/quasilyte/regex/syntax/parser.go503
1 files changed, 503 insertions, 0 deletions
diff --git a/vendor/github.com/quasilyte/regex/syntax/parser.go b/vendor/github.com/quasilyte/regex/syntax/parser.go
new file mode 100644
index 000000000..c540ac593
--- /dev/null
+++ b/vendor/github.com/quasilyte/regex/syntax/parser.go
@@ -0,0 +1,503 @@
+package syntax
+
+import (
+ "errors"
+ "strings"
+)
+
+type ParserOptions struct {
+ // NoLiterals disables OpChar merging into OpLiteral.
+ NoLiterals bool
+}
+
+func NewParser(opts *ParserOptions) *Parser {
+ return newParser(opts)
+}
+
+type Parser struct {
+ out Regexp
+ lexer lexer
+ exprPool []Expr
+
+ prefixParselets [256]prefixParselet
+ infixParselets [256]infixParselet
+
+ charClass []Expr
+ allocated uint
+
+ opts ParserOptions
+}
+
+// ParsePCRE parses PHP-style pattern with delimiters.
+// An example of such pattern is `/foo/i`.
+func (p *Parser) ParsePCRE(pattern string) (*RegexpPCRE, error) {
+ pcre, err := p.newPCRE(pattern)
+ if err != nil {
+ return nil, err
+ }
+ if pcre.HasModifier('x') {
+ return nil, errors.New("'x' modifier is not supported")
+ }
+ re, err := p.Parse(pcre.Pattern)
+ if re != nil {
+ pcre.Expr = re.Expr
+ }
+ return pcre, err
+}
+
+func (p *Parser) Parse(pattern string) (result *Regexp, err error) {
+ defer func() {
+ r := recover()
+ if r == nil {
+ return
+ }
+ if err2, ok := r.(ParseError); ok {
+ err = err2
+ return
+ }
+ panic(r)
+ }()
+
+ p.lexer.Init(pattern)
+ p.allocated = 0
+ p.out.Pattern = pattern
+ if pattern == "" {
+ p.out.Expr = *p.newExpr(OpConcat, Position{})
+ } else {
+ p.out.Expr = *p.parseExpr(0)
+ }
+
+ if !p.opts.NoLiterals {
+ p.mergeChars(&p.out.Expr)
+ }
+ p.setValues(&p.out.Expr)
+
+ return &p.out, nil
+}
+
+type prefixParselet func(token) *Expr
+
+type infixParselet func(*Expr, token) *Expr
+
+func newParser(opts *ParserOptions) *Parser {
+ var p Parser
+
+ if opts != nil {
+ p.opts = *opts
+ }
+ p.exprPool = make([]Expr, 256)
+
+ for tok, op := range tok2op {
+ if op != 0 {
+ p.prefixParselets[tokenKind(tok)] = p.parsePrefixElementary
+ }
+ }
+
+ p.prefixParselets[tokQ] = func(tok token) *Expr {
+ litPos := tok.pos
+ litPos.Begin += uint16(len(`\Q`))
+ form := FormQuoteUnclosed
+ if strings.HasSuffix(p.tokenValue(tok), `\E`) {
+ litPos.End -= uint16(len(`\E`))
+ form = FormDefault
+ }
+ lit := p.newExpr(OpString, litPos)
+ return p.newExprForm(OpQuote, form, tok.pos, lit)
+ }
+
+ p.prefixParselets[tokEscapeHexFull] = func(tok token) *Expr {
+ litPos := tok.pos
+ litPos.Begin += uint16(len(`\x{`))
+ litPos.End -= uint16(len(`}`))
+ lit := p.newExpr(OpString, litPos)
+ return p.newExprForm(OpEscapeHex, FormEscapeHexFull, tok.pos, lit)
+ }
+ p.prefixParselets[tokEscapeUniFull] = func(tok token) *Expr {
+ litPos := tok.pos
+ litPos.Begin += uint16(len(`\p{`))
+ litPos.End -= uint16(len(`}`))
+ lit := p.newExpr(OpString, litPos)
+ return p.newExprForm(OpEscapeUni, FormEscapeUniFull, tok.pos, lit)
+ }
+
+ p.prefixParselets[tokEscapeHex] = func(tok token) *Expr { return p.parseEscape(OpEscapeHex, `\x`, tok) }
+ p.prefixParselets[tokEscapeOctal] = func(tok token) *Expr { return p.parseEscape(OpEscapeOctal, `\`, tok) }
+ p.prefixParselets[tokEscapeChar] = func(tok token) *Expr { return p.parseEscape(OpEscapeChar, `\`, tok) }
+ p.prefixParselets[tokEscapeMeta] = func(tok token) *Expr { return p.parseEscape(OpEscapeMeta, `\`, tok) }
+ p.prefixParselets[tokEscapeUni] = func(tok token) *Expr { return p.parseEscape(OpEscapeUni, `\p`, tok) }
+
+ p.prefixParselets[tokLparen] = func(tok token) *Expr { return p.parseGroup(OpCapture, tok) }
+ p.prefixParselets[tokLparenAtomic] = func(tok token) *Expr { return p.parseGroup(OpAtomicGroup, tok) }
+ p.prefixParselets[tokLparenPositiveLookahead] = func(tok token) *Expr { return p.parseGroup(OpPositiveLookahead, tok) }
+ p.prefixParselets[tokLparenNegativeLookahead] = func(tok token) *Expr { return p.parseGroup(OpNegativeLookahead, tok) }
+ p.prefixParselets[tokLparenPositiveLookbehind] = func(tok token) *Expr { return p.parseGroup(OpPositiveLookbehind, tok) }
+ p.prefixParselets[tokLparenNegativeLookbehind] = func(tok token) *Expr { return p.parseGroup(OpNegativeLookbehind, tok) }
+
+ p.prefixParselets[tokLparenName] = func(tok token) *Expr {
+ return p.parseNamedCapture(FormDefault, tok)
+ }
+ p.prefixParselets[tokLparenNameAngle] = func(tok token) *Expr {
+ return p.parseNamedCapture(FormNamedCaptureAngle, tok)
+ }
+ p.prefixParselets[tokLparenNameQuote] = func(tok token) *Expr {
+ return p.parseNamedCapture(FormNamedCaptureQuote, tok)
+ }
+
+ p.prefixParselets[tokLparenFlags] = p.parseGroupWithFlags
+
+ p.prefixParselets[tokPipe] = func(tok token) *Expr {
+ // We need prefix pipe parselet to handle `(|x)` syntax.
+ right := p.parseExpr(1)
+ return p.newExpr(OpAlt, tok.pos, p.newEmpty(tok.pos), right)
+ }
+ p.prefixParselets[tokLbracket] = func(tok token) *Expr {
+ return p.parseCharClass(OpCharClass, tok)
+ }
+ p.prefixParselets[tokLbracketCaret] = func(tok token) *Expr {
+ return p.parseCharClass(OpNegCharClass, tok)
+ }
+
+ p.infixParselets[tokRepeat] = func(left *Expr, tok token) *Expr {
+ repeatLit := p.newExpr(OpString, tok.pos)
+ return p.newExpr(OpRepeat, combinePos(left.Pos, tok.pos), left, repeatLit)
+ }
+ p.infixParselets[tokStar] = func(left *Expr, tok token) *Expr {
+ return p.newExpr(OpStar, combinePos(left.Pos, tok.pos), left)
+ }
+ p.infixParselets[tokConcat] = func(left *Expr, tok token) *Expr {
+ right := p.parseExpr(2)
+ if left.Op == OpConcat {
+ left.Args = append(left.Args, *right)
+ left.Pos.End = right.End()
+ return left
+ }
+ return p.newExpr(OpConcat, combinePos(left.Pos, right.Pos), left, right)
+ }
+ p.infixParselets[tokPipe] = p.parseAlt
+ p.infixParselets[tokMinus] = p.parseMinus
+ p.infixParselets[tokPlus] = p.parsePlus
+ p.infixParselets[tokQuestion] = p.parseQuestion
+
+ return &p
+}
+
+func (p *Parser) setValues(e *Expr) {
+ for i := range e.Args {
+ p.setValues(&e.Args[i])
+ }
+ e.Value = p.exprValue(e)
+}
+
+func (p *Parser) tokenValue(tok token) string {
+ return p.out.Pattern[tok.pos.Begin:tok.pos.End]
+}
+
+func (p *Parser) exprValue(e *Expr) string {
+ return p.out.Pattern[e.Begin():e.End()]
+}
+
+func (p *Parser) mergeChars(e *Expr) {
+ for i := range e.Args {
+ p.mergeChars(&e.Args[i])
+ }
+ if e.Op != OpConcat || len(e.Args) < 2 {
+ return
+ }
+
+ args := e.Args[:0]
+ i := 0
+ for i < len(e.Args) {
+ first := i
+ chars := 0
+ for j := i; j < len(e.Args) && e.Args[j].Op == OpChar; j++ {
+ chars++
+ }
+ if chars > 1 {
+ c1 := e.Args[first]
+ c2 := e.Args[first+chars-1]
+ lit := p.newExpr(OpLiteral, combinePos(c1.Pos, c2.Pos))
+ for j := 0; j < chars; j++ {
+ lit.Args = append(lit.Args, e.Args[first+j])
+ }
+ args = append(args, *lit)
+ i += chars
+ } else {
+ args = append(args, e.Args[i])
+ i++
+ }
+ }
+ if len(args) == 1 {
+ *e = args[0] // Turn OpConcat into OpLiteral
+ } else {
+ e.Args = args
+ }
+}
+
+func (p *Parser) newEmpty(pos Position) *Expr {
+ return p.newExpr(OpConcat, pos)
+}
+
+func (p *Parser) newExprForm(op Operation, form Form, pos Position, args ...*Expr) *Expr {
+ e := p.newExpr(op, pos, args...)
+ e.Form = form
+ return e
+}
+
+func (p *Parser) newExpr(op Operation, pos Position, args ...*Expr) *Expr {
+ e := p.allocExpr()
+ *e = Expr{
+ Op: op,
+ Pos: pos,
+ Args: e.Args[:0],
+ }
+ for _, arg := range args {
+ e.Args = append(e.Args, *arg)
+ }
+ return e
+}
+
+func (p *Parser) allocExpr() *Expr {
+ i := p.allocated
+ if i < uint(len(p.exprPool)) {
+ p.allocated++
+ return &p.exprPool[i]
+ }
+ return &Expr{}
+}
+
+func (p *Parser) expect(kind tokenKind) Position {
+ tok := p.lexer.NextToken()
+ if tok.kind != kind {
+ throwExpectedFound(tok.pos, kind.String(), tok.kind.String())
+ }
+ return tok.pos
+}
+
+func (p *Parser) parseExpr(precedence int) *Expr {
+ tok := p.lexer.NextToken()
+ prefix := p.prefixParselets[tok.kind]
+ if prefix == nil {
+ throwUnexpectedToken(tok.pos, tok.String())
+ }
+ left := prefix(tok)
+
+ for precedence < p.precedenceOf(p.lexer.Peek()) {
+ tok := p.lexer.NextToken()
+ infix := p.infixParselets[tok.kind]
+ left = infix(left, tok)
+ }
+
+ return left
+}
+
+func (p *Parser) parsePrefixElementary(tok token) *Expr {
+ return p.newExpr(tok2op[tok.kind], tok.pos)
+}
+
+func (p *Parser) parseCharClass(op Operation, tok token) *Expr {
+ var endPos Position
+ p.charClass = p.charClass[:0]
+ for {
+ p.charClass = append(p.charClass, *p.parseExpr(0))
+ next := p.lexer.Peek()
+ if next.kind == tokRbracket {
+ endPos = next.pos
+ p.lexer.NextToken()
+ break
+ }
+ if next.kind == tokNone {
+ throw(tok.pos, "unterminated '['")
+ }
+ }
+
+ result := p.newExpr(op, combinePos(tok.pos, endPos))
+ result.Args = append(result.Args, p.charClass...)
+ return result
+}
+
+func (p *Parser) parseMinus(left *Expr, tok token) *Expr {
+ if p.isValidCharRangeOperand(left) {
+ if p.lexer.Peek().kind != tokRbracket {
+ right := p.parseExpr(2)
+ return p.newExpr(OpCharRange, combinePos(left.Pos, right.Pos), left, right)
+ }
+ }
+ p.charClass = append(p.charClass, *left)
+ return p.newExpr(OpChar, tok.pos)
+}
+
+func (p *Parser) isValidCharRangeOperand(e *Expr) bool {
+ switch e.Op {
+ case OpEscapeHex, OpEscapeOctal, OpEscapeMeta, OpChar:
+ return true
+ case OpEscapeChar:
+ switch p.exprValue(e) {
+ case `\\`, `\|`, `\*`, `\+`, `\?`, `\.`, `\[`, `\^`, `\$`, `\(`, `\)`:
+ return true
+ }
+ }
+ return false
+}
+
+func (p *Parser) parsePlus(left *Expr, tok token) *Expr {
+ op := OpPlus
+ switch left.Op {
+ case OpPlus, OpStar, OpQuestion, OpRepeat:
+ op = OpPossessive
+ }
+ return p.newExpr(op, combinePos(left.Pos, tok.pos), left)
+}
+
+func (p *Parser) parseQuestion(left *Expr, tok token) *Expr {
+ op := OpQuestion
+ switch left.Op {
+ case OpPlus, OpStar, OpQuestion, OpRepeat:
+ op = OpNonGreedy
+ }
+ return p.newExpr(op, combinePos(left.Pos, tok.pos), left)
+}
+
+func (p *Parser) parseAlt(left *Expr, tok token) *Expr {
+ var right *Expr
+ switch p.lexer.Peek().kind {
+ case tokRparen, tokNone:
+ // This is needed to handle `(x|)` syntax.
+ right = p.newEmpty(tok.pos)
+ default:
+ right = p.parseExpr(1)
+ }
+ if left.Op == OpAlt {
+ left.Args = append(left.Args, *right)
+ left.Pos.End = right.End()
+ return left
+ }
+ return p.newExpr(OpAlt, combinePos(left.Pos, right.Pos), left, right)
+}
+
+func (p *Parser) parseGroupItem(tok token) *Expr {
+ if p.lexer.Peek().kind == tokRparen {
+ // This is needed to handle `() syntax.`
+ return p.newEmpty(tok.pos)
+ }
+ return p.parseExpr(0)
+}
+
+func (p *Parser) parseGroup(op Operation, tok token) *Expr {
+ x := p.parseGroupItem(tok)
+ result := p.newExpr(op, tok.pos, x)
+ result.Pos.End = p.expect(tokRparen).End
+ return result
+}
+
+func (p *Parser) parseNamedCapture(form Form, tok token) *Expr {
+ prefixLen := len("(?<")
+ if form == FormDefault {
+ prefixLen = len("(?P<")
+ }
+ name := p.newExpr(OpString, Position{
+ Begin: tok.pos.Begin + uint16(prefixLen),
+ End: tok.pos.End - uint16(len(">")),
+ })
+ x := p.parseGroupItem(tok)
+ result := p.newExprForm(OpNamedCapture, form, tok.pos, x, name)
+ result.Pos.End = p.expect(tokRparen).End
+ return result
+}
+
+func (p *Parser) parseGroupWithFlags(tok token) *Expr {
+ var result *Expr
+ val := p.out.Pattern[tok.pos.Begin+1 : tok.pos.End]
+ switch {
+ case !strings.HasSuffix(val, ":"):
+ flags := p.newExpr(OpString, Position{
+ Begin: tok.pos.Begin + uint16(len("(?")),
+ End: tok.pos.End,
+ })
+ result = p.newExpr(OpFlagOnlyGroup, tok.pos, flags)
+ case val == "?:":
+ x := p.parseGroupItem(tok)
+ result = p.newExpr(OpGroup, tok.pos, x)
+ default:
+ flags := p.newExpr(OpString, Position{
+ Begin: tok.pos.Begin + uint16(len("(?")),
+ End: tok.pos.End - uint16(len(":")),
+ })
+ x := p.parseGroupItem(tok)
+ result = p.newExpr(OpGroupWithFlags, tok.pos, x, flags)
+ }
+ result.Pos.End = p.expect(tokRparen).End
+ return result
+}
+
+func (p *Parser) parseEscape(op Operation, prefix string, tok token) *Expr {
+ litPos := tok.pos
+ litPos.Begin += uint16(len(prefix))
+ lit := p.newExpr(OpString, litPos)
+ return p.newExpr(op, tok.pos, lit)
+}
+
+func (p *Parser) precedenceOf(tok token) int {
+ switch tok.kind {
+ case tokPipe:
+ return 1
+ case tokConcat, tokMinus:
+ return 2
+ case tokPlus, tokStar, tokQuestion, tokRepeat:
+ return 3
+ default:
+ return 0
+ }
+}
+
+func (p *Parser) newPCRE(source string) (*RegexpPCRE, error) {
+ if source == "" {
+ return nil, errors.New("empty pattern: can't find delimiters")
+ }
+
+ delim := source[0]
+ endDelim := delim
+ switch delim {
+ case '(':
+ endDelim = ')'
+ case '{':
+ endDelim = '}'
+ case '[':
+ endDelim = ']'
+ case '<':
+ endDelim = '>'
+ case '\\':
+ return nil, errors.New("'\\' is not a valid delimiter")
+ default:
+ if isSpace(delim) {
+ return nil, errors.New("whitespace is not a valid delimiter")
+ }
+ if isAlphanumeric(delim) {
+ return nil, errors.New("'" + string(delim) + "' is not a valid delimiter")
+ }
+ }
+
+ const delimLen = 1
+ j := strings.LastIndexByte(source[delimLen:], endDelim)
+ if j == -1 {
+ return nil, errors.New("can't find '" + string(endDelim) + "' ending delimiter")
+ }
+ j += delimLen
+
+ pcre := &RegexpPCRE{
+ Pattern: source[1:j],
+ Source: source,
+ Delim: [2]byte{delim, endDelim},
+ Modifiers: source[j+1:],
+ }
+ return pcre, nil
+}
+
+var tok2op = [256]Operation{
+ tokDollar: OpDollar,
+ tokCaret: OpCaret,
+ tokDot: OpDot,
+ tokChar: OpChar,
+ tokMinus: OpChar,
+ tokPosixClass: OpPosixClass,
+ tokComment: OpComment,
+}