Skip to content

Commit

Permalink
Introduce simple regex-like matcher (#2433)
Browse files Browse the repository at this point in the history
Provide match.Matcher and match.ExactMatcher, which use regular expressions
for the matching use case only.

The matchers compile a regular expression into a Matcher, which only provides
the Match functionality. This gives us a chance to optimize/replace some common
cases used for matching:
- replace capture-groups by non-capturing groups
- remove leading/trailing `.*` expressions (Match already searches for
  sub-string matching the regex)
- replace simple literal searches with `==`, `strings.Contains` and
  `strings.HasPrefix`
- replace regex for alternative literals (e.g. `DEBUG|INFO|ERROR`) with
  strings.Contains over set of literals
- optimized empty-lines checks

If the input regular expression cannot be reduced to one of these simple cases,
regexp.Regexp will be used.

The `ExactMatcher` will embed `<regex>` into `^<regex>$` by default.

Note: Matcher currently does not split simple cases, e.g. `abc.*def` or
`abc.def` will still fall back to regexp.Regexp.
  • Loading branch information
Steffen Siering authored and andrewkroh committed Jan 20, 2017
1 parent ac7f7c9 commit 0a8ca7d
Show file tree
Hide file tree
Showing 7 changed files with 1,411 additions and 0 deletions.
257 changes: 257 additions & 0 deletions libbeat/common/match/cmp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
package match

import "regexp/syntax"

// common predefined patterns
//
// Each pattern is parsed by mustParse when the package is initialized, so an
// invalid pattern here panics at start-up. The parsed trees serve as
// prototypes for structural comparison via eqRegex.
var (
// `.*`, unanchored and anchored at one end.
// NOTE(review): patNullBeginDotStar and patNullEndDotStar are not referenced
// in this part of the file; presumably they are used elsewhere in the package.
patDotStar = mustParse(`.*`)
patNullBeginDotStar = mustParse(`^.*`)
patNullEndDotStar = mustParse(`.*$`)

// empty input, and input made up of whitespace only
patEmptyText = mustParse(`^$`)
patEmptyWhiteText = mustParse(`^\s*$`)

// patterns matching any content
patAny1 = patDotStar
patAny2 = mustParse(`^.*`)
patAny3 = mustParse(`^.*$`)
patAny4 = mustParse(`.*$`)

// bare text anchors
// NOTE(review): not referenced in this part of the file; presumably used
// elsewhere in the package.
patBeginText = mustParse(`^`)
patEndText = mustParse(`$`)

// a single digit class; isMultiDigits compares the repeated term against it
patDigits = mustParse(`\d`)
)

// isPrefixLiteral reports whether r is a begin-of-text anchor followed by
// exactly one literal (like '^PATTERN').
//
// Only the Op of each term is inspected; flags on the literal are not
// considered.
func isPrefixLiteral(r *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat || len(r.Sub) != 2 {
		return false
	}

	anchor, lit := r.Sub[0], r.Sub[1]
	return anchor.Op == syntax.OpBeginText && lit.Op == syntax.OpLiteral
}

// isAltLiterals reports whether r is an alternation whose branches are all
// literals (like 'DEBUG|INFO|ERROR').
//
// Only the Op of each branch is inspected.
func isAltLiterals(r *syntax.Regexp) bool {
	allLiterals := r.Op == syntax.OpAlternate
	for i := 0; allLiterals && i < len(r.Sub); i++ {
		allLiterals = r.Sub[i].Op == syntax.OpLiteral
	}
	return allLiterals
}

// isExactLiteral reports whether r is a single literal enclosed by
// begin-of-text and end-of-text anchors (like '^PATTERN$').
func isExactLiteral(r *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat || len(r.Sub) != 3 {
		return false
	}

	begin, body, end := r.Sub[0], r.Sub[1], r.Sub[2]
	return begin.Op == syntax.OpBeginText &&
		body.Op == syntax.OpLiteral &&
		end.Op == syntax.OpEndText
}

// isOneOfLiterals reports whether r is an alternation of literals enclosed by
// begin-of-text and end-of-text anchors (like '^(?:A|B|C)$').
func isOneOfLiterals(r *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat || len(r.Sub) != 3 {
		return false
	}
	if r.Sub[0].Op != syntax.OpBeginText {
		return false
	}
	if !isAltLiterals(r.Sub[1]) {
		return false
	}
	return r.Sub[2].Op == syntax.OpEndText
}

// isPrefixAltLiterals reports whether r is a begin-of-text anchor followed by
// an alternation whose branches are all literals (like '^(?:A|B|C)').
func isPrefixAltLiterals(r *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat || len(r.Sub) != 2 {
		return false
	}

	anchor, alt := r.Sub[0], r.Sub[1]
	if anchor.Op != syntax.OpBeginText || alt.Op != syntax.OpAlternate {
		return false
	}

	for i := range alt.Sub {
		if alt.Sub[i].Op != syntax.OpLiteral {
			return false
		}
	}
	return true
}

// isPrefixNumDate reports whether r has the shape
//
//	^ [literal] digits (literal digits)*
//
// where "digits" is a term accepted by isMultiDigits and every "literal" is
// an OpLiteral term. The optional literal directly after the anchor is a
// fixed prefix; the literals between digit groups are separators.
//
// Every index into r.Sub is bounds-checked, so shapes that end early (for
// example an anchor followed only by a literal, or a trailing separator with
// no digits after it) yield false.
func isPrefixNumDate(r *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat || len(r.Sub) < 2 || r.Sub[0].Op != syntax.OpBeginText {
		return false
	}

	// optional literal prefix
	i := 1
	if r.Sub[i].Op == syntax.OpLiteral {
		i++
	}

	// the first digit group is mandatory
	if i >= len(r.Sub) || !isMultiDigits(r.Sub[i]) {
		return false
	}
	i++

	for i < len(r.Sub) {
		// check separator
		if r.Sub[i].Op != syntax.OpLiteral {
			return false
		}
		i++

		// a separator must be followed by another digit group
		if i >= len(r.Sub) || !isMultiDigits(r.Sub[i]) {
			return false
		}
		i++
	}

	return true
}

// isdotStar reports whether r is structurally equal to the `.*` prototype
// (patDotStar).
func isdotStar(r *syntax.Regexp) bool {
	if eqRegex(r, patDotStar) {
		return true
	}
	return false
}

// isEmptyText reports whether r is structurally equal to the `^$` prototype
// (patEmptyText).
func isEmptyText(r *syntax.Regexp) bool {
	if eqRegex(r, patEmptyText) {
		return true
	}
	return false
}

// isEmptyTextWithWhitespace reports whether r is structurally equal to the
// `^\s*$` prototype (patEmptyWhiteText).
func isEmptyTextWithWhitespace(r *syntax.Regexp) bool {
	if eqRegex(r, patEmptyWhiteText) {
		return true
	}
	return false
}

func isAnyMatch(r *syntax.Regexp) bool {
return eqRegex(r, patAny1) ||
eqRegex(r, patAny2) ||
eqRegex(r, patAny3) ||
eqRegex(r, patAny4)
}

// isDigitMatch reports whether r is structurally equal to the `\d` prototype
// (patDigits).
func isDigitMatch(r *syntax.Regexp) bool {
	if eqRegex(r, patDigits) {
		return true
	}
	return false
}

// isMultiDigits reports whether r is a concatenation repeating one and the
// same term (see isConcatRepetition) and that term is a digit match.
func isMultiDigits(r *syntax.Regexp) bool {
	if !isConcatRepetition(r) {
		return false
	}
	return isDigitMatch(r.Sub[0])
}

// isConcatRepetition reports whether r is a concatenation in which every
// element is the very same *syntax.Regexp value.
//
// Elements are compared by pointer, not by structure: concat repetitions
// reuse references to one sub-expression. A concatenation without any
// sub-expressions is not a repetition and yields false.
func isConcatRepetition(r *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat || len(r.Sub) == 0 {
		return false
	}

	first := r.Sub[0]
	for _, other := range r.Sub[1:] {
		if other != first { // concat repetitions reuse references => compare pointers
			return false
		}
	}

	return true
}

// eqRegex reports whether r and proto are structurally equal: same Op, Flags,
// Min and Max, pairwise equal sub-expressions (compared recursively) and
// identical runes.
func eqRegex(r, proto *syntax.Regexp) bool {
	switch {
	case r.Op != proto.Op, r.Flags != proto.Flags:
		return false
	case r.Min != proto.Min, r.Max != proto.Max:
		return false
	case len(r.Sub) != len(proto.Sub), len(r.Rune) != len(proto.Rune):
		return false
	}

	for i, sub := range r.Sub {
		if !eqRegex(sub, proto.Sub[i]) {
			return false
		}
	}

	for i, c := range r.Rune {
		if c != proto.Rune[i] {
			return false
		}
	}
	return true
}

// eqPrefixAnyRegex reports whether r starts with at least one of protos, as
// decided by eqPrefixRegex. It yields false when protos is empty.
func eqPrefixAnyRegex(r *syntax.Regexp, protos ...*syntax.Regexp) bool {
	for i := 0; i < len(protos); i++ {
		if eqPrefixRegex(r, protos[i]) {
			return true
		}
	}
	return false
}

// eqPrefixRegex reports whether the concatenation r begins with proto.
//
// If proto is not a concatenation itself, it is compared against the first
// element of r. Otherwise the leading elements of r are compared pairwise
// against all elements of proto. Comparison uses eqRegex. r not being a
// concatenation always yields false.
func eqPrefixRegex(r, proto *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat {
		return false
	}

	if proto.Op != syntax.OpConcat {
		return len(r.Sub) > 0 && eqRegex(r.Sub[0], proto)
	}

	want := proto.Sub
	if len(want) > len(r.Sub) {
		return false
	}

	for i, sub := range want {
		if !eqRegex(r.Sub[i], sub) {
			return false
		}
	}
	return true
}

// eqSuffixAnyRegex reports whether r ends with at least one of protos, as
// decided by eqSuffixRegex. It yields false when protos is empty.
func eqSuffixAnyRegex(r *syntax.Regexp, protos ...*syntax.Regexp) bool {
	for i := 0; i < len(protos); i++ {
		if eqSuffixRegex(r, protos[i]) {
			return true
		}
	}
	return false
}

// eqSuffixRegex reports whether the concatenation r ends with proto.
//
// If proto is not a concatenation itself, it is compared against the last
// element of r. Otherwise the trailing elements of r are compared pairwise
// against all elements of proto. Comparison uses eqRegex. r not being a
// concatenation always yields false.
func eqSuffixRegex(r, proto *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat {
		return false
	}

	if proto.Op != syntax.OpConcat {
		if len(r.Sub) == 0 {
			return false
		}
		last := r.Sub[len(r.Sub)-1]
		return eqRegex(last, proto)
	}

	want := proto.Sub
	if len(want) > len(r.Sub) {
		return false
	}

	// align proto with the trailing elements of r
	tail := r.Sub[len(r.Sub)-len(want):]
	for i, sub := range want {
		if !eqRegex(tail[i], sub) {
			return false
		}
	}
	return true
}

// mustParse parses pattern with Perl syntax flags and returns the resulting
// syntax tree. It panics with the parse error if pattern is invalid.
func mustParse(pattern string) *syntax.Regexp {
	parsed, err := syntax.Parse(pattern, syntax.Perl)
	if err == nil {
		return parsed
	}
	panic(err)
}
111 changes: 111 additions & 0 deletions libbeat/common/match/compile.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package match

import (
"regexp"
"regexp/syntax"
)

// compile turns the parsed regular expression r into a stringMatcher.
//
// A number of simple shapes are recognized and mapped onto specialized
// matchers (substring, equality, prefix, sets of literals, numeric date
// prefixes, empty text, match-anything). Everything else is compiled with
// regexp.Compile, whose error is returned unchanged.
//
// The specialized literal matchers are built from the raw runes of the
// literal, so they can not express case-insensitive matching. Any expression
// containing a literal flagged with syntax.FoldCase (e.g. `(?i)abc`) is
// therefore always handed to regexp.Compile.
func compile(r *syntax.Regexp) (stringMatcher, error) {
	switch {
	case containsFoldCaseLiteral(r):
		// no fast path available; handled by regexp below

	case r.Op == syntax.OpLiteral:
		s := string(r.Rune)
		return &substringMatcher{s, []byte(s)}, nil

	case isExactLiteral(r):
		s := string(r.Sub[1].Rune)
		return &equalsMatcher{s, []byte(s)}, nil

	case isAltLiterals(r):
		var literals [][]byte
		for _, sub := range r.Sub {
			literals = append(literals, []byte(string(sub.Rune)))
		}
		return &altSubstringMatcher{literals}, nil

	case isOneOfLiterals(r):
		var literals [][]byte
		for _, sub := range r.Sub[1].Sub {
			literals = append(literals, []byte(string(sub.Rune)))
		}
		return &oneOfMatcher{literals}, nil

	case isPrefixLiteral(r):
		s := []byte(string(r.Sub[1].Rune))
		return &prefixMatcher{s}, nil

	case isPrefixAltLiterals(r):
		var literals [][]byte
		for _, sub := range r.Sub[1].Sub {
			literals = append(literals, []byte(string(sub.Rune)))
		}
		return &altPrefixMatcher{literals}, nil

	case isPrefixNumDate(r):
		return compilePrefixNumDate(r)

	// The following matchers are returned as nil pointers of their type.
	// NOTE(review): presumably their methods never dereference the receiver -
	// confirm against the matcher definitions.
	case isEmptyText(r):
		var m *emptyStringMatcher
		return m, nil

	case isEmptyTextWithWhitespace(r):
		var m *emptyWhiteStringMatcher
		return m, nil

	case isAnyMatch(r):
		var m *matchAny
		return m, nil
	}

	// generic fallback
	re, err := regexp.Compile(r.String())
	if err != nil {
		// return an untyped nil so callers do not see a non-nil interface
		return nil, err
	}
	return re, nil
}

// containsFoldCaseLiteral reports whether r or any of its sub-expressions is
// a literal flagged as case-insensitive (syntax.FoldCase).
func containsFoldCaseLiteral(r *syntax.Regexp) bool {
	if r.Op == syntax.OpLiteral && r.Flags&syntax.FoldCase != 0 {
		return true
	}
	for _, sub := range r.Sub {
		if containsFoldCaseLiteral(sub) {
			return true
		}
	}
	return false
}

// compilePrefixNumDate builds a prefixNumDate matcher from r.
//
// r is walked starting at index 1: an optional literal becomes the matcher's
// prefix, followed by a digit group and then alternating separator literals
// and digit groups. For each digit group the number of digits is recorded,
// for each separator its bytes. minLen is the sum of the prefix length, all
// digit counts and all separator lengths.
//
// The function does not validate the shape of r itself.
// NOTE(review): it presumably relies on isPrefixNumDate having accepted r
// beforehand, as compile does - confirm for any other caller.
func compilePrefixNumDate(r *syntax.Regexp) (stringMatcher, error) {
	matcher := &prefixNumDate{}

	pos := 1
	if first := r.Sub[pos]; first.Op == syntax.OpLiteral {
		matcher.prefix = []byte(string(first.Rune))
		pos++
	}

	// number of digits represented by one digit term
	countDigits := func(term *syntax.Regexp) int {
		if term.Op != syntax.OpConcat {
			return 1
		}
		return len(term.Sub)
	}

	var (
		digits []int
		seps   [][]byte
	)
	total := len(matcher.prefix)

	n := countDigits(r.Sub[pos])
	digits = append(digits, n)
	total += n
	pos++

	for pos < len(r.Sub) {
		sep := []byte(string(r.Sub[pos].Rune))
		seps = append(seps, sep)
		total += len(sep)
		pos++

		n := countDigits(r.Sub[pos])
		digits = append(digits, n)
		total += n
		pos++
	}

	matcher.digits = digits
	matcher.seps = seps
	matcher.minLen = total

	return matcher, nil
}
Loading

0 comments on commit 0a8ca7d

Please sign in to comment.