From c12ebacd37a6c9160c0419320bc6bf1c114c32c4 Mon Sep 17 00:00:00 2001 From: Christopher LaPointe Date: Tue, 31 Dec 2024 16:03:25 -0500 Subject: [PATCH 01/14] First dissect implementation. 5-10x faster than re2. 20% faster than pcre --- pkg/dissect/dissect.go | 173 ++++++++++++++++++++++++++++++++++++ pkg/dissect/dissect_test.go | 92 +++++++++++++++++++ pkg/dissect/errors.go | 9 ++ 3 files changed, 274 insertions(+) create mode 100644 pkg/dissect/dissect.go create mode 100644 pkg/dissect/dissect_test.go create mode 100644 pkg/dissect/errors.go diff --git a/pkg/dissect/dissect.go b/pkg/dissect/dissect.go new file mode 100644 index 00000000..400d02c9 --- /dev/null +++ b/pkg/dissect/dissect.go @@ -0,0 +1,173 @@ +package dissect + +import ( + "rare/pkg/slicepool" + "strings" + "unsafe" +) + +// https://www.elastic.co/guide/en/logstash/current/plugins-filters-dissect.html + +// Because of how rare works, and the need to implement `FindSubmatchIndex` +// this is a subset of functionality +// %{key} -- Named key +// %{} or %{?key} -- Named skipped key +// %{+key} -- Append key, but MUST appear in-order, and will include delim +// Does NOT support reference keys directly + +// Like fastregex, Dissect is NOT thread-safe, and an instance should be created +// per-thread, or it should be locked. 
This is primarily because of the memory pool + +type token struct { + name, until string + skip bool +} + +type Dissect struct { + tokens []token + prefix string + + groupNames map[string]int + groupCount int +} + +type DissectInstance struct { + *Dissect + groupPool *slicepool.IntPool +} + +func New(expr string) (*Dissect, error) { + + parts := make([]token, 0) + groupNames := make(map[string]int) + var prefix string + + groupIndex := 0 + for { + start := strings.Index(expr, "%{") + if start < 0 { + if groupIndex == 0 { // no tokens in expr + prefix = expr + } + break + } + if groupIndex == 0 { + prefix = expr[:start] + } + expr = expr[start+2:] + + stop := strings.Index(expr, "}") + if stop < 0 { + return nil, ErrorUnclosedToken + } + + keyName := expr[:stop] + expr = expr[stop+1:] + + // end is the next token OR end of expr + end := strings.Index(expr, "%") + if end < 0 { + end = len(expr) + } else if end == 0 { + return nil, ErrorSequentialToken + } + keyUntil := expr[:end] + expr = expr[end:] + + skipped := false + + switch { + case len(keyName) == 0: + skipped = true + case keyName[0] == '?': + skipped = true + keyName = keyName[1:] + // TODO: Append + } + + parts = append(parts, token{ + name: keyName, + until: keyUntil, + skip: skipped, + }) + + if !skipped { + if _, ok := groupNames[keyName]; ok { + return nil, ErrorKeyConflict + } + groupIndex++ + groupNames[keyName] = groupIndex + } + } + + return &Dissect{ + groupNames: groupNames, + groupCount: groupIndex, + tokens: parts, + prefix: prefix, + }, nil +} + +func MustNew(expr string) *Dissect { + d, err := New(expr) + if err != nil { + panic(err) + } + return d +} + +func (s *Dissect) CreateInstance() *DissectInstance { + return &DissectInstance{ + s, + slicepool.NewIntPool((s.groupCount*2 + 2) * 1024), + } +} + +// returns indexes of match [first, last, key0Start, key0End, key1Start, ...] 
+// nil on no match +// replicates logic from regex +func (s *DissectInstance) FindSubmatchIndex(b []byte) []int { + str := *(*string)(unsafe.Pointer(&b)) + + start := 0 + if s.prefix != "" { + start = strings.Index(str, s.prefix) + if start < 0 { + return nil + } + start += len(s.prefix) + } + + ret := s.groupPool.Get(s.groupCount*2 + 2) + ret[0] = start - len(s.prefix) + + idx := 2 + for _, token := range s.tokens { + + endOffset := 0 + if token.until == "" { + endOffset = len(str[start:]) + } else { + endOffset = strings.Index(str[start:], token.until) + if endOffset < 0 { + return nil + } + } + + if !token.skip { + ret[idx] = start + ret[idx+1] = start + endOffset + idx += 2 + } + start = start + endOffset + len(token.until) + } + + ret[1] = start + + return ret +} + +// Map of key-names to index's in FindSubmatchIndex's return +func (s *Dissect) SubexpNameTable() map[string]int { + return s.groupNames +} diff --git a/pkg/dissect/dissect_test.go b/pkg/dissect/dissect_test.go new file mode 100644 index 00000000..7bdff13f --- /dev/null +++ b/pkg/dissect/dissect_test.go @@ -0,0 +1,92 @@ +package dissect + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestDissectBasic(t *testing.T) { + d := MustNew("%{val};%{};%{?skip} - %{val2}").CreateInstance() + + assert.Equal(t, []int{0, 17, 0, 5, 12, 17}, d.FindSubmatchIndex([]byte("Hello;a;b - there"))) + + assert.Equal(t, map[string]int{ + "val": 1, + "val2": 2, + }, d.SubexpNameTable()) +} + +func TestEmpty(t *testing.T) { + d := MustNew("").CreateInstance() + assert.Equal(t, []int{0, 0}, d.FindSubmatchIndex([]byte("hello"))) + assert.Equal(t, []int{0, 0}, d.FindSubmatchIndex([]byte(""))) +} + +func TestNoTokens(t *testing.T) { + d := MustNew("test").CreateInstance() + + assert.Nil(t, d.FindSubmatchIndex([]byte("hello there"))) + assert.Equal(t, []int{0, 4}, d.FindSubmatchIndex([]byte("test"))) + assert.Equal(t, []int{1, 5}, d.FindSubmatchIndex([]byte("atest"))) + assert.Equal(t, []int{3, 7}, 
d.FindSubmatchIndex([]byte("abctest"))) + assert.Equal(t, []int{0, 4}, d.FindSubmatchIndex([]byte("testa"))) + assert.Equal(t, []int{0, 4}, d.FindSubmatchIndex([]byte("testabc"))) + assert.Equal(t, []int{3, 7}, d.FindSubmatchIndex([]byte("abctestabc"))) +} + +func TestPrefix(t *testing.T) { + d := MustNew("mid %{val};%{val2} after").CreateInstance() + + assert.Equal(t, []int{12, 29, 16, 19, 20, 23}, d.FindSubmatchIndex([]byte("string with mid 123;456 after k"))) + assert.Nil(t, d.FindSubmatchIndex([]byte("string with mi 123;456 after k"))) + assert.Nil(t, d.FindSubmatchIndex([]byte("string with mid 123;456 boom k"))) + assert.Nil(t, d.FindSubmatchIndex([]byte(""))) +} + +func TestSuffix(t *testing.T) { + d := MustNew("%{val};%{val2} after").CreateInstance() + + assert.Equal(t, []int{0, 13, 0, 3, 4, 7}, d.FindSubmatchIndex([]byte("123;456 after k"))) + assert.Equal(t, []int{0, 17, 0, 7, 8, 11}, d.FindSubmatchIndex([]byte("hah 123;456 after k"))) + assert.Nil(t, d.FindSubmatchIndex([]byte("123;456 boom k"))) + assert.Nil(t, d.FindSubmatchIndex([]byte(""))) + + assert.Equal(t, []int{2, 13, 6, 13}, MustNew("end %{nada}").CreateInstance().FindSubmatchIndex([]byte("a end nothing"))) +} + +func TestNoPrefixSuffix(t *testing.T) { + d := MustNew("%{onlymatch}").CreateInstance() + assert.Equal(t, []int{0, 5, 0, 5}, d.FindSubmatchIndex([]byte("a b c"))) +} + +func TestErrorCompile(t *testing.T) { + // Unclosed + _, err := New("unclosed %{") + assert.ErrorIs(t, err, ErrorUnclosedToken) + + // Dupe key + _, err = New("a %{a} %{a}") + assert.ErrorIs(t, err, ErrorKeyConflict) + + // Sequential tokens + _, err = New("a %{a}%{b}") + assert.ErrorIs(t, err, ErrorSequentialToken) +} + +func TestMustPanics(t *testing.T) { + assert.Panics(t, func() { + MustNew("%{bad expr") + }) +} + +// 88 ns +func BenchmarkDissect(b *testing.B) { + d, _ := New("t%{val} ") + di := d.CreateInstance() + val := []byte("this is a test ") + + for i := 0; i < b.N; i++ { + di.FindSubmatchIndex(val) + } +} 
diff --git a/pkg/dissect/errors.go b/pkg/dissect/errors.go new file mode 100644 index 00000000..cac69b56 --- /dev/null +++ b/pkg/dissect/errors.go @@ -0,0 +1,9 @@ +package dissect + +import "errors" + +var ( + ErrorKeyConflict = errors.New("key conflict") + ErrorUnclosedToken = errors.New("unclosed token") + ErrorSequentialToken = errors.New("sequential token") +) From c87e087a7c84166cd40a4cf246bb7d348951c0a1 Mon Sep 17 00:00:00 2001 From: Christopher LaPointe Date: Tue, 31 Dec 2024 16:42:46 -0500 Subject: [PATCH 02/14] wont support append for now --- pkg/dissect/dissect.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/pkg/dissect/dissect.go b/pkg/dissect/dissect.go index 400d02c9..26e60ed3 100644 --- a/pkg/dissect/dissect.go +++ b/pkg/dissect/dissect.go @@ -12,7 +12,6 @@ import ( // this is a subset of functionality // %{key} -- Named key // %{} or %{?key} -- Named skipped key -// %{+key} -- Append key, but MUST appear in-order, and will include delim // Does NOT support reference keys directly // Like fastregex, Dissect is NOT thread-safe, and an instance should be created @@ -82,7 +81,6 @@ func New(expr string) (*Dissect, error) { case keyName[0] == '?': skipped = true keyName = keyName[1:] - // TODO: Append } parts = append(parts, token{ From ea76f22bcd5bff8876651049076b8e7659c8d0a3 Mon Sep 17 00:00:00 2001 From: Christopher LaPointe Date: Wed, 1 Jan 2025 14:02:27 -0500 Subject: [PATCH 03/14] Repackage dissect and regex into matchers --- cmd/helpers/extractorBuilder.go | 2 +- main.go | 2 +- pkg/extractor/extractor.go | 2 +- pkg/{ => matchers}/dissect/dissect.go | 0 pkg/{ => matchers}/dissect/dissect_test.go | 2 +- pkg/{ => matchers}/dissect/errors.go | 0 pkg/matchers/factory.go | 17 +++++++++++++++++ pkg/{ => matchers}/fastregex/fastregex.go | 2 +- pkg/{ => matchers}/fastregex/fastregex_test.go | 0 pkg/{ => matchers}/fastregex/pcre2.go | 0 pkg/{ => matchers}/fastregex/pcre2_test.go | 0 .../fallback.go => matchers/fastregex/re2.go} | 0 pkg/matchers/intf.go 
| 12 ++++++++++++ 13 files changed, 34 insertions(+), 5 deletions(-) rename pkg/{ => matchers}/dissect/dissect.go (100%) rename pkg/{ => matchers}/dissect/dissect_test.go (98%) rename pkg/{ => matchers}/dissect/errors.go (100%) create mode 100644 pkg/matchers/factory.go rename pkg/{ => matchers}/fastregex/fastregex.go (94%) rename pkg/{ => matchers}/fastregex/fastregex_test.go (100%) rename pkg/{ => matchers}/fastregex/pcre2.go (100%) rename pkg/{ => matchers}/fastregex/pcre2_test.go (100%) rename pkg/{fastregex/fallback.go => matchers/fastregex/re2.go} (100%) create mode 100644 pkg/matchers/intf.go diff --git a/cmd/helpers/extractorBuilder.go b/cmd/helpers/extractorBuilder.go index eba4a7ba..f7d2d90b 100644 --- a/cmd/helpers/extractorBuilder.go +++ b/cmd/helpers/extractorBuilder.go @@ -146,7 +146,7 @@ func getExtractorFlags() []cli.Flag { Usage: "Compile regex as against posix standard", }, &cli.StringFlag{ - Name: "match,m", + Name: "match", Aliases: []string{"m"}, Category: cliCategoryMatching, Usage: "Regex to create match groups to summarize on", diff --git a/main.go b/main.go index f2c71744..34ad5d6b 100644 --- a/main.go +++ b/main.go @@ -10,9 +10,9 @@ import ( "rare/pkg/expressions/funcfile" "rare/pkg/expressions/funclib" "rare/pkg/expressions/stdlib" - "rare/pkg/fastregex" "rare/pkg/humanize" "rare/pkg/logger" + "rare/pkg/matchers/fastregex" "rare/pkg/multiterm" "rare/pkg/multiterm/termunicode" diff --git a/pkg/extractor/extractor.go b/pkg/extractor/extractor.go index 5c7db8b1..d0a489f1 100644 --- a/pkg/extractor/extractor.go +++ b/pkg/extractor/extractor.go @@ -3,7 +3,7 @@ package extractor import ( "rare/pkg/expressions" "rare/pkg/expressions/funclib" - "rare/pkg/fastregex" + "rare/pkg/matchers/fastregex" "sync" "sync/atomic" "unsafe" diff --git a/pkg/dissect/dissect.go b/pkg/matchers/dissect/dissect.go similarity index 100% rename from pkg/dissect/dissect.go rename to pkg/matchers/dissect/dissect.go diff --git a/pkg/dissect/dissect_test.go 
b/pkg/matchers/dissect/dissect_test.go similarity index 98% rename from pkg/dissect/dissect_test.go rename to pkg/matchers/dissect/dissect_test.go index 7bdff13f..20cafa7a 100644 --- a/pkg/dissect/dissect_test.go +++ b/pkg/matchers/dissect/dissect_test.go @@ -60,7 +60,7 @@ func TestNoPrefixSuffix(t *testing.T) { assert.Equal(t, []int{0, 5, 0, 5}, d.FindSubmatchIndex([]byte("a b c"))) } -func TestErrorCompile(t *testing.T) { +func TestErrorNew(t *testing.T) { // Unclosed _, err := New("unclosed %{") assert.ErrorIs(t, err, ErrorUnclosedToken) diff --git a/pkg/dissect/errors.go b/pkg/matchers/dissect/errors.go similarity index 100% rename from pkg/dissect/errors.go rename to pkg/matchers/dissect/errors.go diff --git a/pkg/matchers/factory.go b/pkg/matchers/factory.go new file mode 100644 index 00000000..2cbad3e0 --- /dev/null +++ b/pkg/matchers/factory.go @@ -0,0 +1,17 @@ +package matchers + +type LikeFactory[T Matcher] interface { + CreateInstance() T +} + +type factoryWrapper[T Matcher] struct { + matcher LikeFactory[T] +} + +func (s *factoryWrapper[T]) CreateInstance() Matcher { + return s.matcher.CreateInstance() +} + +func ToFactory[T Matcher](f LikeFactory[T]) Factory { + return &factoryWrapper[T]{f} +} diff --git a/pkg/fastregex/fastregex.go b/pkg/matchers/fastregex/fastregex.go similarity index 94% rename from pkg/fastregex/fastregex.go rename to pkg/matchers/fastregex/fastregex.go index 2adbb7f6..b20a8daf 100644 --- a/pkg/fastregex/fastregex.go +++ b/pkg/matchers/fastregex/fastregex.go @@ -7,7 +7,7 @@ type CompiledRegexp interface { // Regexp serves as an abstraction interface for regex classes // and shares the same methods as the re2/regexp implementation -// which allows for easy fallback. This interface is expeted +// which allows for easy fallback. 
This interface is expected // to only be used by a single thread/goroutine type Regexp interface { Match(b []byte) bool diff --git a/pkg/fastregex/fastregex_test.go b/pkg/matchers/fastregex/fastregex_test.go similarity index 100% rename from pkg/fastregex/fastregex_test.go rename to pkg/matchers/fastregex/fastregex_test.go diff --git a/pkg/fastregex/pcre2.go b/pkg/matchers/fastregex/pcre2.go similarity index 100% rename from pkg/fastregex/pcre2.go rename to pkg/matchers/fastregex/pcre2.go diff --git a/pkg/fastregex/pcre2_test.go b/pkg/matchers/fastregex/pcre2_test.go similarity index 100% rename from pkg/fastregex/pcre2_test.go rename to pkg/matchers/fastregex/pcre2_test.go diff --git a/pkg/fastregex/fallback.go b/pkg/matchers/fastregex/re2.go similarity index 100% rename from pkg/fastregex/fallback.go rename to pkg/matchers/fastregex/re2.go diff --git a/pkg/matchers/intf.go b/pkg/matchers/intf.go new file mode 100644 index 00000000..3a9b3fe1 --- /dev/null +++ b/pkg/matchers/intf.go @@ -0,0 +1,12 @@ +package matchers + +// A thread-safe compiled matcher that can create instances +type Factory interface { + CreateInstance() Matcher +} + +// A non-thread-safe matcher that can be used to find matches +type Matcher interface { + FindSubmatchIndex(b []byte) []int + SubexpNameTable() map[string]int +} From 61adc4b75b7b1134dc8cfdec49f3ce9baedd9d73 Mon Sep 17 00:00:00 2001 From: Christopher LaPointe Date: Wed, 1 Jan 2025 14:18:06 -0500 Subject: [PATCH 04/14] Refactor code so extractor takes generic matcher --- cmd/helpers/extractorBuilder.go | 49 +++++++++++++++++++++++--- cmd/helpers/updatingAggregator_test.go | 4 ++- pkg/extractor/extractor.go | 22 +++++------- pkg/extractor/extractor_test.go | 14 ++++---- pkg/extractor_test/benchmark_test.go | 4 ++- pkg/matchers/factory.go | 1 + 6 files changed, 68 insertions(+), 26 deletions(-) diff --git a/cmd/helpers/extractorBuilder.go b/cmd/helpers/extractorBuilder.go index f7d2d90b..e38a4641 100644 --- 
a/cmd/helpers/extractorBuilder.go +++ b/cmd/helpers/extractorBuilder.go @@ -1,12 +1,16 @@ package helpers import ( + "errors" "os" "rare/pkg/expressions" "rare/pkg/extractor" "rare/pkg/extractor/batchers" "rare/pkg/extractor/dirwalk" "rare/pkg/logger" + "rare/pkg/matchers" + "rare/pkg/matchers/dissect" + "rare/pkg/matchers/fastregex" "runtime" "strings" @@ -74,15 +78,15 @@ func BuildExtractorFromArguments(c *cli.Context, batcher *batchers.Batcher) *ext func BuildExtractorFromArgumentsEx(c *cli.Context, batcher *batchers.Batcher, sep string) *extractor.Extractor { config := extractor.Config{ - Posix: c.Bool("posix"), - Regex: c.String("match"), Extract: strings.Join(c.StringSlice("extract"), sep), Workers: c.Int("workers"), } - if c.Bool("ignore-case") { - config.Regex = "(?i)" + config.Regex + matcher, err := BuildMatcherFromArguments(c) + if err != nil { + logger.Fatalln(ExitCodeInvalidUsage, err) } + config.Matcher = matcher ignoreSlice := c.StringSlice("ignore") if len(ignoreSlice) > 0 { @@ -100,6 +104,37 @@ func BuildExtractorFromArgumentsEx(c *cli.Context, batcher *batchers.Batcher, se return ret } +func BuildMatcherFromArguments(c *cli.Context) (matchers.Factory, error) { + var ( + matchExpr = c.String("match") + dissectExpr = c.String("dissect") + posix = c.Bool("posix") + ignoreCase = c.Bool("ignore-case") + ) + + switch { + case c.IsSet("match") && c.IsSet("dissect"): + return nil, errors.New("match and dissect conflict") + case c.IsSet("dissect"): + // TODO: Ignore case + d, err := dissect.New(dissectExpr) + if err != nil { + return nil, err + } + return matchers.ToFactory(d), nil + default: // match has a default (OPTIMIZE: Dont bother with regex now that we have a wrapper??) 
+ if ignoreCase { + matchExpr = "(?i)" + matchExpr + } + + r, err := fastregex.CompileEx(matchExpr, posix) + if err != nil { + return nil, err + } + return matchers.ToFactory(r), nil + } +} + func getExtractorFlags() []cli.Flag { workerCount := runtime.NumCPU()/2 + 1 @@ -152,6 +187,12 @@ func getExtractorFlags() []cli.Flag { Usage: "Regex to create match groups to summarize on", Value: ".*", }, + &cli.StringFlag{ + Name: "dissect", + Aliases: []string{"d"}, + Category: cliCategoryMatching, + Usage: "Dissect expression create match groups to summarize on", + }, &cli.StringSliceFlag{ Name: "extract", Aliases: []string{"e"}, diff --git a/cmd/helpers/updatingAggregator_test.go b/cmd/helpers/updatingAggregator_test.go index 336fb590..6b673ec3 100644 --- a/cmd/helpers/updatingAggregator_test.go +++ b/cmd/helpers/updatingAggregator_test.go @@ -4,6 +4,8 @@ import ( "io" "rare/pkg/extractor" "rare/pkg/extractor/batchers" + "rare/pkg/matchers" + "rare/pkg/matchers/fastregex" "strings" "testing" @@ -31,7 +33,7 @@ func TestAggregationLoop(t *testing.T) { // Build a real extractor batcher := batchers.OpenReaderToChan("test", io.NopCloser(strings.NewReader(testData)), 1, 1) ex, err := extractor.New(batcher.BatchChan(), &extractor.Config{ - Regex: `(\d+)`, + Matcher: matchers.ToFactory(fastregex.MustCompile(`(\d+)`)), Extract: "val:{1}", Workers: 1, }) diff --git a/pkg/extractor/extractor.go b/pkg/extractor/extractor.go index d0a489f1..4a9884df 100644 --- a/pkg/extractor/extractor.go +++ b/pkg/extractor/extractor.go @@ -3,7 +3,7 @@ package extractor import ( "rare/pkg/expressions" "rare/pkg/expressions/funclib" - "rare/pkg/matchers/fastregex" + "rare/pkg/matchers" "sync" "sync/atomic" "unsafe" @@ -31,11 +31,10 @@ type Match struct { // Config for the extractor type Config struct { - Posix bool // Posix parse regex - Regex string // Regex to find matches - Extract string // Extract these values from regex (expression) - Workers int // Workers to parse regex - Ignore IgnoreSet // 
Ignore these truthy expressions + Matcher matchers.Factory // Matcher + Extract string // Extract these values from regex (expression) + Workers int // Workers to parse regex + Ignore IgnoreSet // Ignore these truthy expressions } // Extractor is the representation of the reader @@ -43,7 +42,7 @@ type Config struct { // Expects someone to consume its ReadChan() type Extractor struct { readChan chan []Match - compiledRegexp fastregex.CompiledRegexp + compiledRegexp matchers.Factory readLines uint64 matchedLines uint64 ignoredLines uint64 @@ -54,7 +53,7 @@ type Extractor struct { type extractorInstance struct { *Extractor - re fastregex.Regexp + re matchers.Matcher context *SliceSpaceExpressionContext } @@ -157,14 +156,9 @@ func New(inputBatch <-chan InputBatch, config *Config) (*Extractor, error) { return nil, compErr } - compiledRegex, err := fastregex.CompileEx(config.Regex, config.Posix) - if err != nil { - return nil, err - } - extractor := Extractor{ readChan: make(chan []Match, 5), - compiledRegexp: compiledRegex, + compiledRegexp: config.Matcher, keyBuilder: compiledExpression, config: *config, ignore: config.Ignore, diff --git a/pkg/extractor/extractor_test.go b/pkg/extractor/extractor_test.go index 81acc1ee..0a716462 100644 --- a/pkg/extractor/extractor_test.go +++ b/pkg/extractor/extractor_test.go @@ -1,6 +1,8 @@ package extractor import ( + "rare/pkg/matchers" + "rare/pkg/matchers/fastregex" "strings" "testing" @@ -15,7 +17,7 @@ xxx` func TestBasicExtractor(t *testing.T) { input := convertReaderToBatches("test", strings.NewReader(testData), 1) ex, err := New(input, &Config{ - Regex: `(\d+)`, + Matcher: matchers.ToFactory(fastregex.MustCompile(`(\d+)`)), Extract: "val:{1}", Workers: 1, }) @@ -39,7 +41,7 @@ func TestBasicExtractor(t *testing.T) { func TestSourceAndLine(t *testing.T) { input := convertReaderToBatches("test", strings.NewReader(testData), 1) ex, err := New(input, &Config{ - Regex: `(\d+)`, + Matcher: 
matchers.ToFactory(fastregex.MustCompile(`(\d+)`)), Extract: "{src} {line} val:{1} {bad} {@}", Workers: 1, }) @@ -57,7 +59,7 @@ func TestIgnoreLines(t *testing.T) { input := convertReaderToBatches("test", strings.NewReader(testData), 1) ignore, _ := NewIgnoreExpressions(`{eq {1} "123"}`) ex, err := New(input, &Config{ - Regex: `(\d+)`, + Matcher: matchers.ToFactory(fastregex.MustCompile(`(\d+)`)), Extract: "{src} {line} val:{1} {bad}{500}", Workers: 1, Ignore: ignore, @@ -72,7 +74,7 @@ func TestIgnoreLines(t *testing.T) { func TestNamedGroup(t *testing.T) { input := convertReaderToBatches("test", strings.NewReader(testData), 1) ex, err := New(input, &Config{ - Regex: `(?P\d+)`, + Matcher: matchers.ToFactory(fastregex.MustCompile(`(?P\d+)`)), Extract: "val:{1}:{num}", Workers: 1, }) @@ -87,7 +89,7 @@ func TestNamedGroup(t *testing.T) { func TestJSONOutput(t *testing.T) { input := convertReaderToBatches("test", strings.NewReader(testData), 1) ex, err := New(input, &Config{ - Regex: `(?P\d+)`, + Matcher: matchers.ToFactory(fastregex.MustCompile(`(?P\d+)`)), Extract: "{.} {#} {.#} {#.}", Workers: 1, }) @@ -100,7 +102,7 @@ func TestJSONOutput(t *testing.T) { func TestGH10SliceBoundsPanic(t *testing.T) { input := convertReaderToBatches("", strings.NewReader("this is an [ERROR] message"), 1) ex, err := New(input, &Config{ - Regex: `\[(INFO)|(ERROR)|(WARNING)|(CRITICAL)\]`, + Matcher: matchers.ToFactory(fastregex.MustCompile(`\[(INFO)|(ERROR)|(WARNING)|(CRITICAL)\]`)), Extract: "val:{2} val:{3}", Workers: 1, }) diff --git a/pkg/extractor_test/benchmark_test.go b/pkg/extractor_test/benchmark_test.go index 6f7dadd9..8dcf982e 100644 --- a/pkg/extractor_test/benchmark_test.go +++ b/pkg/extractor_test/benchmark_test.go @@ -2,6 +2,8 @@ package benchmark_test import ( "rare/pkg/extractor" + "rare/pkg/matchers" + "rare/pkg/matchers/fastregex" "testing" ) @@ -29,7 +31,7 @@ func BenchmarkExtractor(b *testing.B) { for n := 0; n < b.N; n++ { gen := batchInputGenerator(10000, 100) 
extractor, _ := extractor.New(gen, &extractor.Config{ - Regex: `(\d{3})`, + Matcher: matchers.ToFactory(fastregex.MustCompile(`(\d{3})`)), Extract: "{bucket {1} 10}", Workers: 2, }) diff --git a/pkg/matchers/factory.go b/pkg/matchers/factory.go index 2cbad3e0..5fcdda26 100644 --- a/pkg/matchers/factory.go +++ b/pkg/matchers/factory.go @@ -12,6 +12,7 @@ func (s *factoryWrapper[T]) CreateInstance() Matcher { return s.matcher.CreateInstance() } +// Maps a factory-like interface to a matcher factory func ToFactory[T Matcher](f LikeFactory[T]) Factory { return &factoryWrapper[T]{f} } From 0b39b6a949bfaf21e7f0dc0512c30cc53f4d6898 Mon Sep 17 00:00:00 2001 From: Christopher LaPointe Date: Wed, 1 Jan 2025 14:20:17 -0500 Subject: [PATCH 05/14] Rename new to compile --- cmd/helpers/extractorBuilder.go | 2 +- pkg/matchers/dissect/dissect.go | 6 +++--- pkg/matchers/dissect/dissect_test.go | 24 ++++++++++++------------ 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/cmd/helpers/extractorBuilder.go b/cmd/helpers/extractorBuilder.go index e38a4641..aa48acc9 100644 --- a/cmd/helpers/extractorBuilder.go +++ b/cmd/helpers/extractorBuilder.go @@ -117,7 +117,7 @@ func BuildMatcherFromArguments(c *cli.Context) (matchers.Factory, error) { return nil, errors.New("match and dissect conflict") case c.IsSet("dissect"): // TODO: Ignore case - d, err := dissect.New(dissectExpr) + d, err := dissect.Compile(dissectExpr) if err != nil { return nil, err } diff --git a/pkg/matchers/dissect/dissect.go b/pkg/matchers/dissect/dissect.go index 26e60ed3..a2dbc577 100644 --- a/pkg/matchers/dissect/dissect.go +++ b/pkg/matchers/dissect/dissect.go @@ -35,7 +35,7 @@ type DissectInstance struct { groupPool *slicepool.IntPool } -func New(expr string) (*Dissect, error) { +func Compile(expr string) (*Dissect, error) { parts := make([]token, 0) groupNames := make(map[string]int) @@ -106,8 +106,8 @@ func New(expr string) (*Dissect, error) { }, nil } -func MustNew(expr string) *Dissect { - d, err 
:= New(expr) +func MustCompile(expr string) *Dissect { + d, err := Compile(expr) if err != nil { panic(err) } diff --git a/pkg/matchers/dissect/dissect_test.go b/pkg/matchers/dissect/dissect_test.go index 20cafa7a..55d4e72e 100644 --- a/pkg/matchers/dissect/dissect_test.go +++ b/pkg/matchers/dissect/dissect_test.go @@ -7,7 +7,7 @@ import ( ) func TestDissectBasic(t *testing.T) { - d := MustNew("%{val};%{};%{?skip} - %{val2}").CreateInstance() + d := MustCompile("%{val};%{};%{?skip} - %{val2}").CreateInstance() assert.Equal(t, []int{0, 17, 0, 5, 12, 17}, d.FindSubmatchIndex([]byte("Hello;a;b - there"))) @@ -18,13 +18,13 @@ func TestDissectBasic(t *testing.T) { } func TestEmpty(t *testing.T) { - d := MustNew("").CreateInstance() + d := MustCompile("").CreateInstance() assert.Equal(t, []int{0, 0}, d.FindSubmatchIndex([]byte("hello"))) assert.Equal(t, []int{0, 0}, d.FindSubmatchIndex([]byte(""))) } func TestNoTokens(t *testing.T) { - d := MustNew("test").CreateInstance() + d := MustCompile("test").CreateInstance() assert.Nil(t, d.FindSubmatchIndex([]byte("hello there"))) assert.Equal(t, []int{0, 4}, d.FindSubmatchIndex([]byte("test"))) @@ -36,7 +36,7 @@ func TestNoTokens(t *testing.T) { } func TestPrefix(t *testing.T) { - d := MustNew("mid %{val};%{val2} after").CreateInstance() + d := MustCompile("mid %{val};%{val2} after").CreateInstance() assert.Equal(t, []int{12, 29, 16, 19, 20, 23}, d.FindSubmatchIndex([]byte("string with mid 123;456 after k"))) assert.Nil(t, d.FindSubmatchIndex([]byte("string with mi 123;456 after k"))) @@ -45,44 +45,44 @@ func TestPrefix(t *testing.T) { } func TestSuffix(t *testing.T) { - d := MustNew("%{val};%{val2} after").CreateInstance() + d := MustCompile("%{val};%{val2} after").CreateInstance() assert.Equal(t, []int{0, 13, 0, 3, 4, 7}, d.FindSubmatchIndex([]byte("123;456 after k"))) assert.Equal(t, []int{0, 17, 0, 7, 8, 11}, d.FindSubmatchIndex([]byte("hah 123;456 after k"))) assert.Nil(t, d.FindSubmatchIndex([]byte("123;456 boom k"))) 
assert.Nil(t, d.FindSubmatchIndex([]byte(""))) - assert.Equal(t, []int{2, 13, 6, 13}, MustNew("end %{nada}").CreateInstance().FindSubmatchIndex([]byte("a end nothing"))) + assert.Equal(t, []int{2, 13, 6, 13}, MustCompile("end %{nada}").CreateInstance().FindSubmatchIndex([]byte("a end nothing"))) } func TestNoPrefixSuffix(t *testing.T) { - d := MustNew("%{onlymatch}").CreateInstance() + d := MustCompile("%{onlymatch}").CreateInstance() assert.Equal(t, []int{0, 5, 0, 5}, d.FindSubmatchIndex([]byte("a b c"))) } func TestErrorNew(t *testing.T) { // Unclosed - _, err := New("unclosed %{") + _, err := Compile("unclosed %{") assert.ErrorIs(t, err, ErrorUnclosedToken) // Dupe key - _, err = New("a %{a} %{a}") + _, err = Compile("a %{a} %{a}") assert.ErrorIs(t, err, ErrorKeyConflict) // Sequential tokens - _, err = New("a %{a}%{b}") + _, err = Compile("a %{a}%{b}") assert.ErrorIs(t, err, ErrorSequentialToken) } func TestMustPanics(t *testing.T) { assert.Panics(t, func() { - MustNew("%{bad expr") + MustCompile("%{bad expr") }) } // 88 ns func BenchmarkDissect(b *testing.B) { - d, _ := New("t%{val} ") + d, _ := Compile("t%{val} ") di := d.CreateInstance() val := []byte("this is a test ") From 1ae6f8206e8b4d9a05f01e22daa249582ceec29b Mon Sep 17 00:00:00 2001 From: Christopher LaPointe Date: Wed, 1 Jan 2025 18:05:47 -0500 Subject: [PATCH 06/14] Add case-insensitivity to dissect --- cmd/helpers/extractorBuilder.go | 3 +-- pkg/matchers/dissect/case.go | 37 ++++++++++++++++++++++++++++ pkg/matchers/dissect/case_test.go | 1 + pkg/matchers/dissect/dissect.go | 26 +++++++++++++++---- pkg/matchers/dissect/dissect_test.go | 20 +++++++++++++-- 5 files changed, 78 insertions(+), 9 deletions(-) create mode 100644 pkg/matchers/dissect/case.go create mode 100644 pkg/matchers/dissect/case_test.go diff --git a/cmd/helpers/extractorBuilder.go b/cmd/helpers/extractorBuilder.go index aa48acc9..b97213c4 100644 --- a/cmd/helpers/extractorBuilder.go +++ b/cmd/helpers/extractorBuilder.go @@ -116,8 
+116,7 @@ func BuildMatcherFromArguments(c *cli.Context) (matchers.Factory, error) { case c.IsSet("match") && c.IsSet("dissect"): return nil, errors.New("match and dissect conflict") case c.IsSet("dissect"): - // TODO: Ignore case - d, err := dissect.Compile(dissectExpr) + d, err := dissect.CompileEx(dissectExpr, ignoreCase) if err != nil { return nil, err } diff --git a/pkg/matchers/dissect/case.go b/pkg/matchers/dissect/case.go new file mode 100644 index 00000000..cd2befab --- /dev/null +++ b/pkg/matchers/dissect/case.go @@ -0,0 +1,37 @@ +package dissect + +import "unicode" + +// Finds case-insensitive index of second string +// ASSUMES second string is already lowered (optimization) +func indexIgnoreCase(s, substr string) int { + n := len(substr) + switch { + case n == 0: + return 0 + case len(s) < n: + return -1 + case len(s) == n: + for i := 0; i < n; i++ { + if unicode.ToLower(rune(s[i])) != rune(substr[i]) { + return -1 + } + } + return 0 + default: + for i := 0; i <= len(s)-n; i++ { + match := true + for j := 0; j < n; j++ { + if unicode.ToLower(rune(s[i+j])) != rune(substr[j]) { + match = false + break + } + } + if match { + return i + } + } + return -1 + } + +} diff --git a/pkg/matchers/dissect/case_test.go b/pkg/matchers/dissect/case_test.go new file mode 100644 index 00000000..2bb2e1b6 --- /dev/null +++ b/pkg/matchers/dissect/case_test.go @@ -0,0 +1 @@ +package dissect diff --git a/pkg/matchers/dissect/dissect.go b/pkg/matchers/dissect/dissect.go index a2dbc577..48910187 100644 --- a/pkg/matchers/dissect/dissect.go +++ b/pkg/matchers/dissect/dissect.go @@ -23,8 +23,9 @@ type token struct { } type Dissect struct { - tokens []token - prefix string + tokens []token + prefix string + indexOf func(src, of string) int groupNames map[string]int groupCount int @@ -35,7 +36,7 @@ type DissectInstance struct { groupPool *slicepool.IntPool } -func Compile(expr string) (*Dissect, error) { +func CompileEx(expr string, ignoreCase bool) (*Dissect, error) { parts := 
make([]token, 0) groupNames := make(map[string]int) @@ -73,6 +74,10 @@ func Compile(expr string) (*Dissect, error) { keyUntil := expr[:end] expr = expr[end:] + if ignoreCase { + keyUntil = strings.ToLower(keyUntil) + } + skipped := false switch { @@ -98,14 +103,25 @@ func Compile(expr string) (*Dissect, error) { } } + indexOfFunc := strings.Index + if ignoreCase { + indexOfFunc = indexIgnoreCase + prefix = strings.ToLower(prefix) + } + return &Dissect{ groupNames: groupNames, groupCount: groupIndex, tokens: parts, prefix: prefix, + indexOf: indexOfFunc, }, nil } +func Compile(expr string) (*Dissect, error) { + return CompileEx(expr, false) +} + func MustCompile(expr string) *Dissect { d, err := Compile(expr) if err != nil { @@ -129,7 +145,7 @@ func (s *DissectInstance) FindSubmatchIndex(b []byte) []int { start := 0 if s.prefix != "" { - start = strings.Index(str, s.prefix) + start = s.indexOf(str, s.prefix) if start < 0 { return nil } @@ -146,7 +162,7 @@ func (s *DissectInstance) FindSubmatchIndex(b []byte) []int { if token.until == "" { endOffset = len(str[start:]) } else { - endOffset = strings.Index(str[start:], token.until) + endOffset = s.indexOf(str[start:], token.until) if endOffset < 0 { return nil } diff --git a/pkg/matchers/dissect/dissect_test.go b/pkg/matchers/dissect/dissect_test.go index 55d4e72e..3ad95409 100644 --- a/pkg/matchers/dissect/dissect_test.go +++ b/pkg/matchers/dissect/dissect_test.go @@ -33,6 +33,7 @@ func TestNoTokens(t *testing.T) { assert.Equal(t, []int{0, 4}, d.FindSubmatchIndex([]byte("testa"))) assert.Equal(t, []int{0, 4}, d.FindSubmatchIndex([]byte("testabc"))) assert.Equal(t, []int{3, 7}, d.FindSubmatchIndex([]byte("abctestabc"))) + assert.Nil(t, d.FindSubmatchIndex([]byte("tEst"))) } func TestPrefix(t *testing.T) { @@ -80,9 +81,24 @@ func TestMustPanics(t *testing.T) { }) } -// 88 ns +func TestIgnoreCase(t *testing.T) { + d, err := CompileEx("TeSt1", true) + + assert.NoError(t, err) + assert.Equal(t, []int{0, 5}, 
d.CreateInstance().FindSubmatchIndex([]byte("test1"))) + assert.Equal(t, []int{0, 5}, d.CreateInstance().FindSubmatchIndex([]byte("tEst1"))) + assert.Equal(t, []int{0, 5}, d.CreateInstance().FindSubmatchIndex([]byte("TEST1"))) + assert.Equal(t, []int{1, 6}, d.CreateInstance().FindSubmatchIndex([]byte("ATest123"))) + assert.Nil(t, d.CreateInstance().FindSubmatchIndex([]byte("asdf"))) + + d, err = CompileEx("pref %{val} post", true) + assert.NoError(t, err) + assert.Equal(t, []int{2, 13, 7, 8}, d.CreateInstance().FindSubmatchIndex([]byte("a Pref 5 pOst"))) +} + +// BenchmarkDissect-4 13347456 86.07 ns/op 32 B/op 0 allocs/op func BenchmarkDissect(b *testing.B) { - d, _ := Compile("t%{val} ") + d, _ := CompileEx("t%{val} ", true) di := d.CreateInstance() val := []byte("this is a test ") From 3867f4e986296e0629b153aed2fdff7f0f4bcb0e Mon Sep 17 00:00:00 2001 From: Christopher LaPointe Date: Wed, 1 Jan 2025 18:10:01 -0500 Subject: [PATCH 07/14] Fix build for moved fastregex package --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3616e546..3c5864f0 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -23,7 +23,7 @@ jobs: - name: Test run: | go test -v -race -coverprofile=coverage.txt -covermode=atomic ./... - go test -tags=pcre2 rare/pkg/fastregex + go test -tags=pcre2 rare/pkg/matchers/fastregex - name: StaticCheck run: | go run honnef.co/go/tools/cmd/staticcheck@2024.1.1 ./... 
From 11a475d8d9b9a2f938a72d0e6c32c9a39306be52 Mon Sep 17 00:00:00 2001 From: Christopher LaPointe Date: Wed, 1 Jan 2025 19:18:41 -0500 Subject: [PATCH 08/14] Unit tests --- pkg/matchers/dissect/case_test.go | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/pkg/matchers/dissect/case_test.go b/pkg/matchers/dissect/case_test.go index 2bb2e1b6..ab822e36 100644 --- a/pkg/matchers/dissect/case_test.go +++ b/pkg/matchers/dissect/case_test.go @@ -1 +1,30 @@ package dissect + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestIndexIgnoreCase(t *testing.T) { + assert.Equal(t, 0, indexIgnoreCase("abc", "a")) + assert.Equal(t, 0, indexIgnoreCase("abc", "")) + + assert.Equal(t, 1, indexIgnoreCase("abc", "bc")) + assert.Equal(t, -1, indexIgnoreCase("abc", "ac")) + + assert.Equal(t, 0, indexIgnoreCase("abc", "abc")) + + assert.Equal(t, -1, indexIgnoreCase("abc", "bca")) + assert.Equal(t, -1, indexIgnoreCase("abc", "abcd")) + + assert.Equal(t, 0, indexIgnoreCase("ABC", "a")) + assert.Equal(t, 0, indexIgnoreCase("ABC", "")) + + assert.Equal(t, 1, indexIgnoreCase("ABC", "bc")) + + assert.Equal(t, 0, indexIgnoreCase("ABC", "abc")) + + assert.Equal(t, -1, indexIgnoreCase("ABC", "bca")) + assert.Equal(t, -1, indexIgnoreCase("ABC", "abcd")) +} From af01c6666c81af44b8c1373bf742a1555e2a4793 Mon Sep 17 00:00:00 2001 From: Christopher LaPointe Date: Wed, 1 Jan 2025 19:53:26 -0500 Subject: [PATCH 09/14] Fix skip-token bug. 
Add AlwaysMatcher, which makes the default case much more performant --- cmd/helpers/extractorBuilder.go | 4 +++- pkg/matchers/dissect/dissect.go | 10 +++++----- pkg/matchers/dissect/dissect_test.go | 8 ++++++++ pkg/matchers/simple.go | 15 +++++++++++++++ 4 files changed, 31 insertions(+), 6 deletions(-) create mode 100644 pkg/matchers/simple.go diff --git a/cmd/helpers/extractorBuilder.go b/cmd/helpers/extractorBuilder.go index b97213c4..65089adb 100644 --- a/cmd/helpers/extractorBuilder.go +++ b/cmd/helpers/extractorBuilder.go @@ -121,7 +121,7 @@ func BuildMatcherFromArguments(c *cli.Context) (matchers.Factory, error) { return nil, err } return matchers.ToFactory(d), nil - default: // match has a default (OPTIMIZE: Dont bother with regex now that we have a wrapper??) + case c.IsSet("match"): if ignoreCase { matchExpr = "(?i)" + matchExpr } @@ -131,6 +131,8 @@ func BuildMatcherFromArguments(c *cli.Context) (matchers.Factory, error) { return nil, err } return matchers.ToFactory(r), nil + default: + return &matchers.AlwaysMatch{}, nil } } diff --git a/pkg/matchers/dissect/dissect.go b/pkg/matchers/dissect/dissect.go index 48910187..a7ac268c 100644 --- a/pkg/matchers/dissect/dissect.go +++ b/pkg/matchers/dissect/dissect.go @@ -46,12 +46,12 @@ func CompileEx(expr string, ignoreCase bool) (*Dissect, error) { for { start := strings.Index(expr, "%{") if start < 0 { - if groupIndex == 0 { // no tokens in expr + if len(parts) == 0 { // no tokens in expr prefix = expr } break } - if groupIndex == 0 { + if len(parts) == 0 { prefix = expr[:start] } expr = expr[start+2:] @@ -78,12 +78,12 @@ func CompileEx(expr string, ignoreCase bool) (*Dissect, error) { keyUntil = strings.ToLower(keyUntil) } + // Special flags skipped := false - switch { - case len(keyName) == 0: + case len(keyName) == 0: // empty skip skipped = true - case keyName[0] == '?': + case keyName[0] == '?': // named skip skipped = true keyName = keyName[1:] } diff --git a/pkg/matchers/dissect/dissect_test.go 
b/pkg/matchers/dissect/dissect_test.go index 3ad95409..6ac1f3bd 100644 --- a/pkg/matchers/dissect/dissect_test.go +++ b/pkg/matchers/dissect/dissect_test.go @@ -17,6 +17,14 @@ func TestDissectBasic(t *testing.T) { }, d.SubexpNameTable()) } +func TestPrefixOnSkipKey(t *testing.T) { + d := MustCompile("prefix %{}: %{val}").CreateInstance() + + assert.Nil(t, d.FindSubmatchIndex([]byte("a: b"))) + assert.Equal(t, []int{0, 11, 10, 11}, d.FindSubmatchIndex([]byte("prefix a: b"))) + assert.Nil(t, d.FindSubmatchIndex([]byte("Prefix a: b"))) +} + func TestEmpty(t *testing.T) { d := MustCompile("").CreateInstance() assert.Equal(t, []int{0, 0}, d.FindSubmatchIndex([]byte("hello"))) diff --git a/pkg/matchers/simple.go b/pkg/matchers/simple.go new file mode 100644 index 00000000..6f8c3d46 --- /dev/null +++ b/pkg/matchers/simple.go @@ -0,0 +1,15 @@ +package matchers + +type AlwaysMatch struct{} + +func (s *AlwaysMatch) CreateInstance() Matcher { + return s +} + +func (s *AlwaysMatch) FindSubmatchIndex(b []byte) []int { + return []int{0, len(b)} +} + +func (s *AlwaysMatch) SubexpNameTable() map[string]int { + return make(map[string]int) +} From 57d1cfc70875cf5620b48105a3c224644db0d5c4 Mon Sep 17 00:00:00 2001 From: Christopher LaPointe Date: Wed, 1 Jan 2025 19:57:08 -0500 Subject: [PATCH 10/14] Builder tests --- cmd/helpers/extractorBuilder_test.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cmd/helpers/extractorBuilder_test.go b/cmd/helpers/extractorBuilder_test.go index c0a776ad..7a2221fa 100644 --- a/cmd/helpers/extractorBuilder_test.go +++ b/cmd/helpers/extractorBuilder_test.go @@ -56,6 +56,10 @@ func TestBuildingExtractorFromContext(t *testing.T) { assert.NoError(t, runApp("")) assert.NoError(t, runApp(`-I -i "{eq {0} abc}" ../testdata/log.txt`)) assert.NoError(t, runApp(`-f ../testdata/log.txt`)) + assert.NoError(t, runApp(`-m ".*" ../testdata/log.txt`)) + assert.NoError(t, runApp(`-I -m ".*" ../testdata/log.txt`)) + assert.NoError(t, 
runApp(`-d "%{}" ../testdata/log.txt`)) + assert.NoError(t, runApp(`-I -d "%{}" ../testdata/log.txt`)) testLogFatal(t, 2, func() { runApp("--batch 0 ../testdata/log.txt") }) @@ -77,5 +81,11 @@ func TestBuildingExtractorFromContext(t *testing.T) { testLogFatal(t, 2, func() { runApp(`-i "{0" -`) }) - assert.Equal(t, 3, actionCalled) + testLogFatal(t, 2, func() { + runApp(`-m regex -d dissect -`) + }) + testLogFatal(t, 2, func() { + runApp(`-d "%{unclosed" -`) + }) + assert.Equal(t, 7, actionCalled) } From c0d3d6223dbebbfded39523a11d2426be5ca9480 Mon Sep 17 00:00:00 2001 From: Christopher LaPointe Date: Thu, 2 Jan 2025 20:43:08 -0500 Subject: [PATCH 11/14] Docs --- README.md | 1 + cmd/helpers/extractorBuilder.go | 2 +- docs/cli-help.md | 32 ++++++++++++---- docs/index.md | 1 + docs/usage/dissect.md | 68 +++++++++++++++++++++++++++++++++ docs/usage/examples.md | 10 ++--- docs/usage/expressions.md | 4 +- docs/usage/extractor.md | 55 ++++++++++++++++++++++---- docs/usage/overview.md | 9 +++-- mkdocs.yml | 1 + 10 files changed, 156 insertions(+), 27 deletions(-) create mode 100644 docs/usage/dissect.md diff --git a/README.md b/README.md index 2c163dcf..297df443 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ See [rare.zdyn.net](https://rare.zdyn.net) or the [docs/ folder](docs/) for the ## Features * Multiple summary formats including: filter (like grep), histogram, bar graphs, tables, heatmaps, reduce, and numerical analysis + * Parse using regex (`-m`) or dissect tokenizer (`-d`) * File glob expansions (eg `/var/log/*` or `/var/log/*/*.log`) and `-R` * Optional gzip decompression (with `-z`) * Following `-f` or re-open following `-F` (use `--poll` to poll, and `--tail` to tail) diff --git a/cmd/helpers/extractorBuilder.go b/cmd/helpers/extractorBuilder.go index 65089adb..3aca277b 100644 --- a/cmd/helpers/extractorBuilder.go +++ b/cmd/helpers/extractorBuilder.go @@ -211,7 +211,7 @@ func getExtractorFlags() []cli.Flag { Name: "ignore-case", Aliases: 
[]string{"I"}, Category: cliCategoryMatching, - Usage: "Augment regex to be case insensitive", + Usage: "Augment matcher to be case insensitive", }, &cli.IntFlag{ Name: "batch", diff --git a/docs/cli-help.md b/docs/cli-help.md index 1eff3b25..afeb3946 100644 --- a/docs/cli-help.md +++ b/docs/cli-help.md @@ -67,6 +67,8 @@ Filter incoming results with search criteria, and output raw matches **--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 6) +**--dissect, -d**="": Dissect expression create match groups to summarize on + **--extract, -e**="": Expression that will generate the key to group by. Specify multiple times for multi-dimensions or use {$} helper (default: [{0}]) **--follow, -f**: Read appended data as file grows @@ -75,7 +77,7 @@ Filter incoming results with search criteria, and output raw matches **--ignore, -i**="": Ignore a match given a truthy expression (Can have multiple) -**--ignore-case, -I**: Augment regex to be case insensitive +**--ignore-case, -I**: Augment matcher to be case insensitive **--line, -l**: Output source file and line number @@ -113,6 +115,8 @@ Summarize results by extracting them to a histogram **--csv, -o**="": Write final results to csv. Use - to output to stdout +**--dissect, -d**="": Dissect expression create match groups to summarize on + **--extra, -x**: Alias for -b --percentage **--extract, -e**="": Expression that will generate the key to group by. 
Specify multiple times for multi-dimensions or use {$} helper (default: [{0}]) @@ -123,7 +127,7 @@ Summarize results by extracting them to a histogram **--ignore, -i**="": Ignore a match given a truthy expression (Can have multiple) -**--ignore-case, -I**: Augment regex to be case insensitive +**--ignore-case, -I**: Augment matcher to be case insensitive **--match, -m**="": Regex to create match groups to summarize on (default: .*) @@ -167,6 +171,8 @@ Create a 2D heatmap of extracted data **--delim**="": Character to tabulate on. Use {$} helper by default (default: \x00) +**--dissect, -d**="": Dissect expression create match groups to summarize on + **--extract, -e**="": Expression that will generate the key to group by. Specify multiple times for multi-dimensions or use {$} helper (default: [{0}]) **--follow, -f**: Read appended data as file grows @@ -175,7 +181,7 @@ Create a 2D heatmap of extracted data **--ignore, -i**="": Ignore a match given a truthy expression (Can have multiple) -**--ignore-case, -I**: Augment regex to be case insensitive +**--ignore-case, -I**: Augment matcher to be case insensitive **--match, -m**="": Regex to create match groups to summarize on (default: .*) @@ -223,6 +229,8 @@ Create rows of sparkline graphs **--delim**="": Character to tabulate on. Use {$} helper by default (default: \x00) +**--dissect, -d**="": Dissect expression create match groups to summarize on + **--extract, -e**="": Expression that will generate the key to group by. 
Specify multiple times for multi-dimensions or use {$} helper (default: [{0}]) **--follow, -f**: Read appended data as file grows @@ -231,7 +239,7 @@ Create rows of sparkline graphs **--ignore, -i**="": Ignore a match given a truthy expression (Can have multiple) -**--ignore-case, -I**: Augment regex to be case insensitive +**--ignore-case, -I**: Augment matcher to be case insensitive **--match, -m**="": Regex to create match groups to summarize on (default: .*) @@ -273,6 +281,8 @@ Create a bargraph of the given 1 or 2 dimension data **--csv, -o**="": Write final results to csv. Use - to output to stdout +**--dissect, -d**="": Dissect expression create match groups to summarize on + **--extract, -e**="": Expression that will generate the key to group by. Specify multiple times for multi-dimensions or use {$} helper (default: [{0}]) **--follow, -f**: Read appended data as file grows @@ -281,7 +291,7 @@ Create a bargraph of the given 1 or 2 dimension data **--ignore, -i**="": Ignore a match given a truthy expression (Can have multiple) -**--ignore-case, -I**: Augment regex to be case insensitive +**--ignore-case, -I**: Augment matcher to be case insensitive **--match, -m**="": Regex to create match groups to summarize on (default: .*) @@ -317,6 +327,8 @@ Numerical analysis on a set of filtered data **--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 6) +**--dissect, -d**="": Dissect expression create match groups to summarize on + **--extra, -x**: Displays extra analysis on the data (Requires more memory and cpu) **--extract, -e**="": Expression that will generate the key to group by. 
Specify multiple times for multi-dimensions or use {$} helper (default: [{0}]) @@ -327,7 +339,7 @@ Numerical analysis on a set of filtered data **--ignore, -i**="": Ignore a match given a truthy expression (Can have multiple) -**--ignore-case, -I**: Augment regex to be case insensitive +**--ignore-case, -I**: Augment matcher to be case insensitive **--match, -m**="": Regex to create match groups to summarize on (default: .*) @@ -367,6 +379,8 @@ Create a 2D summarizing table of extracted data **--delim**="": Character to tabulate on. Use {$} helper by default (default: \x00) +**--dissect, -d**="": Dissect expression create match groups to summarize on + **--extra, -x**: Display row and column totals **--extract, -e**="": Expression that will generate the key to group by. Specify multiple times for multi-dimensions or use {$} helper (default: [{0}]) @@ -377,7 +391,7 @@ Create a 2D summarizing table of extracted data **--ignore, -i**="": Ignore a match given a truthy expression (Can have multiple) -**--ignore-case, -I**: Augment regex to be case insensitive +**--ignore-case, -I**: Augment matcher to be case insensitive **--match, -m**="": Regex to create match groups to summarize on (default: .*) @@ -421,6 +435,8 @@ Aggregate the results of a query based on an expression, pulling customized summ **--csv, -o**="": Write final results to csv. Use - to output to stdout +**--dissect, -d**="": Dissect expression create match groups to summarize on + **--extract, -e**="": Expression that will generate the key to group by. 
Specify multiple times for multi-dimensions or use {$} helper (default: [{@}]) **--follow, -f**: Read appended data as file grows @@ -431,7 +447,7 @@ Aggregate the results of a query based on an expression, pulling customized summ **--ignore, -i**="": Ignore a match given a truthy expression (Can have multiple) -**--ignore-case, -I**: Augment regex to be case insensitive +**--ignore-case, -I**: Augment matcher to be case insensitive **--initial**="": Specify the default initial value for any accumulators that don't specify (default: 0) diff --git a/docs/index.md b/docs/index.md index df2a0c94..efc9e53c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -18,6 +18,7 @@ Supports various CLI-based graphing and metric formats (filter (grep-like), hist ## Features * Multiple summary formats including: filter (like grep), histogram, bar graphs, tables, heatmaps, sparklines, reduce, and numerical analysis + * Parse using regex (`-m`) or dissect tokenizer (`-d`) * File glob expansions (eg `/var/log/*` or `/var/log/*/*.log`) and `-R` * Optional gzip decompression (with `-z`) * Following `-f` or re-open following `-F` (use `--poll` to poll, and `--tail` to tail) diff --git a/docs/usage/dissect.md b/docs/usage/dissect.md new file mode 100644 index 00000000..a8a40137 --- /dev/null +++ b/docs/usage/dissect.md @@ -0,0 +1,68 @@ +# Dissect Syntax + +*Dissect* is a simple token-based search algorithm, and can +be up to 10x faster than regex (and 40% faster than PCRE). + +It works by searching for constant delimiters in a string +and extracting the text between the tokens as named keys. + +*rare* implements a subset of the full dissect algorithm. + +**Syntax Example:** +``` +prefix %{name} : %{value} - %{?ignored} +``` + +## Syntax + +- Anything in a `%{}` is a variable token. +- A blank token, or a token that starts with `?` is skipped. eg `%{}` or `%{?skipped}` +- Tokens are extracted by both name and index (in the order they appear). 
+- Index `{0}` is the full match, including the delimiters +- Patterns don't need to match the entire line + +## Examples + +### Simple + +``` +prefix %{name} : %{value} +``` + +Will match: +``` +prefix bob : 123 +``` + +And will extract two keys: +``` +name=bob +value=123 +``` + +### Nginx Logs + +As a simple example, to parse nginx logs that look like: + +``` +104.238.185.46 - - [19/Aug/2019:02:26:25 +0000] "GET / HTTP/1.1" 200 546 "-" "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/98 Safari/537.4 (StatusCake)" +``` + +The following dissect expression can be used: + +``` +%{ip} - - [%{timestamp}] "%{verb} %{path} HTTP/%{?http-version}" %{status} %{size} "-" "%{useragent}" +``` + +Which, as json, will return: +```json +{ + "timestamp": "12/Dec/2019:17:54:13 +0000", + "verb": "POST", + "path": "/temtel.php", + "status": 404, + "size": 571, + "useragent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36", + "ip": "203.113.174.104" +} +``` diff --git a/docs/usage/examples.md b/docs/usage/examples.md index d9050792..df4b1eca 100644 --- a/docs/usage/examples.md +++ b/docs/usage/examples.md @@ -183,10 +183,10 @@ Matched: 1,035,666 / 1,035,666 (R: 8; C: 61) **NOTE:** For stacking (`-s`), the results will be color-coded (not shown here) ```sh -$ rare bars -z -m "\[(.+?)\].*\" (\d+)" -e "{buckettime {1} year}" -e "{2}" testdata/* +$ rare bars -z -m "\[(.+?)\].*\" (\d+)" -e "{buckettime {1} year}" -e "{2}" -s testdata/* - | 200 | 206 | 301 | 304 | 400 | 404 | 405 | 408 -2019 ||||||||||||||||||||||||||||||||||||||| 3,741,444 -2020 ||||||||||||||||||||||||||||||||||||||||||||||||| 4,631,884 -Matched: 8,373,328 / 8,383,717 + 0 200 1 206 2 301 3 304 4 400 5 404 6 405 7 408 +2019 000000000555555555555555555555555555555 3,742,444 +2020 0000000000000000004455555555555555555555555555555 4,631,884 +Matched: 8,374,328 / 8,384,811 ``` diff --git a/docs/usage/expressions.md 
b/docs/usage/expressions.md index a22b600f..1642718f 100644 --- a/docs/usage/expressions.md +++ b/docs/usage/expressions.md @@ -16,7 +16,7 @@ The basic syntax structure is as follows: * Characters can be escaped with `\`, including `\{` or `\n` * Expressions are surrounded by `{}`. * An integer in an expression denotes a matched value from the regex (or other input) eg. `{2}`. The entire match will always be `{0}` - * A string in an expression is a special key or a named regex group eg. `{src}` or `{group1}` + * A string in an expression is a special key or a named regex/dissect group eg. `{src}` or `{group1}` * When an expression has space(s), the first literal will be the name of a helper function. From there, the logic is nested. eg `{coalesce {4} {3} notfound}` * Quotes in an argument create a single argument eg. `{coalesce {4} {3} "not found"}` @@ -59,7 +59,7 @@ rare histo \ -b access.log ``` -The above parses the method `{1}`, url `{2}`, status `{3}`, and response size `{4}` in the regex. +The above parses the method `{1}`, url `{2}`, status `{3}`, and response size `{4}` in the matcher. It extracts the ` `. It will ignore `-i` if response size `{4}` is less-than `1024*1024` (1MB). diff --git a/docs/usage/extractor.md b/docs/usage/extractor.md index d8baf562..8dcc2524 100644 --- a/docs/usage/extractor.md +++ b/docs/usage/extractor.md @@ -3,14 +3,54 @@ The main component of *rare* is the extractor (or matcher). 
There are three fundamental concepts around the parser: - * Each line of an input (separated by `\n`) is matched to a regex - * A regex is used to parse a line into a match (and optionally, groups) + * Each line of an input (separated by `\n`) is matched to a matcher + * A matcher is used to parse a line into a match (and optionally, groups) * An expression (see: [expression](expressions.md)) is used to format an - output from a regex group - * Optionally, one or more ignore filter can be applied to silent matches + output from the matched groups + * Optionally, one or more ignore expressions can be applied to silent matches that satisfy a truthy-comparison -## Decomposing a Filter +## Matcher Types + +If no matcher is specified, by default, the entire line is always matched +and passed-through to the expression-stage. + +More than one matcher can **not** be specified at the same time. + +### Regex + +A regex express is specified with `--match` or `-m`, and follows common +[regex syntax](regexp.md). + +When matching a regex, groups and keys are extracted both index and +by-name if specified. + +Set ignore-case with `-I` or `--ignore-case`. + +**Example:** + +```bash +rare filter -m '"(\w{3,4}) ([A-Za-z0-9/.@_-]+)' access.log +``` + +### Dissect + +A dissect expression is specified with `--dissect` or `-d`, and follows +[dissect syntax](dissect.md). + +Like regex, groups are extracted by both index and name. + +Set ignore-case with `-I` or `--ignore-case`. + +**Example:** + +```bash +rare filter -d 'HTTP/1.1" %{code} %{size}' -e '{code}' access.log +``` + +## Examples + +### Decomposing a Matcher The most primitive way use rare is to filter lines in an input. We'll be using an example nginx log for our example. 
@@ -34,7 +74,7 @@ If you want it to only output the matched portion, you can add `-e "{0}"` Lastly, lets say we want to ignore all paths that equal "/", we could do that by adding an ignore pattern: `-i {eq {1} /}` -## Histograms +### Histograms Histograms are like filters, but rather than outputting every match, it will create an aggregated count based on the extracted expression. @@ -48,4 +88,5 @@ rare histogram -m '"(\w{3,4}) ([A-Za-z0-9/.@_-]+)' -e '{1} {2}' -b access.log ## See Also -* [Regular Expressions](regexp.md) \ No newline at end of file +* [Regular Expressions](regexp.md) +* [Examples](examples.md) \ No newline at end of file diff --git a/docs/usage/overview.md b/docs/usage/overview.md index 7963c9f1..bca986ca 100644 --- a/docs/usage/overview.md +++ b/docs/usage/overview.md @@ -23,11 +23,11 @@ Read more at: ## Extraction (Matching) -Extraction is denoted with `-m` (match) and is the process of reading a line in -a file or set of files and parsing it with a regular expression into the -match-groups denoted by the regex. +Extraction is denoted with `-m` (regex) or `-d` (dissect) and is the process of reading +a line in a file or set of files and parsing it with a regular expression into the +match-groups denoted by the matcher. -If the regex doesn't match, the line is discarded (a non-match) +If the expression doesn't match, the line is discarded (a non-match) These match groups are then fed into the next stage, the expression. 
@@ -62,6 +62,7 @@ Aggregator types: * `histogram` will count instances of the extracted key * `table` will count the key in 2 dimensions * `heatmap` will generate a 2D visualization using colored blocks to denote value +* `sparkline` will generate a 2D visualization with the results being a sparkline * `bargraph` will create either a stacked or non-stacked bargraph based on 2 dimensions * `analyze` will use the key as a numeric value and compute mean/median/mode/stddev/percentiles * `reduce` allows evaluating data using expressions, and grouping/sorting the output diff --git a/mkdocs.yml b/mkdocs.yml index a3e6c422..e76e5fb1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -19,6 +19,7 @@ nav: - JSON: usage/json.md - Funcs File: usage/funcsfile.md - Regular Expressions: usage/regexp.md + - Dissect Expressions: usage/dissect.md - CLI Docs: cli-help.md - Benchmarks: benchmarks.md - Contributing: contributing.md From 1da5ff919cd9e89f866b3ce85cf9f3664cdca672 Mon Sep 17 00:00:00 2001 From: Christopher LaPointe Date: Fri, 3 Jan 2025 21:38:18 -0500 Subject: [PATCH 12/14] Some renaming and tests --- pkg/extractor/extractor.go | 20 ++++++++++---------- pkg/matchers/dissect/case.go | 8 ++++---- pkg/matchers/dissect/dissect_test.go | 2 +- pkg/matchers/simple_test.go | 17 +++++++++++++++++ 4 files changed, 32 insertions(+), 15 deletions(-) create mode 100644 pkg/matchers/simple_test.go diff --git a/pkg/extractor/extractor.go b/pkg/extractor/extractor.go index 4a9884df..16a09c6b 100644 --- a/pkg/extractor/extractor.go +++ b/pkg/extractor/extractor.go @@ -23,7 +23,7 @@ type InputBatch struct { type Match struct { bLine BString // Keep the pointer around next to line Line string // Unsafe pointer to bLine (no-copy) - Indices []int // match indices as returned by regexp + Indices []int // match indices as returned by matcher Extracted string // The extracted expression LineNumber uint64 // Line number Source string // Source name @@ -32,8 +32,8 @@ type Match struct { // Config for the 
extractor type Config struct { Matcher matchers.Factory // Matcher - Extract string // Extract these values from regex (expression) - Workers int // Workers to parse regex + Extract string // Extract these values from matcher (expression) + Workers int // Workers to parse matcher Ignore IgnoreSet // Ignore these truthy expressions } @@ -42,7 +42,7 @@ type Config struct { // Expects someone to consume its ReadChan() type Extractor struct { readChan chan []Match - compiledRegexp matchers.Factory + matcherFactory matchers.Factory readLines uint64 matchedLines uint64 ignoredLines uint64 @@ -53,7 +53,7 @@ type Extractor struct { type extractorInstance struct { *Extractor - re matchers.Matcher + matcher matchers.Matcher context *SliceSpaceExpressionContext } @@ -76,7 +76,7 @@ func (s *Extractor) ReadChan() <-chan []Match { // async safe func (s *extractorInstance) processLineSync(source string, lineNum uint64, line BString) (Match, bool) { atomic.AddUint64(&s.readLines, 1) - matches := s.re.FindSubmatchIndex(line) + matches := s.matcher.FindSubmatchIndex(line) // Extract and forward to the ReadChan if there are matches if len(matches) > 0 { @@ -118,12 +118,12 @@ func (s *extractorInstance) processLineSync(source string, lineNum uint64, line func (s *Extractor) asyncWorker(wg *sync.WaitGroup, inputBatch <-chan InputBatch) { defer wg.Done() - re := s.compiledRegexp.CreateInstance() + matcher := s.matcherFactory.CreateInstance() si := extractorInstance{ Extractor: s, - re: re, + matcher: matcher, context: &SliceSpaceExpressionContext{ - nameTable: re.SubexpNameTable(), + nameTable: matcher.SubexpNameTable(), }, } @@ -158,7 +158,7 @@ func New(inputBatch <-chan InputBatch, config *Config) (*Extractor, error) { extractor := Extractor{ readChan: make(chan []Match, 5), - compiledRegexp: config.Matcher, + matcherFactory: config.Matcher, keyBuilder: compiledExpression, config: *config, ignore: config.Ignore, diff --git a/pkg/matchers/dissect/case.go b/pkg/matchers/dissect/case.go 
index cd2befab..9ccd868f 100644 --- a/pkg/matchers/dissect/case.go +++ b/pkg/matchers/dissect/case.go @@ -4,8 +4,8 @@ import "unicode" // Finds case-insensitive index of second string // ASSUMES second string is already lowered (optimization) -func indexIgnoreCase(s, substr string) int { - n := len(substr) +func indexIgnoreCase(s, loweredSubstr string) int { + n := len(loweredSubstr) switch { case n == 0: return 0 @@ -13,7 +13,7 @@ func indexIgnoreCase(s, substr string) int { return -1 case len(s) == n: for i := 0; i < n; i++ { - if unicode.ToLower(rune(s[i])) != rune(substr[i]) { + if unicode.ToLower(rune(s[i])) != rune(loweredSubstr[i]) { return -1 } } @@ -22,7 +22,7 @@ func indexIgnoreCase(s, substr string) int { for i := 0; i <= len(s)-n; i++ { match := true for j := 0; j < n; j++ { - if unicode.ToLower(rune(s[i+j])) != rune(substr[j]) { + if unicode.ToLower(rune(s[i+j])) != rune(loweredSubstr[j]) { match = false break } diff --git a/pkg/matchers/dissect/dissect_test.go b/pkg/matchers/dissect/dissect_test.go index 6ac1f3bd..67d7aed4 100644 --- a/pkg/matchers/dissect/dissect_test.go +++ b/pkg/matchers/dissect/dissect_test.go @@ -106,7 +106,7 @@ func TestIgnoreCase(t *testing.T) { // BenchmarkDissect-4 13347456 86.07 ns/op 32 B/op 0 allocs/op func BenchmarkDissect(b *testing.B) { - d, _ := CompileEx("t%{val} ", true) + d, _ := CompileEx("t%{val} ", false) di := d.CreateInstance() val := []byte("this is a test ") diff --git a/pkg/matchers/simple_test.go b/pkg/matchers/simple_test.go new file mode 100644 index 00000000..5d2b1469 --- /dev/null +++ b/pkg/matchers/simple_test.go @@ -0,0 +1,17 @@ +package matchers + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestSimpleMatcherAndFactory(t *testing.T) { + matcher := ToFactory(&AlwaysMatch{}) // ToFactory isn't necessary, but will exercise the path + inst := matcher.CreateInstance() + + assert.Empty(t, inst.SubexpNameTable()) + + assert.Equal(t, []int{0, 0}, inst.FindSubmatchIndex([]byte{})) 
+ assert.Equal(t, []int{0, 2}, inst.FindSubmatchIndex([]byte("hi"))) +} From 42ae5ab398d48d3cd02aaf7f8cb31b46fad6a4f7 Mon Sep 17 00:00:00 2001 From: Christopher LaPointe Date: Fri, 3 Jan 2025 21:42:55 -0500 Subject: [PATCH 13/14] Clarify index match --- docs/usage/dissect.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/usage/dissect.md b/docs/usage/dissect.md index a8a40137..919c647d 100644 --- a/docs/usage/dissect.md +++ b/docs/usage/dissect.md @@ -34,7 +34,14 @@ Will match: prefix bob : 123 ``` -And will extract two keys: +And extract 3 index-keys: +``` +0: prefix bob : 123 +1: bob +2: 123 +``` + +And will extract two named keys: ``` name=bob value=123 From 6243c3ef1aa763177a63fc3ced5cd8669d0201e2 Mon Sep 17 00:00:00 2001 From: Christopher LaPointe Date: Fri, 3 Jan 2025 21:44:17 -0500 Subject: [PATCH 14/14] Fix typo --- docs/usage/extractor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage/extractor.md b/docs/usage/extractor.md index 8dcc2524..b845d673 100644 --- a/docs/usage/extractor.md +++ b/docs/usage/extractor.md @@ -19,7 +19,7 @@ More than one matcher can **not** be specified at the same time. ### Regex -A regex express is specified with `--match` or `-m`, and follows common +A regex expression is specified with `--match` or `-m`, and follows common [regex syntax](regexp.md). When matching a regex, groups and keys are extracted both index and