2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -23,7 +23,7 @@ jobs:
- name: Test
run: |
go test -v -race -coverprofile=coverage.txt -covermode=atomic ./...
go test -tags=pcre2 rare/pkg/fastregex
go test -tags=pcre2 rare/pkg/matchers/fastregex
- name: StaticCheck
run: |
go run honnef.co/go/tools/cmd/[email protected] ./...
1 change: 1 addition & 0 deletions README.md
@@ -20,6 +20,7 @@ See [rare.zdyn.net](https://rare.zdyn.net) or the [docs/ folder](docs/) for the
## Features

* Multiple summary formats including: filter (like grep), histogram, bar graphs, tables, heatmaps, reduce, and numerical analysis
* Parse using regex (`-m`) or dissect tokenizer (`-d`)
* File glob expansions (eg `/var/log/*` or `/var/log/*/*.log`) and `-R`
* Optional gzip decompression (with `-z`)
* Following `-f` or re-open following `-F` (use `--poll` to poll, and `--tail` to tail)
54 changes: 48 additions & 6 deletions cmd/helpers/extractorBuilder.go
@@ -1,12 +1,16 @@
package helpers

import (
"errors"
"os"
"rare/pkg/expressions"
"rare/pkg/extractor"
"rare/pkg/extractor/batchers"
"rare/pkg/extractor/dirwalk"
"rare/pkg/logger"
"rare/pkg/matchers"
"rare/pkg/matchers/dissect"
"rare/pkg/matchers/fastregex"
"runtime"
"strings"

@@ -74,15 +78,15 @@ func BuildExtractorFromArguments(c *cli.Context, batcher *batchers.Batcher) *ext

func BuildExtractorFromArgumentsEx(c *cli.Context, batcher *batchers.Batcher, sep string) *extractor.Extractor {
config := extractor.Config{
Posix: c.Bool("posix"),
Regex: c.String("match"),
Extract: strings.Join(c.StringSlice("extract"), sep),
Workers: c.Int("workers"),
}

if c.Bool("ignore-case") {
config.Regex = "(?i)" + config.Regex
matcher, err := BuildMatcherFromArguments(c)
if err != nil {
logger.Fatalln(ExitCodeInvalidUsage, err)
}
config.Matcher = matcher

ignoreSlice := c.StringSlice("ignore")
if len(ignoreSlice) > 0 {
@@ -100,6 +104,38 @@ func BuildExtractorFromArgumentsEx(c *cli.Context, batcher *batchers.Batcher, se
return ret
}

func BuildMatcherFromArguments(c *cli.Context) (matchers.Factory, error) {
var (
matchExpr = c.String("match")
dissectExpr = c.String("dissect")
posix = c.Bool("posix")
ignoreCase = c.Bool("ignore-case")
)

switch {
case c.IsSet("match") && c.IsSet("dissect"):
return nil, errors.New("match and dissect conflict")
case c.IsSet("dissect"):
d, err := dissect.CompileEx(dissectExpr, ignoreCase)
if err != nil {
return nil, err
}
return matchers.ToFactory(d), nil
case c.IsSet("match"):
if ignoreCase {
matchExpr = "(?i)" + matchExpr
}

r, err := fastregex.CompileEx(matchExpr, posix)
if err != nil {
return nil, err
}
return matchers.ToFactory(r), nil
default:
return &matchers.AlwaysMatch{}, nil
}
}

func getExtractorFlags() []cli.Flag {
workerCount := runtime.NumCPU()/2 + 1

@@ -146,12 +182,18 @@ func getExtractorFlags() []cli.Flag {
Usage: "Compile regex as against posix standard",
},
&cli.StringFlag{
Name: "match,m",
Name: "match",
Aliases: []string{"m"},
Category: cliCategoryMatching,
Usage: "Regex to create match groups to summarize on",
Value: ".*",
},
&cli.StringFlag{
Name: "dissect",
Aliases: []string{"d"},
Category: cliCategoryMatching,
Usage: "Dissect expression create match groups to summarize on",
},
&cli.StringSliceFlag{
Name: "extract",
Aliases: []string{"e"},
Expand All @@ -169,7 +211,7 @@ func getExtractorFlags() []cli.Flag {
Name: "ignore-case",
Aliases: []string{"I"},
Category: cliCategoryMatching,
Usage: "Augment regex to be case insensitive",
Usage: "Augment matcher to be case insensitive",
},
&cli.IntFlag{
Name: "batch",
12 changes: 11 additions & 1 deletion cmd/helpers/extractorBuilder_test.go
@@ -56,6 +56,10 @@ func TestBuildingExtractorFromContext(t *testing.T) {
assert.NoError(t, runApp(""))
assert.NoError(t, runApp(`-I -i "{eq {0} abc}" ../testdata/log.txt`))
assert.NoError(t, runApp(`-f ../testdata/log.txt`))
assert.NoError(t, runApp(`-m ".*" ../testdata/log.txt`))
assert.NoError(t, runApp(`-I -m ".*" ../testdata/log.txt`))
assert.NoError(t, runApp(`-d "%{}" ../testdata/log.txt`))
assert.NoError(t, runApp(`-I -d "%{}" ../testdata/log.txt`))
testLogFatal(t, 2, func() {
runApp("--batch 0 ../testdata/log.txt")
})
@@ -77,5 +81,11 @@ func TestBuildingExtractorFromContext(t *testing.T) {
testLogFatal(t, 2, func() {
runApp(`-i "{0" -`)
})
assert.Equal(t, 3, actionCalled)
testLogFatal(t, 2, func() {
runApp(`-m regex -d dissect -`)
})
testLogFatal(t, 2, func() {
runApp(`-d "%{unclosed" -`)
})
assert.Equal(t, 7, actionCalled)
}
4 changes: 3 additions & 1 deletion cmd/helpers/updatingAggregator_test.go
@@ -4,6 +4,8 @@ import (
"io"
"rare/pkg/extractor"
"rare/pkg/extractor/batchers"
"rare/pkg/matchers"
"rare/pkg/matchers/fastregex"
"strings"
"testing"

@@ -31,7 +33,7 @@ func TestAggregationLoop(t *testing.T) {
// Build a real extractor
batcher := batchers.OpenReaderToChan("test", io.NopCloser(strings.NewReader(testData)), 1, 1)
ex, err := extractor.New(batcher.BatchChan(), &extractor.Config{
Regex: `(\d+)`,
Matcher: matchers.ToFactory(fastregex.MustCompile(`(\d+)`)),
Extract: "val:{1}",
Workers: 1,
})
32 changes: 24 additions & 8 deletions docs/cli-help.md
@@ -67,6 +67,8 @@ Filter incoming results with search criteria, and output raw matches

**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 6)

**--dissect, -d**="": Dissect expression to create match groups to summarize on

**--extract, -e**="": Expression that will generate the key to group by. Specify multiple times for multi-dimensions or use {$} helper (default: [{0}])

**--follow, -f**: Read appended data as file grows
@@ -75,7 +77,7 @@ Filter incoming results with search criteria, and output raw matches

**--ignore, -i**="": Ignore a match given a truthy expression (Can have multiple)

**--ignore-case, -I**: Augment regex to be case insensitive
**--ignore-case, -I**: Augment matcher to be case insensitive

**--line, -l**: Output source file and line number

@@ -113,6 +115,8 @@ Summarize results by extracting them to a histogram

**--csv, -o**="": Write final results to csv. Use - to output to stdout

**--dissect, -d**="": Dissect expression to create match groups to summarize on

**--extra, -x**: Alias for -b --percentage

**--extract, -e**="": Expression that will generate the key to group by. Specify multiple times for multi-dimensions or use {$} helper (default: [{0}])
@@ -123,7 +127,7 @@ Summarize results by extracting them to a histogram

**--ignore, -i**="": Ignore a match given a truthy expression (Can have multiple)

**--ignore-case, -I**: Augment regex to be case insensitive
**--ignore-case, -I**: Augment matcher to be case insensitive

**--match, -m**="": Regex to create match groups to summarize on (default: .*)

@@ -167,6 +171,8 @@ Create a 2D heatmap of extracted data

**--delim**="": Character to tabulate on. Use {$} helper by default (default: \x00)

**--dissect, -d**="": Dissect expression to create match groups to summarize on

**--extract, -e**="": Expression that will generate the key to group by. Specify multiple times for multi-dimensions or use {$} helper (default: [{0}])

**--follow, -f**: Read appended data as file grows
@@ -175,7 +181,7 @@ Create a 2D heatmap of extracted data

**--ignore, -i**="": Ignore a match given a truthy expression (Can have multiple)

**--ignore-case, -I**: Augment regex to be case insensitive
**--ignore-case, -I**: Augment matcher to be case insensitive

**--match, -m**="": Regex to create match groups to summarize on (default: .*)

@@ -223,6 +229,8 @@ Create rows of sparkline graphs

**--delim**="": Character to tabulate on. Use {$} helper by default (default: \x00)

**--dissect, -d**="": Dissect expression to create match groups to summarize on

**--extract, -e**="": Expression that will generate the key to group by. Specify multiple times for multi-dimensions or use {$} helper (default: [{0}])

**--follow, -f**: Read appended data as file grows
@@ -231,7 +239,7 @@ Create rows of sparkline graphs

**--ignore, -i**="": Ignore a match given a truthy expression (Can have multiple)

**--ignore-case, -I**: Augment regex to be case insensitive
**--ignore-case, -I**: Augment matcher to be case insensitive

**--match, -m**="": Regex to create match groups to summarize on (default: .*)

@@ -273,6 +281,8 @@ Create a bargraph of the given 1 or 2 dimension data

**--csv, -o**="": Write final results to csv. Use - to output to stdout

**--dissect, -d**="": Dissect expression to create match groups to summarize on

**--extract, -e**="": Expression that will generate the key to group by. Specify multiple times for multi-dimensions or use {$} helper (default: [{0}])

**--follow, -f**: Read appended data as file grows
@@ -281,7 +291,7 @@ Create a bargraph of the given 1 or 2 dimension data

**--ignore, -i**="": Ignore a match given a truthy expression (Can have multiple)

**--ignore-case, -I**: Augment regex to be case insensitive
**--ignore-case, -I**: Augment matcher to be case insensitive

**--match, -m**="": Regex to create match groups to summarize on (default: .*)

@@ -317,6 +327,8 @@ Numerical analysis on a set of filtered data

**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 6)

**--dissect, -d**="": Dissect expression to create match groups to summarize on

**--extra, -x**: Displays extra analysis on the data (Requires more memory and cpu)

**--extract, -e**="": Expression that will generate the key to group by. Specify multiple times for multi-dimensions or use {$} helper (default: [{0}])
@@ -327,7 +339,7 @@ Numerical analysis on a set of filtered data

**--ignore, -i**="": Ignore a match given a truthy expression (Can have multiple)

**--ignore-case, -I**: Augment regex to be case insensitive
**--ignore-case, -I**: Augment matcher to be case insensitive

**--match, -m**="": Regex to create match groups to summarize on (default: .*)

@@ -367,6 +379,8 @@ Create a 2D summarizing table of extracted data

**--delim**="": Character to tabulate on. Use {$} helper by default (default: \x00)

**--dissect, -d**="": Dissect expression to create match groups to summarize on

**--extra, -x**: Display row and column totals

**--extract, -e**="": Expression that will generate the key to group by. Specify multiple times for multi-dimensions or use {$} helper (default: [{0}])
@@ -377,7 +391,7 @@ Create a 2D summarizing table of extracted data

**--ignore, -i**="": Ignore a match given a truthy expression (Can have multiple)

**--ignore-case, -I**: Augment regex to be case insensitive
**--ignore-case, -I**: Augment matcher to be case insensitive

**--match, -m**="": Regex to create match groups to summarize on (default: .*)

@@ -421,6 +435,8 @@ Aggregate the results of a query based on an expression, pulling customized summ

**--csv, -o**="": Write final results to csv. Use - to output to stdout

**--dissect, -d**="": Dissect expression to create match groups to summarize on

**--extract, -e**="": Expression that will generate the key to group by. Specify multiple times for multi-dimensions or use {$} helper (default: [{@}])

**--follow, -f**: Read appended data as file grows
@@ -431,7 +447,7 @@ Aggregate the results of a query based on an expression, pulling customized summ

**--ignore, -i**="": Ignore a match given a truthy expression (Can have multiple)

**--ignore-case, -I**: Augment regex to be case insensitive
**--ignore-case, -I**: Augment matcher to be case insensitive

**--initial**="": Specify the default initial value for any accumulators that don't specify (default: 0)

1 change: 1 addition & 0 deletions docs/index.md
Expand Up @@ -18,6 +18,7 @@ Supports various CLI-based graphing and metric formats (filter (grep-like), hist
## Features

* Multiple summary formats including: filter (like grep), histogram, bar graphs, tables, heatmaps, sparklines, reduce, and numerical analysis
* Parse using regex (`-m`) or dissect tokenizer (`-d`)
* File glob expansions (eg `/var/log/*` or `/var/log/*/*.log`) and `-R`
* Optional gzip decompression (with `-z`)
* Following `-f` or re-open following `-F` (use `--poll` to poll, and `--tail` to tail)
75 changes: 75 additions & 0 deletions docs/usage/dissect.md
@@ -0,0 +1,75 @@
# Dissect Syntax

*Dissect* is a simple token-based search algorithm that can
be up to 10x faster than regex (and 40% faster than PCRE).

It works by searching for constant delimiters in a string
and extracting the text between them as named keys.

*rare* implements a subset of the full dissect algorithm.

**Syntax Example:**
```
prefix %{name} : %{value} - %{?ignored}
```
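
To make the matching strategy concrete, here is a minimal, self-contained Go sketch of the idea. It is not rare's actual implementation (which also records indexed keys such as `{0}` for the full match and handles more edge cases); it only splits the pattern into literal delimiters and `%{}` captures, then walks the input locating each delimiter in turn.

```go
package main

import (
	"fmt"
	"strings"
)

// token is either a literal delimiter or a %{...} capture.
type token struct {
	literal string // non-empty for a constant delimiter
	name    string // capture name; "" or a leading '?' means "consume but don't keep"
}

// compile splits a pattern such as "prefix %{name} : %{value}"
// into alternating literal and capture tokens.
func compile(pattern string) []token {
	var toks []token
	for len(pattern) > 0 {
		start := strings.Index(pattern, "%{")
		if start < 0 {
			toks = append(toks, token{literal: pattern})
			break
		}
		if start > 0 {
			toks = append(toks, token{literal: pattern[:start]})
		}
		end := strings.Index(pattern[start:], "}")
		if end < 0 {
			// unclosed token: treat the rest of the pattern as a literal
			toks = append(toks, token{literal: pattern[start:]})
			break
		}
		toks = append(toks, token{name: pattern[start+2 : start+end]})
		pattern = pattern[start+end+1:]
	}
	return toks
}

// match walks the line, finding each literal delimiter in order and
// capturing the text between delimiters as the named keys.
func match(toks []token, line string) (map[string]string, bool) {
	keys := map[string]string{}
	pos := 0
	for i, t := range toks {
		if t.literal != "" {
			idx := strings.Index(line[pos:], t.literal)
			if idx < 0 {
				return nil, false
			}
			pos += idx + len(t.literal)
			continue
		}
		// a capture runs until the next literal delimiter (or end of line)
		end := len(line)
		if i+1 < len(toks) && toks[i+1].literal != "" {
			idx := strings.Index(line[pos:], toks[i+1].literal)
			if idx < 0 {
				return nil, false
			}
			end = pos + idx
		}
		if t.name != "" && !strings.HasPrefix(t.name, "?") {
			keys[t.name] = line[pos:end]
		}
		pos = end
	}
	return keys, true
}

func main() {
	toks := compile("prefix %{name} : %{value}")
	keys, ok := match(toks, "prefix bob : 123")
	fmt.Println(ok, keys) // true map[name:bob value:123]
}
```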

## Syntax

- Anything in a `%{}` is a variable token.
- A blank token, or a token that starts with `?`, is skipped, eg `%{}` or `%{?skipped}`
- Tokens are extracted by both name and index (in the order they appear).
- Index `{0}` is the full match, including the delimiters
- Patterns don't need to match the entire line

## Examples

### Simple

```
prefix %{name} : %{value}
```

Will match:
```
prefix bob : 123
```

And will extract three indexed keys:
```
0: prefix bob : 123
1: bob
2: 123
```

And will extract two named keys:
```
name=bob
value=123
```

### Nginx Logs

As a simple example, to parse nginx logs that look like:

```
104.238.185.46 - - [19/Aug/2019:02:26:25 +0000] "GET / HTTP/1.1" 200 546 "-" "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/98 Safari/537.4 (StatusCake)"
```

The following dissect expression can be used:

```
%{ip} - - [%{timestamp}] "%{verb} %{path} HTTP/%{?http-version}" %{status} %{size} "-" "%{useragent}"
```

Which, as JSON, will return:
```json
{
  "ip": "104.238.185.46",
  "timestamp": "19/Aug/2019:02:26:25 +0000",
  "verb": "GET",
  "path": "/",
  "status": 200,
  "size": 546,
  "useragent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/98 Safari/537.4 (StatusCake)"
}
```
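
For reference, here is a rough sketch of compiling a dissect expression programmatically, assuming only the `dissect.CompileEx` and `matchers.ToFactory` signatures that appear in the `cmd/helpers/extractorBuilder.go` change above; these are internal packages, not a documented public API.

```go
package main

import (
	"fmt"

	"rare/pkg/matchers"
	"rare/pkg/matchers/dissect"
)

func main() {
	// Compile the nginx expression from the example above
	// (the second argument toggles case-insensitive matching, as with -I).
	expr := `%{ip} - - [%{timestamp}] "%{verb} %{path} HTTP/%{?http-version}" %{status} %{size} "-" "%{useragent}"`
	d, err := dissect.CompileEx(expr, false)
	if err != nil {
		panic(err)
	}

	// Wrap it in a matcher factory, as BuildMatcherFromArguments does for -d.
	var factory matchers.Factory = matchers.ToFactory(d)
	fmt.Printf("compiled dissect matcher factory: %T\n", factory)
}
```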