45 changes: 32 additions & 13 deletions cmd/helpers/extractorBuilder.go
@@ -2,6 +2,7 @@ package helpers

import (
"errors"
"fmt"
"os"
"runtime"
"slices"
@@ -42,6 +43,7 @@ func BuildBatcherFromArgumentsEx(c *cli.Context, fileglobs ...string) (*batchers
followReopen = c.Bool("reopen")
followPoll = c.Bool("poll")
concurrentReaders = c.Int("readers")
readersBuffer = c.Int("readers-buffer")
gunzip = c.Bool("gunzip")
batchSize = c.Int("batch")
batchBuffer = c.Int("batch-buffer")
@@ -53,6 +55,9 @@ func BuildBatcherFromArgumentsEx(c *cli.Context, fileglobs ...string) (*batchers
if concurrentReaders < 1 {
logger.Fatalf(ExitCodeInvalidUsage, "Must have at least 1 reader")
}
if readersBuffer < 1 {
logger.Fatalf(ExitCodeInvalidUsage, "Reader buffer must be at least 1 byte")
}
if followPoll && !follow {
logger.Fatalf(ExitCodeInvalidUsage, "Follow (-f) must be enabled for --poll")
}
@@ -67,16 +72,16 @@ func BuildBatcherFromArgumentsEx(c *cli.Context, fileglobs ...string) (*batchers
if follow {
logger.Println("Cannot follow a stdin stream, not a file")
}
return batchers.OpenReaderToChan("<stdin>", os.Stdin, batchSize, batchBuffer), nil
return batchers.OpenReaderToChan("<stdin>", os.Stdin, batchSize, batchBuffer, readersBuffer), nil
} else if follow { // Read from source file
if gunzip {
logger.Println("Cannot combine -f and -z")
}
walker := BuildPathWalkerFromArguments(c)
return batchers.TailFilesToChan(walker.Walk(fileglobs...), batchSize, batchBuffer, followReopen, followPoll, followTail), walker
return batchers.TailFilesToChan(walker.Walk(fileglobs...), batchSize, batchBuffer, readersBuffer, followReopen, followPoll, followTail), walker
} else { // Read (no-follow) source file(s)
walker := BuildPathWalkerFromArguments(c)
return batchers.OpenFilesToChan(walker.Walk(fileglobs...), gunzip, concurrentReaders, batchSize, batchBuffer), walker
return batchers.OpenFilesToChan(walker.Walk(fileglobs...), gunzip, concurrentReaders, batchSize, batchBuffer, readersBuffer), walker
}
}
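
Every branch above now threads `readersBuffer` into the batchers constructors. The batchers package itself isn't part of this diff, so the following is only a minimal sketch of the idea under assumed names and signatures (`readToBatches` is not the real API): each input gets a fixed-size read buffer and its lines are emitted as bounded batches on a buffered channel.

```go
package main

import (
	"bufio"
	"fmt"
	"io"
	"strings"
)

// readToBatches scans r line-by-line through a readersBuffer-sized buffer and
// emits slices of at most batchSize lines; the channel holds up to batchBuffer
// batches of read-ahead.
func readToBatches(r io.Reader, batchSize, batchBuffer, readersBuffer int) <-chan []string {
	out := make(chan []string, batchBuffer)
	go func() {
		defer close(out)
		scanner := bufio.NewScanner(bufio.NewReaderSize(r, readersBuffer))
		batch := make([]string, 0, batchSize)
		for scanner.Scan() {
			batch = append(batch, scanner.Text())
			if len(batch) == batchSize {
				out <- batch
				batch = make([]string, 0, batchSize)
			}
		}
		if len(batch) > 0 {
			out <- batch
		}
	}()
	return out
}

func main() {
	src := strings.NewReader("a\nb\nc\nd\ne\n")
	for batch := range readToBatches(src, 2, 4, 512*1024) {
		fmt.Println(batch)
	}
}
```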

@@ -258,26 +263,40 @@ func getExtractorFlags() []cli.Flag {
Category: cliCategoryTweaking,
Usage: "Specifies io batching size. Set to 1 for immediate input",
Value: 1000,
EnvVars: []string{"RARE_BATCH"},
},
&cli.IntFlag{
Name: "batch-buffer",
Category: cliCategoryTweaking,
Usage: "Specifies how many batches to read-ahead. Impacts memory usage, can improve performance",
Value: workerCount * 2, // Keep 2 batches ready for each worker
Name: "batch-buffer",
Category: cliCategoryTweaking,
Usage: "Specifies how many batches to read-ahead. Impacts memory usage, can improve performance",
Value: workerCount * 2, // Keep 2 batches ready for each worker
DefaultText: fmt.Sprintf("workers*2 = %d", workerCount*2),
EnvVars: []string{"RARE_BATCH_BUFFER"},
},
&cli.IntFlag{
Name: "workers",
Aliases: []string{"w"},
Category: cliCategoryTweaking,
Usage: "Set number of data processors",
Value: workerCount,
Name: "workers",
Aliases: []string{"w"},
Category: cliCategoryTweaking,
Usage: "Set number of data processors",
Value: workerCount,
DefaultText: fmt.Sprintf("NumCPU/2+1 = %d", workerCount),
EnvVars: []string{"RARE_WORKERS"},
},
&cli.IntFlag{
Name: "readers",
Aliases: []string{"wr"},
Category: cliCategoryTweaking,
Usage: "Sets the number of concurrent readers (Infinite when -f)",
Value: 3,
Value: workerCount,
EnvVars: []string{"RARE_READERS"},
},
&cli.IntFlag{
Name: "readers-buffer",
Aliases: []string{"wrb"},
Category: cliCategoryTweaking,
Usage: "How many bytes will be buffered per reader",
Value: 512 * 1024,
EnvVars: []string{"RARE_READERS_BUFFER"},
},
}
}
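
The flag definitions above lean on the CLI library's `Value`, `DefaultText`, and `EnvVars` fields. Below is a self-contained sketch of how those interact, assuming `github.com/urfave/cli/v2` (which these structs resemble): the environment variable overrides the compiled-in default, while `DefaultText` keeps the generated `--help` output readable.

```go
package main

import (
	"fmt"
	"log"
	"os"

	"github.com/urfave/cli/v2"
)

func main() {
	app := &cli.App{
		Flags: []cli.Flag{
			&cli.IntFlag{
				Name:        "readers-buffer",
				Usage:       "How many bytes will be buffered per reader",
				Value:       512 * 1024, // compiled-in default
				DefaultText: "512KiB",   // what --help prints instead of 524288
				EnvVars:     []string{"RARE_READERS_BUFFER"},
			},
		},
		Action: func(c *cli.Context) error {
			// RARE_READERS_BUFFER=1048576 ./app  ->  readers-buffer = 1048576
			fmt.Println("readers-buffer =", c.Int("readers-buffer"))
			return nil
		},
	}
	if err := app.Run(os.Args); err != nil {
		log.Fatal(err)
	}
}
```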
3 changes: 3 additions & 0 deletions cmd/helpers/extractorBuilder_test.go
@@ -88,6 +88,9 @@ func TestBuildingExtractorFromContext(t *testing.T) {
testLogFatal(t, 2, func() {
runApp("--readers 0 ../testdata/log.txt")
})
testLogFatal(t, 2, func() {
runApp("--readers-buffer 0 ../testdata/log.txt")
})
testLogFatal(t, 2, func() {
runApp("--poll ../testdata/log.txt")
})
2 changes: 1 addition & 1 deletion cmd/helpers/updatingAggregator_test.go
@@ -33,7 +33,7 @@ func (s *VirtualAggregator) ParseErrors() uint64 {

func TestAggregationLoop(t *testing.T) {
// Build a real extractor
batcher := batchers.OpenReaderToChan("test", io.NopCloser(strings.NewReader(testData)), 1, 1)
batcher := batchers.OpenReaderToChan("test", io.NopCloser(strings.NewReader(testData)), 1, 1, 1024)
ex, err := extractor.New(batcher.BatchChan(), &extractor.Config{
Matcher: matchers.ToFactory(fastregex.MustCompile(`(\d+)`)),
Extract: "val:{1}",
123 changes: 62 additions & 61 deletions docs/benchmarks.md
@@ -6,61 +6,62 @@ It's worth noting that in many of these results rare is just as fast, but part
of that reason is that it consumes CPU in a more efficient way (go is great at parallelization).
So take that into account, for better or worse.

All tests were done on ~83MB of gzip'd (1.5GB gunzip'd) nginx logs spread across 10 files. They
were run on a spinning disk on an older machine. New machines run significantly faster.
All tests were done on ~824 MB of gzip'd (13.93 GB gunzip'd) nginx logs spread across 8 files. They
were run on an NVMe SSD on a recent (2025) machine.

Each program was run 3 times and the timing of the last run was taken (to make sure things were cached equally).


## rare

At no point scanning the data does `rare` exceed ~4MB of resident memory.
At no point while scanning the data does `rare` exceed ~42 MB of resident memory. Buffer sizes can be tweaked
to adjust memory usage.
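
As a rough back-of-envelope only (assuming one `readers-buffer`-sized buffer per reader and `batch-buffer` queued batches of `batch` lines, with an assumed average line length — the real footprint depends on the batchers implementation), the defaults in this PR put read-ahead memory in the tens of megabytes:

```go
package main

import (
	"fmt"
	"runtime"
)

func main() {
	workers := runtime.NumCPU()/2 + 1 // default --workers
	readers := workers                // default --readers (now matches workers)
	readersBuffer := 512 * 1024       // default --readers-buffer, in bytes
	batchSize := 1000                 // default --batch, in lines
	batchBuffer := workers * 2        // default --batch-buffer, in batches
	avgLine := 200                    // assumed average log-line length, in bytes

	readerBytes := readers * readersBuffer
	batchBytes := batchBuffer * batchSize * avgLine
	fmt.Printf("reader buffers: ~%.1f MiB\n", float64(readerBytes)/(1<<20))
	fmt.Printf("queued batches: ~%.1f MiB (at ~%d B/line)\n", float64(batchBytes)/(1<<20), avgLine)
}
```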

```bash
$ rare -v
rare version 0.4.3, e0fc395; regex: re2

$ time rare filter -m '" (\d{3})' -e "{1}" -z testdata/*.gz | wc -l
Matched: 8,373,328 / 8,373,328
8373328
Matched: 82,733,280 / 82,733,280
82733280

real 0m3.266s
user 0m10.607s
sys 0m0.769s
real 0m3.409s
user 0m32.750s
sys 0m2.175s
```

When aggregating data, `rare` is significantly faster than alternatives.

```bash
$ time rare histo -m '" (\d{3})' -e "{1}" -z testdata/*.gz
404 5,557,374
200 2,564,984
400 243,282
405 5,708
408 1,397
Matched: 8,373,328 / 8,373,328 (Groups: 8)
[9/9] 1.41 GB (514.25 MB/s)

real 0m2.870s
user 0m9.606s
sys 0m0.393s
404 54,843,840
200 25,400,160
400 2,412,960
405 56,640
408 13,920
Matched: 82,733,280 / 82,733,280 (Groups: 8)
[8/8] 13.93 GB (4.27 GB/s)

real 0m3.283s
user 0m31.485s
sys 0m1.497s
```

And, as an alternative, using the *dissect* matcher instead of regex is slightly faster still (a toy sketch of the idea follows the listing below):

```bash
$ time rare histo -d '" %{CODE} ' -e '{CODE}' -z testdata/*.gz
404 5,557,374
200 2,564,984
400 243,282
405 5,708
408 1,397
Matched: 8,373,328 / 8,373,328 (Groups: 8)
[9/9] 1.41 GB (531.11 MB/s)

real 0m2.533s
user 0m7.976s
sys 0m0.350s
404 54,843,840
200 25,400,160
400 2,412,960
405 56,640
408 13,920
Matched: 82,733,280 / 82,733,280 (Groups: 8)
[8/8] 13.93 GB (5.61 GB/s)

real 0m2.546s
user 0m22.922s
sys 0m1.491s
```
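
The toy snippet below (not rare's actual dissect implementation) illustrates why that can be the case: dissect-style extraction only scans for literal delimiters, while the regex engine has to drive a full pattern match over every line.

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

var re = regexp.MustCompile(`" (\d{3})`)

// dissectCode pulls the status code out of a combined-log line by plain
// substring search: find the `" ` delimiter, then take everything up to the
// next space.
func dissectCode(line string) (string, bool) {
	i := strings.Index(line, `" `)
	if i < 0 {
		return "", false
	}
	rest := line[i+2:]
	if j := strings.IndexByte(rest, ' '); j >= 0 {
		return rest[:j], true
	}
	return rest, true
}

func main() {
	line := `10.0.0.1 - - [02/Jan/2020] "GET /index HTTP/1.1" 404 153`
	if m := re.FindStringSubmatch(line); m != nil {
		fmt.Println("regex:  ", m[1]) // 404
	}
	if code, ok := dissectCode(line); ok {
		fmt.Println("dissect:", code) // 404
	}
}
```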

### pcre2
@@ -71,47 +72,47 @@ on more complex regex's.
```bash
# Normal re2 version
$ time rare table -z -m "\[(.+?)\].*\" (\d+)" -e "{buckettime {1} year nginx}" -e "{bucket {2} 100}" testdata/*.gz
2020 2019
400 2,915,487 2,892,274
200 1,716,107 848,925
300 290 245
Matched: 8,373,328 / 8,373,328 (R: 3; C: 2)
[9/9] 1.41 GB (52.81 MB/s)
2020 2019
400 28,994,880 28,332,480
200 17,084,640 8,316,000
300 2,880 2,400
Matched: 82,733,280 / 82,733,280 (R: 3; C: 2)
[8/8] 13.93 GB (596.89 MB/s)

real 0m27.880s
user 1m28.782s
sys 0m0.824s
real 0m23.819s
user 3m52.252s
sys 0m1.625s

# libpcre2 version
$ time rare-pcre table -z -m "\[(.+?)\].*\" (\d+)" -e "{buckettime {1} year nginx}" -e "{bucket {2} 100}" testdata/*.gz
2020 2019
400 2,915,487 2,892,274
200 1,716,107 848,925
300 290 245
Matched: 8,373,328 / 8,373,328 (R: 3; C: 2)
[9/9] 1.41 GB (241.82 MB/s)

real 0m5.751s
user 0m20.173s
sys 0m0.461s
2020 2019
400 28,994,880 28,332,480
200 17,084,640 8,316,000
300 2,880 2,400
Matched: 82,733,280 / 82,733,280 (R: 3; C: 2)
[8/8] 13.93 GB (2.10 GB/s)

real 0m6.813s
user 1m15.638s
sys 0m1.985s
```


## zcat & grep

```
$ time zcat testdata/*.gz | grep -Poa '" (\d{3})' | wc -l
8373328
82733280

real 0m11.272s
user 0m16.239s
sys 0m1.989s
real 0m28.414s
user 0m35.268s
sys 0m1.865s

$ time zcat testdata/* | grep -Poa '" 200' > /dev/null
$ time zcat testdata/*.gz | grep -Poa '" 200' > /dev/null

real 0m5.416s
user 0m4.810s
sys 0m1.185s
real 0m28.616s
user 0m27.517s
sys 0m1.658s

```

@@ -126,11 +127,11 @@ the complete functionality that rare exposes.

```bash
$ time rg -z '" (\d{3})' testdata/*.gz | wc -l
8373328
82733280

real 0m3.791s
user 0m8.149s
sys 0m4.420s
real 0m7.058s
user 0m40.284s
sys 0m8.962s
```

# Other Tools