45 changes: 32 additions & 13 deletions cmd/helpers/extractorBuilder.go
@@ -2,6 +2,7 @@ package helpers

import (
"errors"
"fmt"
"os"
"runtime"
"slices"
@@ -42,6 +43,7 @@ func BuildBatcherFromArgumentsEx(c *cli.Context, fileglobs ...string) (*batchers
followReopen = c.Bool("reopen")
followPoll = c.Bool("poll")
concurrentReaders = c.Int("readers")
readersBuffer = c.Int("readers-buffer")
gunzip = c.Bool("gunzip")
batchSize = c.Int("batch")
batchBuffer = c.Int("batch-buffer")
@@ -53,6 +55,9 @@ func BuildBatcherFromArgumentsEx(c *cli.Context, fileglobs ...string) (*batchers
if concurrentReaders < 1 {
logger.Fatalf(ExitCodeInvalidUsage, "Must have at least 1 reader")
}
if readersBuffer < 1 {
logger.Fatalf(ExitCodeInvalidUsage, "Reader buffer must be at least 1 byte")
}
if followPoll && !follow {
logger.Fatalf(ExitCodeInvalidUsage, "Follow (-f) must be enabled for --poll")
}
@@ -67,16 +72,16 @@ func BuildBatcherFromArgumentsEx(c *cli.Context, fileglobs ...string) (*batchers
if follow {
logger.Println("Cannot follow a stdin stream, not a file")
}
return batchers.OpenReaderToChan("<stdin>", os.Stdin, batchSize, batchBuffer), nil
return batchers.OpenReaderToChan("<stdin>", os.Stdin, batchSize, batchBuffer, readersBuffer), nil
} else if follow { // Read from source file
if gunzip {
logger.Println("Cannot combine -f and -z")
}
walker := BuildPathWalkerFromArguments(c)
return batchers.TailFilesToChan(walker.Walk(fileglobs...), batchSize, batchBuffer, followReopen, followPoll, followTail), walker
return batchers.TailFilesToChan(walker.Walk(fileglobs...), batchSize, batchBuffer, readersBuffer, followReopen, followPoll, followTail), walker
} else { // Read (no-follow) source file(s)
walker := BuildPathWalkerFromArguments(c)
return batchers.OpenFilesToChan(walker.Walk(fileglobs...), gunzip, concurrentReaders, batchSize, batchBuffer), walker
return batchers.OpenFilesToChan(walker.Walk(fileglobs...), gunzip, concurrentReaders, batchSize, batchBuffer, readersBuffer), walker
}
}
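
Every branch above now threads `readersBuffer` into the batchers constructors. The batchers package itself isn't part of this diff, so the following is only a minimal sketch of the idea under assumed names and signatures (`readToBatches` is not the real API): each input gets a fixed-size read buffer and its lines are emitted as bounded batches on a buffered channel.

```go
package main

import (
	"bufio"
	"fmt"
	"io"
	"strings"
)

// readToBatches scans r line-by-line through a readersBuffer-sized buffer and
// emits slices of at most batchSize lines; the channel holds up to batchBuffer
// batches of read-ahead.
func readToBatches(r io.Reader, batchSize, batchBuffer, readersBuffer int) <-chan []string {
	out := make(chan []string, batchBuffer)
	go func() {
		defer close(out)
		scanner := bufio.NewScanner(bufio.NewReaderSize(r, readersBuffer))
		batch := make([]string, 0, batchSize)
		for scanner.Scan() {
			batch = append(batch, scanner.Text())
			if len(batch) == batchSize {
				out <- batch
				batch = make([]string, 0, batchSize)
			}
		}
		if len(batch) > 0 {
			out <- batch
		}
	}()
	return out
}

func main() {
	src := strings.NewReader("a\nb\nc\nd\ne\n")
	for batch := range readToBatches(src, 2, 4, 512*1024) {
		fmt.Println(batch)
	}
}
```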

@@ -258,26 +263,40 @@ func getExtractorFlags() []cli.Flag {
Category: cliCategoryTweaking,
Usage: "Specifies io batching size. Set to 1 for immediate input",
Value: 1000,
EnvVars: []string{"RARE_BATCH"},
},
&cli.IntFlag{
Name: "batch-buffer",
Category: cliCategoryTweaking,
Usage: "Specifies how many batches to read-ahead. Impacts memory usage, can improve performance",
Value: workerCount * 2, // Keep 2 batches ready for each worker
Name: "batch-buffer",
Category: cliCategoryTweaking,
Usage: "Specifies how many batches to read-ahead. Impacts memory usage, can improve performance",
Value: workerCount * 2, // Keep 2 batches ready for each worker
DefaultText: fmt.Sprintf("workers*2 = %d", workerCount*2),
EnvVars: []string{"RARE_BATCH_BUFFER"},
},
&cli.IntFlag{
Name: "workers",
Aliases: []string{"w"},
Category: cliCategoryTweaking,
Usage: "Set number of data processors",
Value: workerCount,
Name: "workers",
Aliases: []string{"w"},
Category: cliCategoryTweaking,
Usage: "Set number of data processors",
Value: workerCount,
DefaultText: fmt.Sprintf("NumCPU/2+1 = %d", workerCount),
EnvVars: []string{"RARE_WORKERS"},
},
&cli.IntFlag{
Name: "readers",
Aliases: []string{"wr"},
Category: cliCategoryTweaking,
Usage: "Sets the number of concurrent readers (Infinite when -f)",
Value: 3,
Value: workerCount,
EnvVars: []string{"RARE_READERS"},
},
&cli.IntFlag{
Name: "readers-buffer",
Aliases: []string{"wrb"},
Category: cliCategoryTweaking,
Usage: "How many bytes will be buffered per reader",
Value: 512 * 1024,
EnvVars: []string{"RARE_READERS_BUFFER"},
},
}
}
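
The flag definitions above lean on the CLI library's `Value`, `DefaultText`, and `EnvVars` fields. Below is a self-contained sketch of how those interact, assuming `github.com/urfave/cli/v2` (which these structs resemble): the environment variable overrides the compiled-in default, while `DefaultText` keeps the generated `--help` output readable.

```go
package main

import (
	"fmt"
	"log"
	"os"

	"github.com/urfave/cli/v2"
)

func main() {
	app := &cli.App{
		Flags: []cli.Flag{
			&cli.IntFlag{
				Name:        "readers-buffer",
				Usage:       "How many bytes will be buffered per reader",
				Value:       512 * 1024, // compiled-in default
				DefaultText: "512KiB",   // what --help prints instead of 524288
				EnvVars:     []string{"RARE_READERS_BUFFER"},
			},
		},
		Action: func(c *cli.Context) error {
			// RARE_READERS_BUFFER=1048576 ./app  ->  readers-buffer = 1048576
			fmt.Println("readers-buffer =", c.Int("readers-buffer"))
			return nil
		},
	}
	if err := app.Run(os.Args); err != nil {
		log.Fatal(err)
	}
}
```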
3 changes: 3 additions & 0 deletions cmd/helpers/extractorBuilder_test.go
@@ -88,6 +88,9 @@ func TestBuildingExtractorFromContext(t *testing.T) {
testLogFatal(t, 2, func() {
runApp("--readers 0 ../testdata/log.txt")
})
testLogFatal(t, 2, func() {
runApp("--readers-buffer 0 ../testdata/log.txt")
})
testLogFatal(t, 2, func() {
runApp("--poll ../testdata/log.txt")
})
2 changes: 1 addition & 1 deletion cmd/helpers/updatingAggregator_test.go
@@ -33,7 +33,7 @@ func (s *VirtualAggregator) ParseErrors() uint64 {

func TestAggregationLoop(t *testing.T) {
// Build a real extractor
batcher := batchers.OpenReaderToChan("test", io.NopCloser(strings.NewReader(testData)), 1, 1)
batcher := batchers.OpenReaderToChan("test", io.NopCloser(strings.NewReader(testData)), 1, 1, 1024)
ex, err := extractor.New(batcher.BatchChan(), &extractor.Config{
Matcher: matchers.ToFactory(fastregex.MustCompile(`(\d+)`)),
Extract: "val:{1}",
123 changes: 62 additions & 61 deletions docs/benchmarks.md
@@ -6,61 +6,62 @@ It's worth noting that in many of these results rare is just as fast, but part
of that reason is that it consumes CPU in a more efficient way (go is great at parallelization).
So take that into account, for better or worse.

All tests were done on ~83MB of gzip'd (1.5GB gunzip'd) nginx logs spread across 10 files. They
were run on a spinning disk on an older machine. New machines run significantly faster.
All tests were done on ~824 MB of gzip'd (13.93 GB gunzip'd) nginx logs spread across 8 files. They
were run on an NVMe SSD on a recent (2025) machine.

Each program was run 3 times and the timing of the last run was taken (to make sure things were cached equally).


## rare

At no point scanning the data does `rare` exceed ~4MB of resident memory.
At no point while scanning the data does `rare` exceed ~42 MB of resident memory. Buffer sizes can be tweaked
to adjust memory usage.
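
As a rough back-of-envelope only (assuming one `readers-buffer`-sized buffer per reader and `batch-buffer` queued batches of `batch` lines, with an assumed average line length — the real footprint depends on the batchers implementation), the defaults in this PR put read-ahead memory in the tens of megabytes:

```go
package main

import (
	"fmt"
	"runtime"
)

func main() {
	workers := runtime.NumCPU()/2 + 1 // default --workers
	readers := workers                // default --readers (now matches workers)
	readersBuffer := 512 * 1024       // default --readers-buffer, in bytes
	batchSize := 1000                 // default --batch, in lines
	batchBuffer := workers * 2        // default --batch-buffer, in batches
	avgLine := 200                    // assumed average log-line length, in bytes

	readerBytes := readers * readersBuffer
	batchBytes := batchBuffer * batchSize * avgLine
	fmt.Printf("reader buffers: ~%.1f MiB\n", float64(readerBytes)/(1<<20))
	fmt.Printf("queued batches: ~%.1f MiB (at ~%d B/line)\n", float64(batchBytes)/(1<<20), avgLine)
}
```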

```bash
$ rare -v
rare version 0.4.3, e0fc395; regex: re2

$ time rare filter -m '" (\d{3})' -e "{1}" -z testdata/*.gz | wc -l
Matched: 8,373,328 / 8,373,328
8373328
Matched: 82,733,280 / 82,733,280
82733280

real 0m3.266s
user 0m10.607s
sys 0m0.769s
real 0m3.409s
user 0m32.750s
sys 0m2.175s
```

When aggregating data, `rare` is significantly faster than alternatives.

```bash
$ time rare histo -m '" (\d{3})' -e "{1}" -z testdata/*.gz
404 5,557,374
200 2,564,984
400 243,282
405 5,708
408 1,397
Matched: 8,373,328 / 8,373,328 (Groups: 8)
[9/9] 1.41 GB (514.25 MB/s)

real 0m2.870s
user 0m9.606s
sys 0m0.393s
404 54,843,840
200 25,400,160
400 2,412,960
405 56,640
408 13,920
Matched: 82,733,280 / 82,733,280 (Groups: 8)
[8/8] 13.93 GB (4.27 GB/s)

real 0m3.283s
user 0m31.485s
sys 0m1.497s
```

And, as an alternative, using the *dissect* matcher instead of regex is slightly faster still (a toy sketch of the idea follows the listing below):

```bash
$ time rare histo -d '" %{CODE} ' -e '{CODE}' -z testdata/*.gz
404 5,557,374
200 2,564,984
400 243,282
405 5,708
408 1,397
Matched: 8,373,328 / 8,373,328 (Groups: 8)
[9/9] 1.41 GB (531.11 MB/s)

real 0m2.533s
user 0m7.976s
sys 0m0.350s
404 54,843,840
200 25,400,160
400 2,412,960
405 56,640
408 13,920
Matched: 82,733,280 / 82,733,280 (Groups: 8)
[8/8] 13.93 GB (5.61 GB/s)

real 0m2.546s
user 0m22.922s
sys 0m1.491s
```
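
The toy snippet below (not rare's actual dissect implementation) illustrates why that can be the case: dissect-style extraction only scans for literal delimiters, while the regex engine has to drive a full pattern match over every line.

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

var re = regexp.MustCompile(`" (\d{3})`)

// dissectCode pulls the status code out of a combined-log line by plain
// substring search: find the `" ` delimiter, then take everything up to the
// next space.
func dissectCode(line string) (string, bool) {
	i := strings.Index(line, `" `)
	if i < 0 {
		return "", false
	}
	rest := line[i+2:]
	if j := strings.IndexByte(rest, ' '); j >= 0 {
		return rest[:j], true
	}
	return rest, true
}

func main() {
	line := `10.0.0.1 - - [02/Jan/2020] "GET /index HTTP/1.1" 404 153`
	if m := re.FindStringSubmatch(line); m != nil {
		fmt.Println("regex:  ", m[1]) // 404
	}
	if code, ok := dissectCode(line); ok {
		fmt.Println("dissect:", code) // 404
	}
}
```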

### pcre2
@@ -71,47 +72,47 @@ on more complex regex's.
```bash
# Normal re2 version
$ time rare table -z -m "\[(.+?)\].*\" (\d+)" -e "{buckettime {1} year nginx}" -e "{bucket {2} 100}" testdata/*.gz
2020 2019
400 2,915,487 2,892,274
200 1,716,107 848,925
300 290 245
Matched: 8,373,328 / 8,373,328 (R: 3; C: 2)
[9/9] 1.41 GB (52.81 MB/s)
2020 2019
400 28,994,880 28,332,480
200 17,084,640 8,316,000
300 2,880 2,400
Matched: 82,733,280 / 82,733,280 (R: 3; C: 2)
[8/8] 13.93 GB (596.89 MB/s)

real 0m27.880s
user 1m28.782s
sys 0m0.824s
real 0m23.819s
user 3m52.252s
sys 0m1.625s

# libpcre2 version
$ time rare-pcre table -z -m "\[(.+?)\].*\" (\d+)" -e "{buckettime {1} year nginx}" -e "{bucket {2} 100}" testdata/*.gz
2020 2019
400 2,915,487 2,892,274
200 1,716,107 848,925
300 290 245
Matched: 8,373,328 / 8,373,328 (R: 3; C: 2)
[9/9] 1.41 GB (241.82 MB/s)

real 0m5.751s
user 0m20.173s
sys 0m0.461s
2020 2019
400 28,994,880 28,332,480
200 17,084,640 8,316,000
300 2,880 2,400
Matched: 82,733,280 / 82,733,280 (R: 3; C: 2)
[8/8] 13.93 GB (2.10 GB/s)

real 0m6.813s
user 1m15.638s
sys 0m1.985s
```


## zcat & grep

```
$ time zcat testdata/*.gz | grep -Poa '" (\d{3})' | wc -l
8373328
82733280

real 0m11.272s
user 0m16.239s
sys 0m1.989s
real 0m28.414s
user 0m35.268s
sys 0m1.865s

$ time zcat testdata/* | grep -Poa '" 200' > /dev/null
$ time zcat testdata/*.gz | grep -Poa '" 200' > /dev/null

real 0m5.416s
user 0m4.810s
sys 0m1.185s
real 0m28.616s
user 0m27.517s
sys 0m1.658s

```

@@ -126,11 +127,11 @@ the complete functionality that rare exposes.

```bash
$ time rg -z '" (\d{3})' testdata/*.gz | wc -l
8373328
82733280

real 0m3.791s
user 0m8.149s
sys 0m4.420s
real 0m7.058s
user 0m40.284s
sys 0m8.962s
```

# Other Tools