From 5a31e8c2b7808beba11c25be031ee5621eb332e4 Mon Sep 17 00:00:00 2001
From: Christopher LaPointe
Date: Sat, 25 Oct 2025 10:05:18 -0400
Subject: [PATCH 1/8] Improve tweak help text and env variables

---
 cmd/helpers/extractorBuilder.go | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/cmd/helpers/extractorBuilder.go b/cmd/helpers/extractorBuilder.go
index b8bc3d6..6ec823a 100644
--- a/cmd/helpers/extractorBuilder.go
+++ b/cmd/helpers/extractorBuilder.go
@@ -2,6 +2,7 @@ package helpers
 
 import (
 	"errors"
+	"fmt"
 	"os"
 	"runtime"
 	"slices"
@@ -258,26 +259,32 @@ func getExtractorFlags() []cli.Flag {
 			Category: cliCategoryTweaking,
 			Usage:    "Specifies io batching size. Set to 1 for immediate input",
 			Value:    1000,
+			EnvVars:  []string{"RARE_BATCH"},
 		},
 		&cli.IntFlag{
-			Name:     "batch-buffer",
-			Category: cliCategoryTweaking,
-			Usage:    "Specifies how many batches to read-ahead. Impacts memory usage, can improve performance",
-			Value:    workerCount * 2, // Keep 2 batches ready for each worker
+			Name:        "batch-buffer",
+			Category:    cliCategoryTweaking,
+			Usage:       "Specifies how many batches to read-ahead. Impacts memory usage, can improve performance",
+			Value:       workerCount * 2, // Keep 2 batches ready for each worker
+			DefaultText: fmt.Sprintf("workers*2 = %d", workerCount*2),
+			EnvVars:     []string{"RARE_BATCH_BUFFER"},
 		},
 		&cli.IntFlag{
-			Name:     "workers",
-			Aliases:  []string{"w"},
-			Category: cliCategoryTweaking,
-			Usage:    "Set number of data processors",
-			Value:    workerCount,
+			Name:        "workers",
+			Aliases:     []string{"w"},
+			Category:    cliCategoryTweaking,
+			Usage:       "Set number of data processors",
+			Value:       workerCount,
+			DefaultText: fmt.Sprintf("NumCPU/2+1 = %d", workerCount),
+			EnvVars:     []string{"RARE_WORKERS"},
 		},
 		&cli.IntFlag{
 			Name:     "readers",
 			Aliases:  []string{"wr"},
 			Category: cliCategoryTweaking,
 			Usage:    "Sets the number of concurrent readers (Infinite when -f)",
-			Value:    3,
+			Value:    workerCount,
+			EnvVars:  []string{"RARE_READERS"},
 		},
 	}
 }
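With these bindings, each tuning flag can now be pinned through the environment as well as the command line. A minimal sketch of the two equivalent styles this enables, using only the flag and variable names from the diff above (the log file name is hypothetical):

```bash
# Flag form: shrink batches for a more interactive stream
rare histo --batch=1 --workers=4 -m '(\d+)' -e '{1}' app.log

# Environment form, via the EnvVars bindings added in this patch
RARE_BATCH=1 RARE_WORKERS=4 rare histo -m '(\d+)' -e '{1}' app.log
```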
From 0cc7c8c4183c01c9582e2c283cd11a17cac391d3 Mon Sep 17 00:00:00 2001
From: Christopher LaPointe
Date: Sat, 25 Oct 2025 10:27:34 -0400
Subject: [PATCH 2/8] Allow changing the reader buffer, and increase the
 default; shows a significant performance improvement

---
 cmd/helpers/extractorBuilder.go              | 18 +++++++++++++++---
 cmd/helpers/updatingAggregator_test.go       |  2 +-
 pkg/extractor/batchers/batcher.go            | 10 +++++-----
 pkg/extractor/batchers/batcher_test.go       |  4 ++--
 pkg/extractor/batchers/fileBatcher.go        |  4 ++--
 pkg/extractor/batchers/fileBatcher_test.go   |  2 +-
 pkg/extractor/batchers/readerBatcher.go      |  4 ++--
 pkg/extractor/batchers/readerBatcher_test.go |  2 +-
 pkg/extractor/batchers/tailBatcher.go        |  4 ++--
 pkg/extractor/batchers/tailBatcher_test.go   |  4 ++--
 10 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/cmd/helpers/extractorBuilder.go b/cmd/helpers/extractorBuilder.go
index 6ec823a..47004b7 100644
--- a/cmd/helpers/extractorBuilder.go
+++ b/cmd/helpers/extractorBuilder.go
@@ -43,6 +43,7 @@ func BuildBatcherFromArgumentsEx(c *cli.Context, fileglobs ...string) (*batchers
 		followReopen      = c.Bool("reopen")
 		followPoll        = c.Bool("poll")
 		concurrentReaders = c.Int("readers")
+		readersBuffer     = c.Int("readers-buffer")
 		gunzip            = c.Bool("gunzip")
 		batchSize         = c.Int("batch")
 		batchBuffer       = c.Int("batch-buffer")
@@ -54,6 +55,9 @@ func BuildBatcherFromArgumentsEx(c *cli.Context, fileglobs ...string) (*batchers
 	if concurrentReaders < 1 {
 		logger.Fatalf(ExitCodeInvalidUsage, "Must have at least 1 reader")
 	}
+	if readersBuffer < 1 {
+		logger.Fatalf(ExitCodeInvalidUsage, "Reader buffer must be at least 1 byte")
+	}
 	if followPoll && !follow {
 		logger.Fatalf(ExitCodeInvalidUsage, "Follow (-f) must be enabled for --poll")
 	}
@@ -68,16 +72,16 @@ func BuildBatcherFromArgumentsEx(c *cli.Context, fileglobs ...string) (*batchers
 		if follow {
 			logger.Println("Cannot follow a stdin stream, not a file")
 		}
-		return batchers.OpenReaderToChan("", os.Stdin, batchSize, batchBuffer), nil
+		return batchers.OpenReaderToChan("", os.Stdin, batchSize, batchBuffer, readersBuffer), nil
 	} else if follow { // Read from source file
 		if gunzip {
 			logger.Println("Cannot combine -f and -z")
 		}
 		walker := BuildPathWalkerFromArguments(c)
-		return batchers.TailFilesToChan(walker.Walk(fileglobs...), batchSize, batchBuffer, followReopen, followPoll, followTail), walker
+		return batchers.TailFilesToChan(walker.Walk(fileglobs...), batchSize, batchBuffer, readersBuffer, followReopen, followPoll, followTail), walker
 	} else { // Read (no-follow) source file(s)
 		walker := BuildPathWalkerFromArguments(c)
-		return batchers.OpenFilesToChan(walker.Walk(fileglobs...), gunzip, concurrentReaders, batchSize, batchBuffer), walker
+		return batchers.OpenFilesToChan(walker.Walk(fileglobs...), gunzip, concurrentReaders, batchSize, batchBuffer, readersBuffer), walker
 	}
 }
@@ -286,6 +290,14 @@ func getExtractorFlags() []cli.Flag {
 			Value:    workerCount,
 			EnvVars:  []string{"RARE_READERS"},
 		},
+		&cli.IntFlag{
+			Name:     "readers-buffer",
+			Aliases:  []string{"wrb"},
+			Category: cliCategoryTweaking,
+			Usage:    "How many bytes will be buffered per reader",
+			Value:    1024 * 1024,
+			EnvVars:  []string{"RARE_READERS_BUFFER"},
+		},
 	}
 }

diff --git a/cmd/helpers/updatingAggregator_test.go b/cmd/helpers/updatingAggregator_test.go
index 860f7ba..9be5071 100644
--- a/cmd/helpers/updatingAggregator_test.go
+++ b/cmd/helpers/updatingAggregator_test.go
@@ -33,7 +33,7 @@ func (s *VirtualAggregator) ParseErrors() uint64 {
 
 func TestAggregationLoop(t *testing.T) {
 	// Build a real extractor
-	batcher := batchers.OpenReaderToChan("test", io.NopCloser(strings.NewReader(testData)), 1, 1)
+	batcher := batchers.OpenReaderToChan("test", io.NopCloser(strings.NewReader(testData)), 1, 1, 1024)
 	ex, err := extractor.New(batcher.BatchChan(), &extractor.Config{
 		Matcher: matchers.ToFactory(fastregex.MustCompile(`(\d+)`)),
 		Extract: "val:{1}",

diff --git a/pkg/extractor/batchers/batcher.go b/pkg/extractor/batchers/batcher.go
index da97fb5..bd5d8f5 100644
--- a/pkg/extractor/batchers/batcher.go
+++ b/pkg/extractor/batchers/batcher.go
@@ -15,7 +15,7 @@ import (
 )
 
 // ReadAheadBufferSize is the default size of the read-ahead buffer
-const ReadAheadBufferSize = 128 * 1024
+// const ReadAheadBufferSize = 128 * 1024
 
 // AutoFlushTimeout sets time before an auto-flushing reader will write a batch
 const AutoFlushTimeout = 250 * time.Millisecond
@@ -152,9 +152,9 @@ func (s *Batcher) StatusString() string {
 // syncReaderToBatcher reads a reader buffer and breaks up its scans to `batchSize`
 //
 //	and writes the batch-sized results to a channel
-func (s *Batcher) syncReaderToBatcher(sourceName string, reader io.Reader, batchSize int) {
+func (s *Batcher) syncReaderToBatcher(sourceName string, reader io.Reader, batchSize, bufSize int) {
 	readerMetrics := newReaderMetrics(reader)
-	readahead := readahead.NewImmediate(readerMetrics, ReadAheadBufferSize)
+	readahead := readahead.NewImmediate(readerMetrics, bufSize)
 	readahead.OnError(func(e error) {
 		s.incErrors()
 		logger.Printf("Error reading %s: %v", sourceName, e)
@@ -189,9 +189,9 @@ func (s *Batcher) syncReaderToBatcher(sourceName string, reader io.Reader, batch
 // syncReaderToBatcherWithTimeFlush is similar to `syncReaderToBatcher`, except if it gets a new line
 // it will flush the batch if n time has elapsed since the last flush, regardless of how many items are in the current batch
 // Good for potentially slow or more interactive workloads (tail, stdin, etc)
-func (s *Batcher) syncReaderToBatcherWithTimeFlush(sourceName string, reader io.Reader, batchSize int, autoFlush time.Duration) {
+func (s *Batcher) syncReaderToBatcherWithTimeFlush(sourceName string, reader io.Reader, batchSize, bufSize int, autoFlush time.Duration) {
 	readerMetrics := newReaderMetrics(reader)
-	readahead := readahead.NewImmediate(readerMetrics, ReadAheadBufferSize)
+	readahead := readahead.NewImmediate(readerMetrics, bufSize)
 	readahead.OnError(func(e error) {
 		s.incErrors()
 		logger.Printf("Error reading %s: %v", sourceName, e)

diff --git a/pkg/extractor/batchers/batcher_test.go b/pkg/extractor/batchers/batcher_test.go
index 4085f23..3ca2e81 100644
--- a/pkg/extractor/batchers/batcher_test.go
+++ b/pkg/extractor/batchers/batcher_test.go
@@ -34,7 +34,7 @@ func TestReaderToBatcher(t *testing.T) {
 line2
 line3`
 
-	s.syncReaderToBatcher("string", strings.NewReader(testData), 2)
+	s.syncReaderToBatcher("string", strings.NewReader(testData), 2, 1024)
 
 	b1 := <-s.BatchChan()
 	b2 := <-s.BatchChan()
@@ -53,7 +53,7 @@ func TestBatcherWithAutoFlush(t *testing.T) {
 line2
 line3`
 
-	s.syncReaderToBatcherWithTimeFlush("string", strings.NewReader(testData), 2, 1*time.Second)
+	s.syncReaderToBatcherWithTimeFlush("string", strings.NewReader(testData), 2, 1024, 1*time.Second)
 
 	b1 := <-s.BatchChan()
 	b2 := <-s.BatchChan()

diff --git a/pkg/extractor/batchers/fileBatcher.go b/pkg/extractor/batchers/fileBatcher.go
index ddc25e0..0eb7850 100644
--- a/pkg/extractor/batchers/fileBatcher.go
+++ b/pkg/extractor/batchers/fileBatcher.go
@@ -12,7 +12,7 @@ import (
 // openFilesToChan takes an iterated channel of filenames, options, and loads them all with
 //
 //	a max concurrency. Returns a channel that will populate with input batches
-func OpenFilesToChan(filenames <-chan string, gunzip bool, concurrency int, batchSize, batchBuffer int) *Batcher {
+func OpenFilesToChan(filenames <-chan string, gunzip bool, concurrency int, batchSize, batchBuffer, readBufSize int) *Batcher {
 	out := newBatcher(batchBuffer)
 	sema := make(chan struct{}, concurrency)
 
@@ -46,7 +46,7 @@ func OpenFilesToChan(filenames <-chan string, gunzip bool, concurrency int, batc
 			defer file.Close()
 
 			out.startFileReading(goFilename)
-			out.syncReaderToBatcher(goFilename, file, batchSize)
+			out.syncReaderToBatcher(goFilename, file, batchSize, readBufSize)
 		}(filename)
 	}

diff --git a/pkg/extractor/batchers/fileBatcher_test.go b/pkg/extractor/batchers/fileBatcher_test.go
index faf0f23..784ef65 100644
--- a/pkg/extractor/batchers/fileBatcher_test.go
+++ b/pkg/extractor/batchers/fileBatcher_test.go
@@ -13,7 +13,7 @@ func TestOpenFilesToChan(t *testing.T) {
 	filenames <- "fileBatcher_test.go" // me!
 	close(filenames)
 
-	batches := OpenFilesToChan(filenames, false, 1, 1, 1)
+	batches := OpenFilesToChan(filenames, false, 1, 1, 1, 1024)
 
 	total := 0
 	var lastStart uint64 = 0

diff --git a/pkg/extractor/batchers/readerBatcher.go b/pkg/extractor/batchers/readerBatcher.go
index 14f26b0..b77d16a 100644
--- a/pkg/extractor/batchers/readerBatcher.go
+++ b/pkg/extractor/batchers/readerBatcher.go
@@ -4,14 +4,14 @@ import (
 	"io"
 )
 
-func OpenReaderToChan(sourceName string, reader io.ReadCloser, batchSize, batchBuffer int) *Batcher {
+func OpenReaderToChan(sourceName string, reader io.ReadCloser, batchSize, batchBuffer, readBufSize int) *Batcher {
 	out := newBatcher(batchBuffer)
 
 	go func() {
 		defer reader.Close()
 		defer out.close()
 		out.startFileReading(sourceName)
-		out.syncReaderToBatcherWithTimeFlush(sourceName, reader, batchSize, AutoFlushTimeout)
+		out.syncReaderToBatcherWithTimeFlush(sourceName, reader, batchSize, readBufSize, AutoFlushTimeout)
 		out.stopFileReading(sourceName)
 	}()

diff --git a/pkg/extractor/batchers/readerBatcher_test.go b/pkg/extractor/batchers/readerBatcher_test.go
index d353487..474f8d6 100644
--- a/pkg/extractor/batchers/readerBatcher_test.go
+++ b/pkg/extractor/batchers/readerBatcher_test.go
@@ -10,7 +10,7 @@ import (
 func TestOpenReaderToChan(t *testing.T) {
 	r := io.NopCloser(strings.NewReader("Hello\nthere\nbob"))
 
-	b := OpenReaderToChan("src", r, 1, 1)
+	b := OpenReaderToChan("src", r, 1, 1, 1024)
 
 	b1 := <-b.BatchChan()
 	assert.Equal(t, "src", b1.Source)

diff --git a/pkg/extractor/batchers/tailBatcher.go b/pkg/extractor/batchers/tailBatcher.go
index 0f975d3..ee36fd1 100644
--- a/pkg/extractor/batchers/tailBatcher.go
+++ b/pkg/extractor/batchers/tailBatcher.go
@@ -10,7 +10,7 @@ import (
 // TailFilesToChan tails a set of files to an input batcher that can be consumed by extractor
 //
 //	unlike a normal file batcher, this will attempt to tail all files at once
-func TailFilesToChan(filenames <-chan string, batchSize, batchBuffer int, reopen, poll, tail bool) *Batcher {
+func TailFilesToChan(filenames <-chan string, batchSize, batchBuffer, readBufSize int, reopen, poll, tail bool) *Batcher {
 	out := newBatcher(batchBuffer)
 
 	go func() {
@@ -38,7 +38,7 @@ func TailFilesToChan(filenames <-chan string, batchSize, batchBuffer int, reopen
 
 				out.startFileReading(filename)
-				out.syncReaderToBatcherWithTimeFlush(filename, r, batchSize, AutoFlushTimeout)
+				out.syncReaderToBatcherWithTimeFlush(filename, r, batchSize, readBufSize, AutoFlushTimeout)
 			}(filename)
 	}

diff --git a/pkg/extractor/batchers/tailBatcher_test.go b/pkg/extractor/batchers/tailBatcher_test.go
index c088099..269c92d 100644
--- a/pkg/extractor/batchers/tailBatcher_test.go
+++ b/pkg/extractor/batchers/tailBatcher_test.go
@@ -13,7 +13,7 @@ func TestBatchFollowFile(t *testing.T) {
 	filenames := make(chan string, 1)
 	filenames <- "tailBatcher_test.go" // me
 
-	batcher := TailFilesToChan(filenames, 5, 1, false, false, false)
+	batcher := TailFilesToChan(filenames, 5, 1, 1024, false, false, false)
 
 	batch := <-batcher.BatchChan()
 	assert.Equal(t, "tailBatcher_test.go", batch.Source)
@@ -40,7 +40,7 @@ func TestBatchFollowTailFile(t *testing.T) {
 	filenames := make(chan string, 1)
 	filenames <- tmp.Name()
 
-	batcher := TailFilesToChan(filenames, 1, 1, false, false, true)
+	batcher := TailFilesToChan(filenames, 1, 1, 1024, false, false, true)
 
 	for batcher.ActiveFileCount() == 0 {
 		time.Sleep(time.Millisecond) // Semi-hack: Wait for the go-routine reader to start and the source to be drained
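A rough sketch of exercising the new knob end-to-end. The file name is hypothetical, and at this point in the series the default is still 1 MB (1048576 bytes); only the `--readers-buffer`/`--wrb` flag and `RARE_READERS_BUFFER` variable come from the diff above:

```bash
# Default 1 MB read-ahead buffer per reader
rare histo -m '" (\d{3})' -e '{1}' -z access.log.gz

# Constrain each reader to 64 KB, via the new flag or its env var
rare histo --readers-buffer=65536 -m '" (\d{3})' -e '{1}' -z access.log.gz
RARE_READERS_BUFFER=65536 rare histo -m '" (\d{3})' -e '{1}' -z access.log.gz
```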
From 2b987cdc97dddd48555a07328ee1a5571afc7f21 Mon Sep 17 00:00:00 2001
From: Christopher LaPointe
Date: Sat, 25 Oct 2025 10:27:53 -0400
Subject: [PATCH 3/8] Update CLI Docs

---
 docs/cli-help.md | 72 ++++++++++++++++++++++++++++++------------------
 1 file changed, 45 insertions(+), 27 deletions(-)

diff --git a/docs/cli-help.md b/docs/cli-help.md
index fb8ea22..baf1212 100644
--- a/docs/cli-help.md
+++ b/docs/cli-help.md
@@ -68,7 +68,7 @@ Filter incoming results with search criteria, and output raw matches
 
 **--batch**="": Specifies io batching size. Set to 1 for immediate input (default: 1000)
 
-**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 6)
+**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 18)
 
 **--dissect, -d**="": Dissect expression create match groups to summarize on
 
@@ -104,7 +104,9 @@ Filter incoming results with search criteria, and output raw matches
 
 **--read-symlinks**: Read files that are symbolic links
 
-**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 3)
+**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
+
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files
 
@@ -116,7 +118,7 @@ Filter incoming results with search criteria, and output raw matches
 
 **--text, -a**: Only output lines that contain valid text
 
-**--workers, -w**="": Set number of data processors (default: 3)
+**--workers, -w**="": Set number of data processors (default: 9)
 
 ### search
 
 Searches current directory recursively for a regex match
 
 **--batch**="": Specifies io batching size. Set to 1 for immediate input (default: 1000)
 
-**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 6)
+**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 18)
 
 **--dissect, -d**="": Dissect expression create match groups to summarize on
 
@@ -160,7 +162,9 @@ Searches current directory recursively for a regex match
 
 **--read-symlinks**: Read files that are symbolic links
 
-**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 3)
+**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
+
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files
 
@@ -172,7 +176,7 @@ Searches current directory recursively for a regex match
 
 **--text, -a**: Only output lines that contain valid text
 
-**--workers, -w**="": Set number of data processors (default: 3)
+**--workers, -w**="": Set number of data processors (default: 9)
 
 ### histogram, histo, h
 
@@ -186,7 +190,7 @@ Summarize results by extracting them to a histogram
 
 **--batch**="": Specifies io batching size. Set to 1 for immediate input (default: 1000)
 
-**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 6)
+**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 18)
 
 **--csv, -o**="": Write final results to csv. Use - to output to stdout
 
@@ -230,7 +234,9 @@ Summarize results by extracting them to a histogram
 
 **--read-symlinks**: Read files that are symbolic links
 
-**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 3)
+**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
+
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files
 
@@ -244,7 +250,7 @@ Summarize results by extracting them to a histogram
 
 **--tail, -t**: When following a file, navigate to the end of the file to skip existing content
 
-**--workers, -w**="": Set number of data processors (default: 3)
+**--workers, -w**="": Set number of data processors (default: 9)
 
 ### heatmap, heat, hm
 
@@ -252,7 +258,7 @@ Create a 2D heatmap of extracted data
 
 **--batch**="": Specifies io batching size. Set to 1 for immediate input (default: 1000)
 
-**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 6)
+**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 18)
 
 **--cols**="": Number of columns to display (default: 65)
 
@@ -300,7 +306,9 @@ Create a 2D heatmap of extracted data
 
 **--read-symlinks**: Read files that are symbolic links
 
-**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 3)
+**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
+
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files
 
@@ -316,7 +324,7 @@ Create a 2D heatmap of extracted data
 
 **--tail, -t**: When following a file, navigate to the end of the file to skip existing content
 
-**--workers, -w**="": Set number of data processors (default: 3)
+**--workers, -w**="": Set number of data processors (default: 9)
 
 ### spark, sparkline, s
 
@@ -324,7 +332,7 @@ Create rows of sparkline graphs
 
 **--batch**="": Specifies io batching size. Set to 1 for immediate input (default: 1000)
 
-**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 6)
+**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 18)
 
 **--cols**="": Number of columns to display (default: 65)
 
@@ -370,7 +378,9 @@ Create rows of sparkline graphs
 
 **--read-symlinks**: Read files that are symbolic links
 
-**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 3)
+**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
+
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files
 
@@ -386,7 +396,7 @@ Create rows of sparkline graphs
 
 **--tail, -t**: When following a file, navigate to the end of the file to skip existing content
 
-**--workers, -w**="": Set number of data processors (default: 3)
+**--workers, -w**="": Set number of data processors (default: 9)
 
 ### bargraph, bars, bar, b
 
@@ -394,7 +404,7 @@ Create a bargraph of the given 1 or 2 dimension data
 
 **--batch**="": Specifies io batching size. Set to 1 for immediate input (default: 1000)
 
-**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 6)
+**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 18)
 
 **--csv, -o**="": Write final results to csv. Use - to output to stdout
 
@@ -432,7 +442,9 @@ Create a bargraph of the given 1 or 2 dimension data
 
 **--read-symlinks**: Read files that are symbolic links
 
-**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 3)
+**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
+
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files
 
@@ -448,7 +460,7 @@ Create a bargraph of the given 1 or 2 dimension data
 
 **--tail, -t**: When following a file, navigate to the end of the file to skip existing content
 
-**--workers, -w**="": Set number of data processors (default: 3)
+**--workers, -w**="": Set number of data processors (default: 9)
 
 ### analyze, a
 
@@ -456,7 +468,7 @@ Numerical analysis on a set of filtered data
 
 **--batch**="": Specifies io batching size. Set to 1 for immediate input (default: 1000)
 
-**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 6)
+**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 18)
 
 **--dissect, -d**="": Dissect expression create match groups to summarize on
 
@@ -492,7 +504,9 @@ Numerical analysis on a set of filtered data
 
 **--read-symlinks**: Read files that are symbolic links
 
-**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 3)
+**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
+
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files
 
@@ -504,7 +518,7 @@ Numerical analysis on a set of filtered data
 
 **--tail, -t**: When following a file, navigate to the end of the file to skip existing content
 
-**--workers, -w**="": Set number of data processors (default: 3)
+**--workers, -w**="": Set number of data processors (default: 9)
 
 ### tabulate, table, t
 
@@ -512,7 +526,7 @@ Create a 2D summarizing table of extracted data
 
 **--batch**="": Specifies io batching size. Set to 1 for immediate input (default: 1000)
 
-**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 6)
+**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 18)
 
 **--cols**="": Number of columns to display (default: 10)
 
@@ -560,7 +574,9 @@ Create a 2D summarizing table of extracted data
 
 **--read-symlinks**: Read files that are symbolic links
 
-**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 3)
+**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
+
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files
 
@@ -576,7 +592,7 @@ Create a 2D summarizing table of extracted data
 
 **--tail, -t**: When following a file, navigate to the end of the file to skip existing content
 
-**--workers, -w**="": Set number of data processors (default: 3)
+**--workers, -w**="": Set number of data processors (default: 9)
 
 ### reduce, r
 
@@ -586,7 +602,7 @@ Aggregate the results of a query based on an expression, pulling customized summ
 
 **--batch**="": Specifies io batching size. Set to 1 for immediate input (default: 1000)
 
-**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 6)
+**--batch-buffer**="": Specifies how many batches to read-ahead. Impacts memory usage, can improve performance (default: 18)
 
 **--cols**="": Number of columns to display (default: 10)
 
@@ -632,7 +648,9 @@ Aggregate the results of a query based on an expression, pulling customized summ
 
 **--read-symlinks**: Read files that are symbolic links
 
-**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 3)
+**--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
+
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files
 
@@ -648,7 +666,7 @@ Aggregate the results of a query based on an expression, pulling customized summ
 
 **--tail, -t**: When following a file, navigate to the end of the file to skip existing content
 
-**--workers, -w**="": Set number of data processors (default: 3)
+**--workers, -w**="": Set number of data processors (default: 9)
 
 ### docs
It -defaults to `3`, but is ignored if following files. +defaults to the same as `workers`, but is ignored if following files. Specify with: `rare --readers=1 file1 file2 file3...` +Additionally, you can specify how large the reader's buffer is. This determines +how many bytes can be read, at maximum, before passing on the data to the batcher. + +It defaults to `1 MB`. + +Specify with: + +`rare --readers-buffer=1024 ...` + ### Batch Sizes Rare reads (by default) 1000 lines in a file, for a batch, before providing it -to the extractor stage. This significantly speeds up processing, but comes +to the extractor/worker stage. This significantly speeds up processing, but comes at the cost of being less real-time if input generation is slow. To counteract this, in the *follow* or *stdin* cases, there's also a flush timeout of 250ms. This means if a new line has been received, and the duration has passed, that the batch will be processed regardless of its current size. -You can tweak this value with `--batch` +You can tweak the max batch size with `--batch` `rare --batch=10 ...` From cc3506bac7d7e4a2684a43b105369f6ce341f16e Mon Sep 17 00:00:00 2001 From: Christopher LaPointe Date: Sat, 25 Oct 2025 10:35:41 -0400 Subject: [PATCH 5/8] Remove dead code --- pkg/extractor/batchers/batcher.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/pkg/extractor/batchers/batcher.go b/pkg/extractor/batchers/batcher.go index bd5d8f5..fcb400b 100644 --- a/pkg/extractor/batchers/batcher.go +++ b/pkg/extractor/batchers/batcher.go @@ -14,9 +14,6 @@ import ( "github.com/zix99/rare/pkg/readahead" ) -// ReadAheadBufferSize is the default size of the read-ahead buffer -// const ReadAheadBufferSize = 128 * 1024 - // AutoFlushTimeout sets time before an auto-flushing reader will write a batch const AutoFlushTimeout = 250 * time.Millisecond From aac4a096fa817a71da4f991220cdf913c6436d8f Mon Sep 17 00:00:00 2001 From: Christopher LaPointe Date: Sun, 26 Oct 2025 10:56:05 -0400 Subject: [PATCH 6/8] Reduce buffer from 1 MB to 512 KB. Neglible returns after this point --- cmd/helpers/extractorBuilder.go | 2 +- docs/cli-help.md | 21 ++++++++++++--------- docs/usage/input.md | 2 +- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/cmd/helpers/extractorBuilder.go b/cmd/helpers/extractorBuilder.go index 47004b7..7dba3fb 100644 --- a/cmd/helpers/extractorBuilder.go +++ b/cmd/helpers/extractorBuilder.go @@ -295,7 +295,7 @@ func getExtractorFlags() []cli.Flag { Aliases: []string{"wrb"}, Category: cliCategoryTweaking, Usage: "How many bytes will be buffered per reader", - Value: 1024 * 1024, + Value: 512 * 1024, EnvVars: []string{"RARE_READERS_BUFFER"}, }, } diff --git a/docs/cli-help.md b/docs/cli-help.md index baf1212..e551a69 100644 --- a/docs/cli-help.md +++ b/docs/cli-help.md @@ -10,6 +10,7 @@ rare [--color] [--funcs]=[value] [--help|-h] +[--metrics-memory] [--metrics] [--nocolor|--nc] [--noformat|--nf] @@ -45,6 +46,8 @@ rare [GLOBAL OPTIONS] command [COMMAND OPTIONS] [ARGUMENTS...] 
From cc3506bac7d7e4a2684a43b105369f6ce341f16e Mon Sep 17 00:00:00 2001
From: Christopher LaPointe
Date: Sat, 25 Oct 2025 10:35:41 -0400
Subject: [PATCH 5/8] Remove dead code

---
 pkg/extractor/batchers/batcher.go | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pkg/extractor/batchers/batcher.go b/pkg/extractor/batchers/batcher.go
index bd5d8f5..fcb400b 100644
--- a/pkg/extractor/batchers/batcher.go
+++ b/pkg/extractor/batchers/batcher.go
@@ -14,9 +14,6 @@ import (
 	"github.com/zix99/rare/pkg/readahead"
 )
 
-// ReadAheadBufferSize is the default size of the read-ahead buffer
-// const ReadAheadBufferSize = 128 * 1024
-
 // AutoFlushTimeout sets time before an auto-flushing reader will write a batch
 const AutoFlushTimeout = 250 * time.Millisecond

From aac4a096fa817a71da4f991220cdf913c6436d8f Mon Sep 17 00:00:00 2001
From: Christopher LaPointe
Date: Sun, 26 Oct 2025 10:56:05 -0400
Subject: [PATCH 6/8] Reduce buffer from 1 MB to 512 KB. Negligible returns
 after this point

---
 cmd/helpers/extractorBuilder.go |  2 +-
 docs/cli-help.md                | 21 ++++++++++++---------
 docs/usage/input.md             |  2 +-
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/cmd/helpers/extractorBuilder.go b/cmd/helpers/extractorBuilder.go
index 47004b7..7dba3fb 100644
--- a/cmd/helpers/extractorBuilder.go
+++ b/cmd/helpers/extractorBuilder.go
@@ -295,7 +295,7 @@ func getExtractorFlags() []cli.Flag {
 			Aliases:  []string{"wrb"},
 			Category: cliCategoryTweaking,
 			Usage:    "How many bytes will be buffered per reader",
-			Value:    1024 * 1024,
+			Value:    512 * 1024,
 			EnvVars:  []string{"RARE_READERS_BUFFER"},
 		},
 	}
 }

diff --git a/docs/cli-help.md b/docs/cli-help.md
index baf1212..e551a69 100644
--- a/docs/cli-help.md
+++ b/docs/cli-help.md
@@ -10,6 +10,7 @@ rare
 [--color]
 [--funcs]=[value]
 [--help|-h]
+[--metrics-memory]
 [--metrics]
 [--nocolor|--nc]
 [--noformat|--nf]
@@ -45,6 +46,8 @@ rare [GLOBAL OPTIONS] command [COMMAND OPTIONS] [ARGUMENTS...]
 
 **--metrics**: Outputs runtime memory metrics after a program runs
 
+**--metrics-memory**: Records memory metrics every 100ms to get peaks/averages
+
 **--nocolor, --nc**: Disables color output
 
 **--noformat, --nf**: Disable number formatting
@@ -106,7 +109,7 @@ Filter incoming results with search criteria, and output raw matches
 
 **--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
 
-**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 524288)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files
 
@@ -164,7 +167,7 @@ Searches current directory recursively for a regex match
 
 **--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
 
-**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 524288)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files
 
@@ -236,7 +239,7 @@ Summarize results by extracting them to a histogram
 
 **--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
 
-**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 524288)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files
 
@@ -308,7 +311,7 @@ Create a 2D heatmap of extracted data
 
 **--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
 
-**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 524288)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files
 
@@ -380,7 +383,7 @@ Create rows of sparkline graphs
 
 **--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
 
-**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 524288)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files
 
@@ -444,7 +447,7 @@ Create a bargraph of the given 1 or 2 dimension data
 
 **--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
 
-**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 524288)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files
 
@@ -506,7 +509,7 @@ Numerical analysis on a set of filtered data
 
 **--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
 
-**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 524288)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files
 
@@ -576,7 +579,7 @@ Create a 2D summarizing table of extracted data
 
 **--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
 
-**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 524288)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files
 
@@ -650,7 +653,7 @@ Aggregate the results of a query based on an expression, pulling customized summ
 
 **--readers, --wr**="": Sets the number of concurrent readers (Infinite when -f) (default: 9)
 
-**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 1048576)
+**--readers-buffer, --wrb**="": How many bytes will be buffered per reader (default: 524288)
 
 **--recursive, -R**: Recursively walk a non-globbing path and search for plain-files

diff --git a/docs/usage/input.md b/docs/usage/input.md
index 7a1b65e..4b0f33f 100644
--- a/docs/usage/input.md
+++ b/docs/usage/input.md
@@ -160,7 +160,7 @@ Specify with:
 Additionally, you can specify how large the reader's buffer is. This determines
 how many bytes can be read, at maximum, before passing on the data to the batcher.
 
-It defaults to `1 MB`.
+It defaults to `512 KB`.
 
 Specify with:

From d191a90d0a2f488f720bcf8da9eb98e4d642e441 Mon Sep 17 00:00:00 2001
From: Christopher LaPointe
Date: Sun, 26 Oct 2025 10:59:31 -0400
Subject: [PATCH 7/8] test buffer size failure

---
 cmd/helpers/extractorBuilder_test.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cmd/helpers/extractorBuilder_test.go b/cmd/helpers/extractorBuilder_test.go
index f4a9be5..e56433e 100644
--- a/cmd/helpers/extractorBuilder_test.go
+++ b/cmd/helpers/extractorBuilder_test.go
@@ -88,6 +88,9 @@ func TestBuildingExtractorFromContext(t *testing.T) {
 	testLogFatal(t, 2, func() {
 		runApp("--readers 0 ../testdata/log.txt")
 	})
+	testLogFatal(t, 2, func() {
+		runApp("--readers-buffer 0 ../testdata/log.txt")
+	})
 	testLogFatal(t, 2, func() {
 		runApp("--poll ../testdata/log.txt")
 	})
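The same guard can be checked by hand. Per the test above, invalid usage exits with code 2, and the stderr message comes from the validation added in patch 2 (the exact output framing may vary):

```bash
rare histo --readers-buffer 0 -m '(\d+)' -e '{1}' testdata/log.txt
# stderr: Reader buffer must be at least 1 byte
echo $?  # prints 2 (invalid usage)
```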
From 4e50488917083c2fe55d78076621efa2169bed27 Mon Sep 17 00:00:00 2001
From: Christopher LaPointe
Date: Sun, 26 Oct 2025 14:19:17 -0400
Subject: [PATCH 8/8] Update benchmarks on modern PC with modern build.
 5.5 GB/s max!

---
 docs/benchmarks.md | 123 +++++++++++++++++++++++----------------------
 1 file changed, 62 insertions(+), 61 deletions(-)

diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index 17f62e7..19712e2 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -6,61 +6,62 @@
 It's worth noting that in many of these results rare is just as fast, but part of that
 reason is that it consumes CPU in a more efficient way (go is great at parallelization).
 So take that into account, for better or worse.
 
-All tests were done on ~83MB of gzip'd (1.5GB gunzip'd) nginx logs spread across 10 files. They
-were run on a spinning disk on an older machine. New machines run significantly faster.
+All tests were done on ~824MB of gzip'd (13.93 GB gunzip'd) nginx logs spread across 8 files. They
+were run on an NVMe SSD on a recent (2025) machine.
 
 Each program was run 3 times and the last time was taken (to make sure things were cached equally).
 
 ## rare
 
-At no point scanning the data does `rare` exceed ~4MB of resident memory.
+At no point scanning the data does `rare` exceed ~42MB of resident memory. Buffer sizes can be tweaked
+to adjust memory usage.
 
 ```bash
 $ rare -v
 rare version 0.4.3, e0fc395; regex: re2
 
 $ time rare filter -m '" (\d{3})' -e "{1}" -z testdata/*.gz | wc -l
-Matched: 8,373,328 / 8,373,328
-8373328
+Matched: 82,733,280 / 82,733,280
+82733280
 
-real	0m3.266s
-user	0m10.607s
-sys	0m0.769s
+real	0m3.409s
+user	0m32.750s
+sys	0m2.175s
 ```
 
 When aggregating data, `rare` is significantly faster than alternatives.
 
 ```bash
 $ time rare histo -m '" (\d{3})' -e "{1}" -z testdata/*.gz
-404                 5,557,374
-200                 2,564,984
-400                 243,282
-405                 5,708
-408                 1,397
-Matched: 8,373,328 / 8,373,328 (Groups: 8)
-[9/9] 1.41 GB (514.25 MB/s)
-
-real	0m2.870s
-user	0m9.606s
-sys	0m0.393s
+404                 54,843,840
+200                 25,400,160
+400                 2,412,960
+405                 56,640
+408                 13,920
+Matched: 82,733,280 / 82,733,280 (Groups: 8)
+[8/8] 13.93 GB (4.27 GB/s)
+
+real	0m3.283s
+user	0m31.485s
+sys	0m1.497s
 ```
 
 And, as an alternative, using *dissect* matcher instead of regex is even slightly faster:
 
 ```bash
 $ time rare histo -d '" %{CODE} ' -e '{CODE}' -z testdata/*.gz
-404                 5,557,374
-200                 2,564,984
-400                 243,282
-405                 5,708
-408                 1,397
-Matched: 8,373,328 / 8,373,328 (Groups: 8)
-[9/9] 1.41 GB (531.11 MB/s)
-
-real	0m2.533s
-user	0m7.976s
-sys	0m0.350s
+404                 54,843,840
+200                 25,400,160
+400                 2,412,960
+405                 56,640
+408                 13,920
+Matched: 82,733,280 / 82,733,280 (Groups: 8)
+[8/8] 13.93 GB (5.61 GB/s)
+
+real	0m2.546s
+user	0m22.922s
+sys	0m1.491s
 ```
 
 ### pcre2
 
@@ -71,29 +72,29 @@ on more complex regex's.
 
 ```bash
 # Normal re2 version
 $ time rare table -z -m "\[(.+?)\].*\" (\d+)" -e "{buckettime {1} year nginx}" -e "{bucket {2} 100}" testdata/*.gz
-         2020       2019
-400      2,915,487  2,892,274
-200      1,716,107  848,925
-300      290        245
-Matched: 8,373,328 / 8,373,328 (R: 3; C: 2)
-[9/9] 1.41 GB (52.81 MB/s)
+         2020        2019
+400      28,994,880  28,332,480
+200      17,084,640  8,316,000
+300      2,880       2,400
+Matched: 82,733,280 / 82,733,280 (R: 3; C: 2)
+[8/8] 13.93 GB (596.89 MB/s)
 
-real	0m27.880s
-user	1m28.782s
-sys	0m0.824s
+real	0m23.819s
+user	3m52.252s
+sys	0m1.625s
 
 # libpcre2 version
 $ time rare-pcre table -z -m "\[(.+?)\].*\" (\d+)" -e "{buckettime {1} year nginx}" -e "{bucket {2} 100}" testdata/*.gz
-         2020       2019
-400      2,915,487  2,892,274
-200      1,716,107  848,925
-300      290        245
-Matched: 8,373,328 / 8,373,328 (R: 3; C: 2)
-[9/9] 1.41 GB (241.82 MB/s)
+         2020        2019
+400      28,994,880  28,332,480
+200      17,084,640  8,316,000
+300      2,880       2,400
+Matched: 82,733,280 / 82,733,280 (R: 3; C: 2)
+[8/8] 13.93 GB (2.10 GB/s)
 
-real	0m5.751s
-user	0m20.173s
-sys	0m0.461s
+real	0m6.813s
+user	1m15.638s
+sys	0m1.985s
 ```
 
@@ -101,17 +102,17 @@
 
 ```
 $ time zcat testdata/*.gz | grep -Poa '" (\d{3})' | wc -l
-8373328
+82733280
 
-real	0m11.272s
-user	0m16.239s
-sys	0m1.989s
+real	0m28.414s
+user	0m35.268s
+sys	0m1.865s
 
-$ time zcat testdata/* | grep -Poa '" 200' > /dev/null
+$ time zcat testdata/*.gz | grep -Poa '" 200' > /dev/null
 
-real	0m5.416s
-user	0m4.810s
-sys	0m1.185s
+real	0m28.616s
+user	0m27.517s
+sys	0m1.658s
 ```
 
@@ -126,11 +127,11 @@ the complete functionality that rare exposes.
 
 ```bash
 $ time rg -z '" (\d{3})' testdata/*.gz | wc -l
-8373328
+82733280
 
-real	0m3.791s
-user	0m8.149s
-sys	0m4.420s
+real	0m7.058s
+user	0m40.284s
+sys	0m8.962s
 ```
 
 # Other Tools