diff --git a/cmd/helpers/extractorBuilder.go b/cmd/helpers/extractorBuilder.go index 8f64c4bb..fe22db1b 100644 --- a/cmd/helpers/extractorBuilder.go +++ b/cmd/helpers/extractorBuilder.go @@ -54,7 +54,7 @@ func BuildExtractorFromArguments(c *cli.Context) *extractor.Extractor { } tailChannels := make([]<-chan extractor.InputBatch, 0) - for _, filename := range globExpand(fileglobs, recursive) { + for filename := range globExpand(fileglobs, recursive) { tail, err := tail.TailFile(filename, tail.Config{Follow: true, ReOpen: followReopen, Poll: followPoll}) if err != nil { diff --git a/cmd/helpers/readChannels.go b/cmd/helpers/readChannels.go index ee0dfaa5..2a3e6307 100644 --- a/cmd/helpers/readChannels.go +++ b/cmd/helpers/readChannels.go @@ -74,19 +74,32 @@ func openFileToReader(filename string, gunzip bool) (io.ReadCloser, error) { return file, nil } -func openFilesToChan(filenames []string, gunzip bool, concurrency int, batchSize int) <-chan extractor.InputBatch { +// openFilesToChan takes an iterated channel of filenames, options, and loads them all with +// a max concurrency. Returns a channel that will populate with input batches +func openFilesToChan(filenames <-chan string, gunzip bool, concurrency int, batchSize int) <-chan extractor.InputBatch { out := make(chan extractor.InputBatch, 128) sema := make(chan struct{}, concurrency) - var wg sync.WaitGroup - wg.Add(len(filenames)) - IncSourceCount(len(filenames)) // Load as many files as the sema allows go func() { - for _, filename := range filenames { + var wg sync.WaitGroup + readCount := 0 + + bufferedFilenames := bufferChan(filenames, 100) + for filename := range bufferedFilenames { sema <- struct{}{} + wg.Add(1) + readCount++ + SetSourceCount(readCount + len(bufferedFilenames)) + go func(goFilename string) { + defer func() { + <-sema + wg.Done() + StopFileReading(goFilename) + }() + var file io.ReadCloser file, err := openFileToReader(goFilename, gunzip) if err != nil { @@ -101,16 +114,9 @@ func openFilesToChan(filenames []string, gunzip bool, concurrency int, batchSize ErrLog.Printf("Error reading %s: %v\n", goFilename, e) } extractor.SyncReadAheadToBatchChannel(goFilename, ra, batchSize, out) - - <-sema - wg.Done() - StopFileReading(goFilename) }(filename) } - }() - // Wait on all files, and close chan - go func() { wg.Wait() close(out) }() diff --git a/cmd/helpers/readProgress.go b/cmd/helpers/readProgress.go index da7f51e9..cbfe95ba 100644 --- a/cmd/helpers/readProgress.go +++ b/cmd/helpers/readProgress.go @@ -16,6 +16,11 @@ func IncSourceCount(delta int) { sourceCount += delta } +// SetSourceCount sets the number of source files +func SetSourceCount(count int) { + sourceCount = count +} + // StartFileReading registers a given source as being read in the global read-pool func StartFileReading(source string) { activeReadMutex.Lock() diff --git a/cmd/helpers/util.go b/cmd/helpers/util.go index f33a754b..b9105f73 100644 --- a/cmd/helpers/util.go +++ b/cmd/helpers/util.go @@ -5,6 +5,18 @@ import ( "path/filepath" ) +// Aggregate one channel into another, with a buffer +func bufferChan(in <-chan string, size int) <-chan string { + out := make(chan string, size) + go func() { + for item := range in { + out <- item + } + close(out) + }() + return out +} + func isDir(path string) bool { if fi, err := os.Stat(path); err == nil && fi.IsDir() { return true @@ -13,29 +25,34 @@ func isDir(path string) bool { } // globExpand expands a directory-glob, and optionally recurses on it -func globExpand(paths []string, recursive bool) []string { - out := make([]string, 0) - for _, p := range paths { - if recursive && isDir(p) { - filepath.Walk(p, func(walkPath string, info os.FileInfo, err error) error { +func globExpand(paths []string, recursive bool) <-chan string { + c := make(chan string, 10) + go func() { + for _, p := range paths { + if recursive && isDir(p) { + filepath.Walk(p, func(walkPath string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if !info.IsDir() { + c <- walkPath + } + return nil + }) + } else { + expanded, err := filepath.Glob(p) if err != nil { - return err + ErrLog.Printf("Path error: %v\n", err) + } else { + for _, item := range expanded { + c <- item + } } - if !info.IsDir() { - out = append(out, walkPath) - } - return nil - }) - } else { - expanded, err := filepath.Glob(p) - if err != nil { - ErrLog.Printf("Path error: %v\n", err) - } else { - out = append(out, expanded...) } } - } - return out + close(c) + }() + return c } func min(a, b int) int { diff --git a/cmd/helpers/util_test.go b/cmd/helpers/util_test.go new file mode 100644 index 00000000..24a66296 --- /dev/null +++ b/cmd/helpers/util_test.go @@ -0,0 +1,30 @@ +package helpers + +import ( + "sync" + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func TestBufferingChan(t *testing.T) { + var wg sync.WaitGroup + + c := make(chan string) + wg.Add(1) + go func() { + for i := 0; i < 100; i++ { + c <- "hi" + } + close(c) + wg.Done() + }() + + bc := bufferChan(c, 100) + wg.Wait() + + assert.Eventually(t, func() bool { + return len(bc) == 100 + }, 1*time.Second, 10*time.Millisecond) +}