diff --git a/pkg/extractor/dirwalk/globExpand.go b/pkg/extractor/dirwalk/globExpand.go
index 7b88bae..afe23c1 100644
--- a/pkg/extractor/dirwalk/globExpand.go
+++ b/pkg/extractor/dirwalk/globExpand.go
@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"rare/pkg/extractor/dirwalk/iterwalk"
 	"sync/atomic"
 )
 
@@ -73,7 +74,7 @@ func (s *Walker) recurseWalk(c chan<- string, p string, visited map[string]strin
 		rootDevId = getDeviceId(p)
 	}
 
-	filepath.WalkDir(p, func(walkPath string, info os.DirEntry, err error) error {
+	iterwalk.WalkDir(p, func(walkPath string, info os.DirEntry, err error) error {
 		switch {
 		case err != nil: // error
 			s.onError(fmt.Errorf("path error: %w", err))
diff --git a/pkg/extractor/dirwalk/globExpand_test.go b/pkg/extractor/dirwalk/globExpand_test.go
index 5e15aad..52f084b 100644
--- a/pkg/extractor/dirwalk/globExpand_test.go
+++ b/pkg/extractor/dirwalk/globExpand_test.go
@@ -287,8 +287,11 @@ func TestNoDoubleTraverseSymlink(t *testing.T) {
 	}
 
 	files := collectChan(walker.Walk(p))
-	assert.Equal(t, 1, countContains(files, "op1"))
-	assert.Equal(t, 0, countContains(files, "op2"))
+
+	// exclusive or on the two possible options (should walk one of them)
+	has1, has2 := countContains(files, "op1") > 0, countContains(files, "op2") > 0
+	assert.True(t, has1 != has2)
+
+	assert.True(t, hadError)
 	assert.Equal(t, uint64(0), walker.ExcludedCount())
 }
diff --git a/pkg/extractor/dirwalk/iterwalk/iterwalk.go b/pkg/extractor/dirwalk/iterwalk/iterwalk.go
new file mode 100644
index 0000000..293e67a
--- /dev/null
+++ b/pkg/extractor/dirwalk/iterwalk/iterwalk.go
@@ -0,0 +1,94 @@
+package iterwalk
+
+import (
+	"io"
+	"io/fs"
+	"os"
+	"path/filepath"
+)
+
+/*
+This is a fork of filepath.WalkDir that reimplements the os.ReadDir function
+with one that opens and incrementally iterates on the paths without sorting.
+
+For directories with many files, this both improves response time and
+reduces the overall query time of the path. For smaller directories it does
+add some overhead, but it's negligible (microseconds).
+*/
+
+// readBatchSize is the maximum number of entries requested per ReadDir call.
+const readBatchSize = 1000
+
+// WalkDir walks the file tree rooted at root, calling fn for each file or
+// directory in the tree, including root. It follows filepath.WalkDir, except
+// that directory entries are read in batches of readBatchSize and visited in
+// directory order instead of lexical order.
+func WalkDir(root string, fn fs.WalkDirFunc) error {
+	info, err := os.Lstat(root)
+	if err != nil {
+		err = fn(root, nil, err)
+	} else {
+		err = walkDir(root, fs.FileInfoToDirEntry(info), fn)
+	}
+	if err == filepath.SkipDir || err == filepath.SkipAll {
+		return nil
+	}
+	return err
+}
+
+// walkDir is a copy of filepath.walkDir, but switching from os.ReadDir to
+// batched reads on an open directory handle.
+func walkDir(path string, d fs.DirEntry, walkDirFn fs.WalkDirFunc) error {
+	if err := walkDirFn(path, d, nil); err != nil || !d.IsDir() {
+		if err == filepath.SkipDir && d.IsDir() {
+			// Successfully skipped directory.
+			err = nil
+		}
+		return err
+	}
+
+	f, err := os.Open(path)
+	if err != nil {
+		// Second call, to report the Open error. There is no handle to read
+		// from, so return even when the callback returns nil; falling through
+		// would call ReadDir on a nil *os.File and report a second error.
+		err = walkDirFn(path, d, err)
+		if err == filepath.SkipDir && d.IsDir() {
+			err = nil
+		}
+		return err
+	}
+	defer f.Close()
+
+OUTER:
+	for {
+		dirs, err := f.ReadDir(readBatchSize)
+		if err != nil && err != io.EOF {
+			// Second call, to report the ReadDir error.
+			err = walkDirFn(path, d, err)
+			if err != nil {
+				if err == filepath.SkipDir && d.IsDir() {
+					err = nil
+				}
+				return err
+			}
+		}
+
+		if len(dirs) == 0 {
+			break
+		}
+
+		for _, d1 := range dirs {
+			path1 := filepath.Join(path, d1.Name())
+			if err := walkDir(path1, d1, walkDirFn); err != nil {
+				if err == filepath.SkipDir {
+					// Skip the rest of this directory, unread batches included.
+					break OUTER
+				}
+				return err
+			}
+		}
+	}
+
+	return nil
+}
diff --git a/pkg/extractor/dirwalk/iterwalk/iterwalk_test.go b/pkg/extractor/dirwalk/iterwalk/iterwalk_test.go
new file mode 100644
index 0000000..a00ccda
--- /dev/null
+++ b/pkg/extractor/dirwalk/iterwalk/iterwalk_test.go
@@ -0,0 +1,134 @@
+package iterwalk
+
+import (
+	"io/fs"
+	"os"
+	"path/filepath"
+	"strconv"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// BenchmarkFilepath-4 96153 11961 ns/op 681 B/op 19 allocs/op
+func BenchmarkFilepath(b *testing.B) {
+	for range b.N {
+		filepath.WalkDir("./", func(path string, d fs.DirEntry, err error) error { return err })
+	}
+}
+
+// BenchmarkIter-4 86143 14648 ns/op 681 B/op 19 allocs/op
+func BenchmarkIter(b *testing.B) {
+	for range b.N {
+		WalkDir("./", func(path string, d fs.DirEntry, err error) error { return err })
+	}
+}
+
+func TestWalkDir_File(t *testing.T) {
+	dir := t.TempDir()
+	file := filepath.Join(dir, "testfile.txt")
+	os.WriteFile(file, []byte("hello"), 0644)
+
+	var walked []string
+	err := WalkDir(file, func(path string, d fs.DirEntry, err error) error {
+		walked = append(walked, path)
+		return nil
+	})
+	assert.NoError(t, err)
+	assert.Equal(t, []string{file}, walked)
+}
+
+func TestWalkDir_Dir(t *testing.T) {
+	dir := t.TempDir()
+	os.WriteFile(filepath.Join(dir, "a.txt"), []byte("a"), 0644)
+	os.WriteFile(filepath.Join(dir, "b.txt"), []byte("b"), 0644)
+
+	var walked []string
+	err := WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
+		walked = append(walked, path)
+		return nil
+	})
+	assert.NoError(t, err)
+	assert.Len(t, walked, 3)
+}
+
+func TestWalkDir_SkipDir(t *testing.T) {
+	dir := t.TempDir()
+	sub := filepath.Join(dir, "subdir")
+	os.Mkdir(sub, 0755)
+	os.WriteFile(filepath.Join(sub, "file.txt"), []byte("x"), 0644)
+
+	var walked []string
+	err := WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
+		walked = append(walked, path)
+		if d != nil && d.IsDir() && path == sub {
+			return filepath.SkipDir
+		}
+		return nil
+	})
+	assert.NoError(t, err)
+	assert.Contains(t, walked, sub)
+	assert.NotContains(t, walked, filepath.Join(sub, "file.txt"))
+}
+
+func TestWalkDir_SkipDir_Large(t *testing.T) {
+	if testing.Short() {
+		t.Skip()
+	}
+
+	dir := t.TempDir()
+	sub := filepath.Join(dir, "bigsubdir")
+	os.Mkdir(sub, 0755)
+	// Create more files than fit in a single ReadDir batch
+	for i := range readBatchSize + 1 {
+		os.WriteFile(filepath.Join(sub, "file_"+strconv.Itoa(i)+".txt"), []byte("x"), 0644)
+	}
+
+	var walked []string
+	err := WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
+		if d != nil && !d.IsDir() {
+			// Record the visit before skipping so the assertion below can
+			// detect files reached after SkipDir was returned.
+			walked = append(walked, path)
+			return filepath.SkipDir
+		}
+		return nil
+	})
+	assert.NoError(t, err)
+
+	// SkipDir on the first file must abandon the rest of the directory,
+	// including entries that would only arrive in a later ReadDir batch.
+	assert.Len(t, walked, 1)
+}
+
+func TestWalkDir_OpenError(t *testing.T) {
+	dir := t.TempDir()
+	sub := filepath.Join(dir, "gone")
+	os.Mkdir(sub, 0755)
+
+	var errs []error
+	err := WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
+		if err != nil {
+			errs = append(errs, err)
+			return nil
+		}
+		if path == sub {
+			// Remove the directory after it was listed so the walker's Open fails.
+			os.Remove(sub)
+		}
+		return nil
+	})
+	assert.NoError(t, err)
+
+	// The failure is reported exactly once, with the real cause.
+	if assert.Len(t, errs, 1) {
+		assert.True(t, os.IsNotExist(errs[0]))
+	}
+}
+
+func TestWalkDir_Error(t *testing.T) {
+	err := WalkDir("/nonexistent/path", func(path string, d fs.DirEntry, err error) error {
+		return err
+	})
+	assert.Error(t, err)
+}