From 780233b882f3ac217808f4c65ba357348b49586d Mon Sep 17 00:00:00 2001
From: Steven Masley
Date: Sun, 22 Jun 2025 22:56:09 -0500
Subject: [PATCH 1/5] test: unit test to exercise polluted file cache with
 error

---
 coderd/files/cache_test.go | 60 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/coderd/files/cache_test.go b/coderd/files/cache_test.go
index 5efb4ba19be28..7c270f9e90347 100644
--- a/coderd/files/cache_test.go
+++ b/coderd/files/cache_test.go
@@ -2,6 +2,7 @@ package files_test
 
 import (
 	"context"
+	"sync"
 	"sync/atomic"
 	"testing"
 	"time"
@@ -9,7 +10,9 @@ import (
 	"github.com/google/uuid"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/spf13/afero"
+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
+	"go.uber.org/mock/gomock"
 	"golang.org/x/sync/errgroup"
 
 	"cdr.dev/slog/sloggers/slogtest"
@@ -18,6 +21,7 @@ import (
 	"github.com/coder/coder/v2/coderd/database"
 	"github.com/coder/coder/v2/coderd/database/dbauthz"
 	"github.com/coder/coder/v2/coderd/database/dbgen"
+	"github.com/coder/coder/v2/coderd/database/dbmock"
 	"github.com/coder/coder/v2/coderd/database/dbtestutil"
 	"github.com/coder/coder/v2/coderd/files"
 	"github.com/coder/coder/v2/coderd/rbac"
@@ -25,6 +29,62 @@ import (
 	"github.com/coder/coder/v2/testutil"
 )
 
+// TestCancelledFetch runs two Acquire calls. The first fails with a
+// ctx.Canceled error. The second call should ignore the first error and fetch
+// the file again, which should succeed.
+func TestCancelledFetch(t *testing.T) {
+	t.Parallel()
+
+	fileID := uuid.New()
+	rdy := make(chan struct{})
+	dbM := dbmock.NewMockStore(gomock.NewController(t))
+
+	// First call should fail
+	dbM.EXPECT().GetFileByID(gomock.Any(), gomock.Any()).DoAndReturn(func(mTx context.Context, fileID uuid.UUID) (database.File, error) {
+		// Wait long enough for the second call to be queued up.
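+		// Blocking on rdy keeps this failing fetch in flight until the test has
+		// given the second Acquire call time to queue up behind the same cache
+		// entry.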
+ <-rdy + return database.File{}, context.Canceled + }) + + // Second call should succeed + dbM.EXPECT().GetFileByID(gomock.Any(), gomock.Any()).DoAndReturn(func(mTx context.Context, fileID uuid.UUID) (database.File, error) { + return database.File{ + ID: fileID, + Data: make([]byte, 100), + }, nil + }) + + //nolint:gocritic // Unit testing + ctx := dbauthz.AsFileReader(testutil.Context(t, testutil.WaitShort)) + cache := files.NewFromStore(dbM, prometheus.NewRegistry(), &coderdtest.FakeAuthorizer{}) + + var wg sync.WaitGroup + wg.Add(2) + + // First call that will fail + go func() { + _, err := cache.Acquire(ctx, fileID) + assert.ErrorIs(t, err, context.Canceled) + wg.Done() + }() + + // Second call, that should succeed + go func() { + fs, err := cache.Acquire(ctx, fileID) + assert.NoError(t, err) + if fs != nil { + fs.Close() + } + wg.Done() + }() + + // We need that second Acquire call to be queued up + time.Sleep(testutil.IntervalFast) + + close(rdy) + wg.Wait() +} + // nolint:paralleltest,tparallel // Serially testing is easier func TestCacheRBAC(t *testing.T) { t.Parallel() From 8a6deb18b033828460782a5703fe3179d4b4074c Mon Sep 17 00:00:00 2001 From: McKayla Washburn Date: Tue, 24 Jun 2025 22:38:41 +0000 Subject: [PATCH 2/5] chore: purge file cache entries on error --- coderd/files/cache.go | 226 ++++++++++++++++++++++++------------------ 1 file changed, 130 insertions(+), 96 deletions(-) diff --git a/coderd/files/cache.go b/coderd/files/cache.go index 3698aac9286c8..d139c15117c94 100644 --- a/coderd/files/cache.go +++ b/coderd/files/cache.go @@ -25,60 +25,61 @@ type FileAcquirer interface { // New returns a file cache that will fetch files from a database func New(registerer prometheus.Registerer, authz rbac.Authorizer) *Cache { - return (&Cache{ - lock: sync.Mutex{}, - data: make(map[uuid.UUID]*cacheEntry), - authz: authz, - }).registerMetrics(registerer) + return &Cache{ + lock: sync.Mutex{}, + data: make(map[uuid.UUID]*cacheEntry), + authz: authz, + cacheMetrics: newCacheMetrics(registerer), + } } -func (c *Cache) registerMetrics(registerer prometheus.Registerer) *Cache { +func newCacheMetrics(registerer prometheus.Registerer) cacheMetrics { subsystem := "file_cache" f := promauto.With(registerer) - c.currentCacheSize = f.NewGauge(prometheus.GaugeOpts{ - Namespace: "coderd", - Subsystem: subsystem, - Name: "open_files_size_bytes_current", - Help: "The current amount of memory of all files currently open in the file cache.", - }) - - c.totalCacheSize = f.NewCounter(prometheus.CounterOpts{ - Namespace: "coderd", - Subsystem: subsystem, - Name: "open_files_size_bytes_total", - Help: "The total amount of memory ever opened in the file cache. This number never decrements.", - }) - - c.currentOpenFiles = f.NewGauge(prometheus.GaugeOpts{ - Namespace: "coderd", - Subsystem: subsystem, - Name: "open_files_current", - Help: "The count of unique files currently open in the file cache.", - }) - - c.totalOpenedFiles = f.NewCounter(prometheus.CounterOpts{ - Namespace: "coderd", - Subsystem: subsystem, - Name: "open_files_total", - Help: "The total count of unique files ever opened in the file cache.", - }) - - c.currentOpenFileReferences = f.NewGauge(prometheus.GaugeOpts{ - Namespace: "coderd", - Subsystem: subsystem, - Name: "open_file_refs_current", - Help: "The count of file references currently open in the file cache. 
Multiple references can be held for the same file.", - }) - - c.totalOpenFileReferences = f.NewCounterVec(prometheus.CounterOpts{ - Namespace: "coderd", - Subsystem: subsystem, - Name: "open_file_refs_total", - Help: "The total number of file references ever opened in the file cache. The 'hit' label indicates if the file was loaded from the cache.", - }, []string{"hit"}) - - return c + return cacheMetrics{ + currentCacheSize: f.NewGauge(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: subsystem, + Name: "open_files_size_bytes_current", + Help: "The current amount of memory of all files currently open in the file cache.", + }), + + totalCacheSize: f.NewCounter(prometheus.CounterOpts{ + Namespace: "coderd", + Subsystem: subsystem, + Name: "open_files_size_bytes_total", + Help: "The total amount of memory ever opened in the file cache. This number never decrements.", + }), + + currentOpenFiles: f.NewGauge(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: subsystem, + Name: "open_files_current", + Help: "The count of unique files currently open in the file cache.", + }), + + totalOpenedFiles: f.NewCounter(prometheus.CounterOpts{ + Namespace: "coderd", + Subsystem: subsystem, + Name: "open_files_total", + Help: "The total count of unique files ever opened in the file cache.", + }), + + currentOpenFileReferences: f.NewGauge(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: subsystem, + Name: "open_file_refs_current", + Help: "The count of file references currently open in the file cache. Multiple references can be held for the same file.", + }), + + totalOpenFileReferences: f.NewCounterVec(prometheus.CounterOpts{ + Namespace: "coderd", + Subsystem: subsystem, + Name: "open_file_refs_total", + Help: "The total number of file references ever opened in the file cache. The 'hit' label indicates if the file was loaded from the cache.", + }, []string{"hit"}), + } } // Cache persists the files for template versions, and is used by dynamic @@ -106,18 +107,21 @@ type cacheMetrics struct { totalCacheSize prometheus.Counter } +type cacheEntry struct { + // refCount must only be accessed while the cacheEntry lock is held. + lock sync.Mutex + refCount int + value *lazy.ValueWithError[CacheEntryValue] + + close func() +} + type CacheEntryValue struct { fs.FS Object rbac.Object Size int64 } -type cacheEntry struct { - // refCount must only be accessed while the Cache lock is held. - refCount int - value *lazy.ValueWithError[CacheEntryValue] -} - var _ fs.FS = (*CloseFS)(nil) // CloseFS is a wrapper around fs.FS that implements io.Closer. The Close() @@ -142,93 +146,116 @@ func (c *Cache) Acquire(ctx context.Context, db database.Store, fileID uuid.UUID // mutex has been released, or we would continue to hold the lock until the // entire file has been fetched, which may be slow, and would prevent other // files from being fetched in parallel. - it, err := c.prepare(ctx, db, fileID).Load() + e := c.prepare(ctx, db, fileID) + ev, err := e.value.Load() if err != nil { - c.release(fileID) + c.purge(fileID) + return nil, err + } + + // We always run the fetch under a system context and actor, so we need to check the caller's + // context manually before returning. + + // Check if the caller's context was canceled + if err := ctx.Err(); err != nil { return nil, err } + // Check that the caller is authorized to access the file subject, ok := dbauthz.ActorFromContext(ctx) if !ok { return nil, dbauthz.ErrNoActor } - // Always check the caller can actually read the file. 
- if err := c.authz.Authorize(ctx, subject, policy.ActionRead, it.Object); err != nil { - c.release(fileID) + if err := c.authz.Authorize(ctx, subject, policy.ActionRead, ev.Object); err != nil { + e.close() return nil, err } - var once sync.Once + var closeOnce sync.Once return &CloseFS{ - FS: it.FS, + FS: ev.FS, close: func() { // sync.Once makes the Close() idempotent, so we can call it // multiple times without worrying about double-releasing. - once.Do(func() { c.release(fileID) }) + closeOnce.Do(func() { + e.close() + }) }, }, nil } -func (c *Cache) prepare(ctx context.Context, db database.Store, fileID uuid.UUID) *lazy.ValueWithError[CacheEntryValue] { +func (c *Cache) prepare(ctx context.Context, db database.Store, fileID uuid.UUID) *cacheEntry { c.lock.Lock() defer c.lock.Unlock() hitLabel := "true" entry, ok := c.data[fileID] if !ok { - value := lazy.NewWithError(func() (CacheEntryValue, error) { - val, err := fetch(ctx, db, fileID) + hitLabel = "false" - // Always add to the cache size the bytes of the file loaded. - if err == nil { + var releaseOnce sync.Once + entry = &cacheEntry{ + refCount: 0, + value: lazy.NewWithError(func() (CacheEntryValue, error) { + val, err := fetch(db, fileID) + if err != nil { + // Force future calls to Acquire to trigger a new fetch as soon as + // a fetch has failed, even if references are still held. + delete(c.data, fileID) + return val, err + } + + // Add the size of the file to the cache size metrics. c.currentCacheSize.Add(float64(val.Size)) c.totalCacheSize.Add(float64(val.Size)) - } - return val, err - }) + return val, err + }), - entry = &cacheEntry{ - value: value, - refCount: 0, + close: func() { + entry.lock.Lock() + defer entry.lock.Unlock() + + entry.refCount-- + c.currentOpenFileReferences.Dec() + + if entry.refCount == 0 { + releaseOnce.Do(func() { + c.purge(fileID) + }) + } + }, } c.data[fileID] = entry + c.currentOpenFiles.Inc() c.totalOpenedFiles.Inc() - hitLabel = "false" } + entry.lock.Lock() + defer entry.lock.Unlock() c.currentOpenFileReferences.Inc() c.totalOpenFileReferences.WithLabelValues(hitLabel).Inc() entry.refCount++ - return entry.value + return entry } -// release decrements the reference count for the given fileID, and frees the -// backing data if there are no further references being held. -// -// release should only be called after a successful call to Acquire using the Release() -// method on the returned *CloseFS. -func (c *Cache) release(fileID uuid.UUID) { +// purge immediately removes an entry from the cache. It should be called +func (c *Cache) purge(fileID uuid.UUID) { c.lock.Lock() defer c.lock.Unlock() entry, ok := c.data[fileID] if !ok { - // If we land here, it's almost certainly because a bug already happened, - // and we're freeing something that's already been freed, or we're calling - // this function with an incorrect ID. Should this function return an error? - return - } - - c.currentOpenFileReferences.Dec() - entry.refCount-- - if entry.refCount > 0 { + // If we land here, it's probably because of a fetch attempt that + // resulted in an error, and got purged already. It may also be an + // erroneous extra close, but we can't really distinguish between those + // two cases currently. return } + // Purge the file from the cache. 
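+	// Roll back the metrics that were recorded when this entry was opened so
+	// the gauges reflect the purge immediately.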
c.currentOpenFiles.Dec() - ev, err := entry.value.Load() if err == nil { c.currentCacheSize.Add(-1 * float64(ev.Size)) @@ -246,11 +273,18 @@ func (c *Cache) Count() int { return len(c.data) } -func fetch(ctx context.Context, store database.Store, fileID uuid.UUID) (CacheEntryValue, error) { - // Make sure the read does not fail due to authorization issues. - // Authz is checked on the Acquire call, so this is safe. +func fetch(store database.Store, fileID uuid.UUID) (CacheEntryValue, error) { + // Because many callers can be waiting on the same file fetch concurrently, we + // want to prevent any failures that would cause them all to receive errors + // because the caller who initiated the fetch would fail. + // - We always run the fetch with an uncancelable context, and then check + // context cancellation for each acquirer afterwards. + // - We always run the fetch as a system user, and then check authorization + // for each acquirer afterwards. + // This prevents a canceled context or an unauthorized user from "holding up + // the queue". //nolint:gocritic - file, err := store.GetFileByID(dbauthz.AsFileReader(ctx), fileID) + file, err := store.GetFileByID(dbauthz.AsFileReader(context.Background()), fileID) if err != nil { return CacheEntryValue{}, xerrors.Errorf("failed to read file from database: %w", err) } From 610740a330de7649728da317ef2e7492bb8d375e Mon Sep 17 00:00:00 2001 From: McKayla Washburn Date: Tue, 24 Jun 2025 22:48:45 +0000 Subject: [PATCH 3/5] proper release gating --- coderd/files/cache.go | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/coderd/files/cache.go b/coderd/files/cache.go index d139c15117c94..44f914d2ed596 100644 --- a/coderd/files/cache.go +++ b/coderd/files/cache.go @@ -194,6 +194,11 @@ func (c *Cache) prepare(ctx context.Context, db database.Store, fileID uuid.UUID hitLabel = "false" var releaseOnce sync.Once + release := func() { + releaseOnce.Do(func() { + c.purge(fileID) + }) + } entry = &cacheEntry{ refCount: 0, value: lazy.NewWithError(func() (CacheEntryValue, error) { @@ -201,7 +206,8 @@ func (c *Cache) prepare(ctx context.Context, db database.Store, fileID uuid.UUID if err != nil { // Force future calls to Acquire to trigger a new fetch as soon as // a fetch has failed, even if references are still held. - delete(c.data, fileID) + entry.close() + release() return val, err } @@ -218,12 +224,11 @@ func (c *Cache) prepare(ctx context.Context, db database.Store, fileID uuid.UUID entry.refCount-- c.currentOpenFileReferences.Dec() - - if entry.refCount == 0 { - releaseOnce.Do(func() { - c.purge(fileID) - }) + if entry.refCount > 0 { + return } + + release() }, } c.data[fileID] = entry From ec534596493b0de45e93a1619b3f30b8bfc2b7d2 Mon Sep 17 00:00:00 2001 From: McKayla Washburn Date: Tue, 24 Jun 2025 23:11:37 +0000 Subject: [PATCH 4/5] lint --- coderd/files/cache.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/coderd/files/cache.go b/coderd/files/cache.go index 44f914d2ed596..4084ec77a34b6 100644 --- a/coderd/files/cache.go +++ b/coderd/files/cache.go @@ -146,7 +146,7 @@ func (c *Cache) Acquire(ctx context.Context, db database.Store, fileID uuid.UUID // mutex has been released, or we would continue to hold the lock until the // entire file has been fetched, which may be slow, and would prevent other // files from being fetched in parallel. 
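+	// The caller's context is deliberately not passed to prepare: the fetch runs
+	// under its own system context, and cancellation and authorization are
+	// checked per caller once the shared entry has loaded.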
- e := c.prepare(ctx, db, fileID) + e := c.prepare(db, fileID) ev, err := e.value.Load() if err != nil { c.purge(fileID) @@ -184,7 +184,7 @@ func (c *Cache) Acquire(ctx context.Context, db database.Store, fileID uuid.UUID }, nil } -func (c *Cache) prepare(ctx context.Context, db database.Store, fileID uuid.UUID) *cacheEntry { +func (c *Cache) prepare(db database.Store, fileID uuid.UUID) *cacheEntry { c.lock.Lock() defer c.lock.Unlock() From df7acff875ed99a2bfb39acde732f7b3b3f55908 Mon Sep 17 00:00:00 2001 From: McKayla Washburn Date: Tue, 24 Jun 2025 23:45:10 +0000 Subject: [PATCH 5/5] I win at debugging deadlocks today --- coderd/files/cache.go | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/coderd/files/cache.go b/coderd/files/cache.go index 4084ec77a34b6..a0272c7055b49 100644 --- a/coderd/files/cache.go +++ b/coderd/files/cache.go @@ -114,6 +114,7 @@ type cacheEntry struct { value *lazy.ValueWithError[CacheEntryValue] close func() + purge func() } type CacheEntryValue struct { @@ -149,7 +150,8 @@ func (c *Cache) Acquire(ctx context.Context, db database.Store, fileID uuid.UUID e := c.prepare(db, fileID) ev, err := e.value.Load() if err != nil { - c.purge(fileID) + e.close() + e.purge() return nil, err } @@ -193,9 +195,9 @@ func (c *Cache) prepare(db database.Store, fileID uuid.UUID) *cacheEntry { if !ok { hitLabel = "false" - var releaseOnce sync.Once - release := func() { - releaseOnce.Do(func() { + var purgeOnce sync.Once + purge := func() { + purgeOnce.Do(func() { c.purge(fileID) }) } @@ -204,10 +206,6 @@ func (c *Cache) prepare(db database.Store, fileID uuid.UUID) *cacheEntry { value: lazy.NewWithError(func() (CacheEntryValue, error) { val, err := fetch(db, fileID) if err != nil { - // Force future calls to Acquire to trigger a new fetch as soon as - // a fetch has failed, even if references are still held. - entry.close() - release() return val, err } @@ -228,8 +226,10 @@ func (c *Cache) prepare(db database.Store, fileID uuid.UUID) *cacheEntry { return } - release() + purge() }, + + purge: purge, } c.data[fileID] = entry @@ -245,7 +245,8 @@ func (c *Cache) prepare(db database.Store, fileID uuid.UUID) *cacheEntry { return entry } -// purge immediately removes an entry from the cache. It should be called +// purge immediately removes an entry from the cache, even if it has open references. +// It should only be called from the `close` function in a `cacheEntry`. func (c *Cache) purge(fileID uuid.UUID) { c.lock.Lock() defer c.lock.Unlock()