From f11fea6f3010429f6b9779d85586ab91f0f3a220 Mon Sep 17 00:00:00 2001 From: Hugo Dutka Date: Sat, 12 Apr 2025 17:46:03 +0000 Subject: [PATCH 1/4] set up a terraform providers mirror for tests --- provisioner/terraform/executor.go | 8 +- provisioner/terraform/provision_test.go | 187 ++++++++++++++++++++++-- provisioner/terraform/serve.go | 45 +++--- testutil/cache.go | 25 ++++ 4 files changed, 231 insertions(+), 34 deletions(-) create mode 100644 testutil/cache.go diff --git a/provisioner/terraform/executor.go b/provisioner/terraform/executor.go index 150f51e6dd10d..442ed36074eb2 100644 --- a/provisioner/terraform/executor.go +++ b/provisioner/terraform/executor.go @@ -35,8 +35,9 @@ type executor struct { mut *sync.Mutex binaryPath string // cachePath and workdir must not be used by multiple processes at once. - cachePath string - workdir string + cachePath string + cliConfigPath string + workdir string // used to capture execution times at various stages timings *timingAggregator } @@ -50,6 +51,9 @@ func (e *executor) basicEnv() []string { if e.cachePath != "" && runtime.GOOS == "linux" { env = append(env, "TF_PLUGIN_CACHE_DIR="+e.cachePath) } + if e.cliConfigPath != "" { + env = append(env, "TF_CLI_CONFIG_FILE="+e.cliConfigPath) + } return env } diff --git a/provisioner/terraform/provision_test.go b/provisioner/terraform/provision_test.go index e7b64046f3ab3..f2d790da744c1 100644 --- a/provisioner/terraform/provision_test.go +++ b/provisioner/terraform/provision_test.go @@ -3,13 +3,17 @@ package terraform_test import ( + "bytes" "context" + "crypto/sha256" + "encoding/hex" "encoding/json" "errors" "fmt" "net" "net/http" "os" + "os/exec" "path/filepath" "sort" "strings" @@ -29,10 +33,11 @@ import ( ) type provisionerServeOptions struct { - binaryPath string - exitTimeout time.Duration - workDir string - logger *slog.Logger + binaryPath string + cliConfigPath string + exitTimeout time.Duration + workDir string + logger *slog.Logger } func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Context, proto.DRPCProvisionerClient) { @@ -66,9 +71,10 @@ func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Cont Logger: *opts.logger, WorkDirectory: opts.workDir, }, - BinaryPath: opts.binaryPath, - CachePath: cachePath, - ExitTimeout: opts.exitTimeout, + BinaryPath: opts.binaryPath, + CachePath: cachePath, + ExitTimeout: opts.exitTimeout, + CliConfigPath: opts.cliConfigPath, }) }() api := proto.NewDRPCProvisionerClient(client) @@ -85,6 +91,149 @@ func configure(ctx context.Context, t *testing.T, client proto.DRPCProvisionerCl return sess } +func hashTemplateFilesAndTestName(t *testing.T, testName string, templateFiles map[string]string) string { + t.Helper() + + sortedFileNames := make([]string, 0, len(templateFiles)) + for fileName := range templateFiles { + sortedFileNames = append(sortedFileNames, fileName) + } + sort.Strings(sortedFileNames) + + hasher := sha256.New() + for _, fileName := range sortedFileNames { + file := templateFiles[fileName] + _, err := hasher.Write([]byte(fileName)) + require.NoError(t, err) + _, err = hasher.Write([]byte(file)) + require.NoError(t, err) + } + _, err := hasher.Write([]byte(testName)) + require.NoError(t, err) + return hex.EncodeToString(hasher.Sum(nil)) +} + +const ( + terraformConfigFileName = "terraform.rc" + cacheProvidersDirName = "providers" + cacheTemplateFilesDirName = "files" +) + +// Writes a Terraform CLI config file (`terraform.rc`) in `dir` to enforce using the local provider mirror. +// This blocks network access for providers, forcing Terraform to use only what's cached in `dir`. +// Returns the path to the generated config file. +func writeCliConfig(t *testing.T, dir string) string { + t.Helper() + + cliConfigPath := filepath.Join(dir, terraformConfigFileName) + require.NoError(t, os.MkdirAll(filepath.Dir(cliConfigPath), 0o700)) + + content := fmt.Sprintf(` + provider_installation { + filesystem_mirror { + path = "%s" + include = ["*/*"] + } + direct { + exclude = ["*/*"] + } + } + `, filepath.Join(dir, cacheProvidersDirName)) + require.NoError(t, os.WriteFile(cliConfigPath, []byte(content), 0o600)) + return cliConfigPath +} + +func runCmd(t *testing.T, dir string, args ...string) { + t.Helper() + + stdout, stderr := bytes.NewBuffer(nil), bytes.NewBuffer(nil) + cmd := exec.Command(args[0], args[1:]...) //#nosec + cmd.Dir = dir + cmd.Stdout = stdout + cmd.Stderr = stderr + if err := cmd.Run(); err != nil { + t.Fatalf("failed to run %s: %s\nstdout: %s\nstderr: %s", strings.Join(args, " "), err, stdout.String(), stderr.String()) + } +} + +// Ensures Terraform providers are downloaded and cached locally in a unique directory for this test. +// Uses `terraform init` then `mirror` to populate the cache if needed. +// Returns the cache directory path. +func downloadProviders(t *testing.T, rootDir string, templateFiles map[string]string) string { + t.Helper() + + // Each test gets a unique cache dir based on its name and template files. + // This ensures that tests can download providers in parallel and that they + // will redownload providers if the template files change. + hash := hashTemplateFilesAndTestName(t, t.Name(), templateFiles) + dir := filepath.Join(rootDir, hash[:12]) + if _, err := os.Stat(dir); err == nil { + t.Logf("%s: using cached terraform providers", t.Name()) + return dir + } + filesDir := filepath.Join(dir, cacheTemplateFilesDirName) + defer func() { + // The files dir will contain a copy of terraform providers generated + // by the terraform init command. We don't want to persist them since + // we already have a registry mirror in the providers dir. + if err := os.RemoveAll(filesDir); err != nil { + t.Logf("failed to remove files dir %s: %s", filesDir, err) + } + if !t.Failed() { + return + } + if err := os.RemoveAll(dir); err != nil { + t.Logf("failed to remove dir %s: %s", dir, err) + } + }() + + require.NoError(t, os.MkdirAll(filesDir, 0o700)) + + for fileName, file := range templateFiles { + filePath := filepath.Join(filesDir, fileName) + if _, err := os.Stat(filePath); os.IsNotExist(err) { + require.NoError(t, os.MkdirAll(filepath.Dir(filePath), 0o700)) + require.NoError(t, os.WriteFile(filePath, []byte(file), 0o600)) + } + } + + providersDir := filepath.Join(dir, cacheProvidersDirName) + require.NoError(t, os.MkdirAll(providersDir, 0o700)) + + // We need to run init because if a test uses modules in its template, + // the mirror command will fail without it. + runCmd(t, filesDir, "terraform", "init") + // Now, mirror the providers into `providersDir`. We use this explicit mirror + // instead of relying only on the standard Terraform plugin cache. + // + // Why? Because this mirror, when used with the CLI config from `writeCliConfig`, + // prevents Terraform from hitting the network registry during `plan`. This cuts + // down on network calls, making CI tests less flaky. + // + // In contrast, the standard cache *still* contacts the registry for metadata + // during `init`, even if the plugins are already cached locally - see link below. + // + // Ref: https://developer.hashicorp.com/terraform/cli/config/config-file#provider-plugin-cache + // > When a plugin cache directory is enabled, the terraform init command will + // > still use the configured or implied installation methods to obtain metadata + // > about which plugins are available + runCmd(t, filesDir, "terraform", "providers", "mirror", providersDir) + + return dir +} + +// Caches providers locally and generates a Terraform CLI config to use *only* that cache. +// This setup prevents network access for providers during `terraform init`, improving reliability +// in subsequent test runs. +// Returns the path to the generated CLI config file. +func cacheProviders(t *testing.T, templateFiles map[string]string, rootDir string) string { + t.Helper() + + providersParentDir := downloadProviders(t, rootDir, templateFiles) + cliConfigPath := writeCliConfig(t, providersParentDir) + return cliConfigPath +} + func readProvisionLog(t *testing.T, response proto.DRPCProvisioner_SessionClient) string { var logBuf strings.Builder for { @@ -352,6 +501,8 @@ func TestProvision(t *testing.T) { Apply bool // Some tests may need to be skipped until the relevant provider version is released. SkipReason string + // If SkipCacheProviders is true, then skip caching the terraform providers for this test. + SkipCacheProviders bool }{ { Name: "missing-variable", @@ -422,16 +573,18 @@ func TestProvision(t *testing.T) { Files: map[string]string{ "main.tf": `a`, }, - ErrorContains: "initialize terraform", - ExpectLogContains: "Argument or block definition required", + ErrorContains: "initialize terraform", + ExpectLogContains: "Argument or block definition required", + SkipCacheProviders: true, }, { Name: "bad-syntax-2", Files: map[string]string{ "main.tf": `;asdf;`, }, - ErrorContains: "initialize terraform", - ExpectLogContains: `The ";" character is not valid.`, + ErrorContains: "initialize terraform", + ExpectLogContains: `The ";" character is not valid.`, + SkipCacheProviders: true, }, { Name: "destroy-no-state", @@ -847,7 +1000,17 @@ func TestProvision(t *testing.T) { t.Skip(testCase.SkipReason) } - ctx, api := setupProvisioner(t, nil) + cliConfigPath := "" + if !testCase.SkipCacheProviders { + cliConfigPath = cacheProviders( + t, + testCase.Files, + filepath.Join(testutil.PersistentCacheDir(t), "terraform_provision_test"), + ) + } + ctx, api := setupProvisioner(t, &provisionerServeOptions{ + cliConfigPath: cliConfigPath, + }) sess := configure(ctx, t, api, &proto.Config{ TemplateSourceArchive: testutil.CreateTar(t, testCase.Files), }) diff --git a/provisioner/terraform/serve.go b/provisioner/terraform/serve.go index a84e8caf6b5ab..562946d8ef92e 100644 --- a/provisioner/terraform/serve.go +++ b/provisioner/terraform/serve.go @@ -28,7 +28,9 @@ type ServeOptions struct { BinaryPath string // CachePath must not be used by multiple processes at once. CachePath string - Tracer trace.Tracer + // CliConfigPath is the path to the Terraform CLI config file. + CliConfigPath string + Tracer trace.Tracer // ExitTimeout defines how long we will wait for a running Terraform // command to exit (cleanly) if the provision was stopped. This @@ -132,22 +134,24 @@ func Serve(ctx context.Context, options *ServeOptions) error { options.ExitTimeout = unhanger.HungJobExitTimeout } return provisionersdk.Serve(ctx, &server{ - execMut: &sync.Mutex{}, - binaryPath: options.BinaryPath, - cachePath: options.CachePath, - logger: options.Logger, - tracer: options.Tracer, - exitTimeout: options.ExitTimeout, + execMut: &sync.Mutex{}, + binaryPath: options.BinaryPath, + cachePath: options.CachePath, + cliConfigPath: options.CliConfigPath, + logger: options.Logger, + tracer: options.Tracer, + exitTimeout: options.ExitTimeout, }, options.ServeOptions) } type server struct { - execMut *sync.Mutex - binaryPath string - cachePath string - logger slog.Logger - tracer trace.Tracer - exitTimeout time.Duration + execMut *sync.Mutex + binaryPath string + cachePath string + cliConfigPath string + logger slog.Logger + tracer trace.Tracer + exitTimeout time.Duration } func (s *server) startTrace(ctx context.Context, name string, opts ...trace.SpanStartOption) (context.Context, trace.Span) { @@ -158,12 +162,13 @@ func (s *server) startTrace(ctx context.Context, name string, opts ...trace.Span func (s *server) executor(workdir string, stage database.ProvisionerJobTimingStage) *executor { return &executor{ - server: s, - mut: s.execMut, - binaryPath: s.binaryPath, - cachePath: s.cachePath, - workdir: workdir, - logger: s.logger.Named("executor"), - timings: newTimingAggregator(stage), + server: s, + mut: s.execMut, + binaryPath: s.binaryPath, + cachePath: s.cachePath, + cliConfigPath: s.cliConfigPath, + workdir: workdir, + logger: s.logger.Named("executor"), + timings: newTimingAggregator(stage), } } diff --git a/testutil/cache.go b/testutil/cache.go new file mode 100644 index 0000000000000..82d45da3b3322 --- /dev/null +++ b/testutil/cache.go @@ -0,0 +1,25 @@ +package testutil + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/require" +) + +// PersistentCacheDir returns a path to a directory +// that will be cached between test runs in Github Actions. +func PersistentCacheDir(t *testing.T) string { + t.Helper() + + // We don't use os.UserCacheDir() because the path it + // returns is different on different operating systems. + // This would make it harder to specify which cache dir to use + // in Github Actions. + home, err := os.UserHomeDir() + require.NoError(t, err) + dir := filepath.Join(home, ".cache", "coderv2-test") + + return dir +} From 3a9e0b3f34a771e48f533cbea126e7802dc1e648 Mon Sep 17 00:00:00 2001 From: Hugo Dutka Date: Sat, 12 Apr 2025 17:49:02 +0000 Subject: [PATCH 2/4] add cache to CI --- .github/actions/setup-test-cache/action.yaml | 55 ++++++++++++++++++++ .github/workflows/ci.yaml | 25 +++++++++ 2 files changed, 80 insertions(+) create mode 100644 .github/actions/setup-test-cache/action.yaml diff --git a/.github/actions/setup-test-cache/action.yaml b/.github/actions/setup-test-cache/action.yaml new file mode 100644 index 0000000000000..c8fd21a67e8c7 --- /dev/null +++ b/.github/actions/setup-test-cache/action.yaml @@ -0,0 +1,55 @@ +name: "Setup Test Cache" +description: | + Downloads the test cache and, if needed, uploads a new cache after the job is complete. + A PR job can use a cache if it was created by its base branch, its current + branch, or the default branch. + https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/caching-dependencies-to-speed-up-workflows#restrictions-for-accessing-a-cache +inputs: + key-prefix: + description: "Prefix for the cache key" + required: true + cache-path: + description: "Path to the cache directory" + required: true + # This path is defined in testutil/cache.go + default: "~/.cache/coderv2-test" +runs: + using: "composite" + steps: + - name: Get date values + id: dates + shell: bash + run: | + echo "year-month=$(date +'%Y-%m')" >> $GITHUB_OUTPUT + echo "prev-year-month=$(date -d 'last month' +'%Y-%m')" >> $GITHUB_OUTPUT + echo "day=$(date +'%d')" >> $GITHUB_OUTPUT + + # Using this particular key/restore-keys combination ensures that: + # 1. The cache is updated at most once a day for a given key prefix. + # 2. The cache is reset once a month for a given key prefix. + # + # TODO: As a cost optimization, we could remove caches that are older than + # a day or two. By default, depot keeps caches for 14 days, which isn't + # necessary for the test cache. + # https://depot.dev/docs/github-actions/overview#cache-retention-policy + - name: Download and optionally upload test cache + # This is a fork of actions/cache that only saves the cache if the current + # job is running on the main branch. + # Without it, PRs would create one-use caches that would linger until + # expiration and we'd be charged for them. I evaluated a couple of options + # for limiting the cache to the main branch, and forking was the simplest. + uses: coder/actions-cache@3857e1bfd93dc0ee8d12968ce41da6dbb749bad7 + with: + path: ${{ inputs.cache-path }} + # The key doesn't need to include an OS name. The action already takes + # that into account: https://github.com/actions/cache/tree/5a3ec84eff668545956fd18022155c47e93e2684?tab=readme-ov-file#cache-version + # Cache entries are immutable. If an entry under the key already exists, + # it will not be overwritten. + key: ${{ inputs.key-prefix }}-${{ steps.dates.outputs.year-month }}-${{ steps.dates.outputs.day }} + # > If there are multiple partial matches for a restore key, the action returns the most recently created cache. + # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/caching-dependencies-to-speed-up-workflows#matching-a-cache-key + # The second restore key allows non-main branches to use the cache from the previous month. + # This prevents PRs from rebuilding the cache on the first day of the month. + restore-keys: | + ${{ inputs.key-prefix }}-${{ steps.dates.outputs.year-month }}- + ${{ github.ref != 'refs/heads/main' && format('{0}-{1}-', inputs.key-prefix, steps.dates.outputs.prev-year-month) || '' }} diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 54239330f2a4f..8d084d5db2373 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -346,6 +346,11 @@ jobs: - name: Setup Terraform uses: ./.github/actions/setup-tf + - name: Setup Test Cache + uses: ./.github/actions/setup-test-cache + with: + key-prefix: test-go + - name: Test with Mock Database id: test shell: bash @@ -467,6 +472,11 @@ jobs: if: runner.os == 'Windows' uses: ./.github/actions/setup-imdisk + - name: Setup Test Cache + uses: ./.github/actions/setup-test-cache + with: + key-prefix: test-go-pg + - name: Test with PostgreSQL Database env: POSTGRES_VERSION: "13" @@ -519,6 +529,11 @@ jobs: - name: Setup Terraform uses: ./.github/actions/setup-tf + - name: Setup Test Cache + uses: ./.github/actions/setup-test-cache + with: + key-prefix: test-go-pg-16 + - name: Test with PostgreSQL Database env: POSTGRES_VERSION: "16" @@ -556,6 +571,11 @@ jobs: - name: Setup Terraform uses: ./.github/actions/setup-tf + - name: Setup Test Cache + uses: ./.github/actions/setup-test-cache + with: + key-prefix: test-go-race + # We run race tests with reduced parallelism because they use more CPU and we were finding # instances where tests appear to hang for multiple seconds, resulting in flaky tests when # short timeouts are used. @@ -594,6 +614,11 @@ jobs: - name: Setup Terraform uses: ./.github/actions/setup-tf + - name: Setup Test Cache + uses: ./.github/actions/setup-test-cache + with: + key-prefix: test-go-race-pg + # We run race tests with reduced parallelism because they use more CPU and we were finding # instances where tests appear to hang for multiple seconds, resulting in flaky tests when # short timeouts are used. From 2236dd4b54f50a88b208b5ac89efa52cf45ecca8 Mon Sep 17 00:00:00 2001 From: Hugo Dutka Date: Thu, 24 Apr 2025 10:09:27 +0000 Subject: [PATCH 3/4] address feedback: hashing, file exists check, cache clean up --- provisioner/terraform/provision_test.go | 69 +++++++++++++++++++------ 1 file changed, 53 insertions(+), 16 deletions(-) diff --git a/provisioner/terraform/provision_test.go b/provisioner/terraform/provision_test.go index f2d790da744c1..96514cc4b59ad 100644 --- a/provisioner/terraform/provision_test.go +++ b/provisioner/terraform/provision_test.go @@ -100,16 +100,28 @@ func hashTemplateFilesAndTestName(t *testing.T, testName string, templateFiles m } sort.Strings(sortedFileNames) + // Inserting a delimiter between the file name and the file content + // ensures that a file named `ab` with content `cd` + // will not hash to the same value as a file named `abc` with content `d`. + // This can still happen if the file name or content include the delimiter, + // but hopefully they won't. + delimiter := []byte("🎉 🌱 🌷") + hasher := sha256.New() for _, fileName := range sortedFileNames { file := templateFiles[fileName] _, err := hasher.Write([]byte(fileName)) require.NoError(t, err) + _, err = hasher.Write(delimiter) + require.NoError(t, err) _, err = hasher.Write([]byte(file)) require.NoError(t, err) } - _, err := hasher.Write([]byte(testName)) + _, err := hasher.Write(delimiter) + require.NoError(t, err) + _, err = hasher.Write([]byte(testName)) require.NoError(t, err) + return hex.EncodeToString(hasher.Sum(nil)) } @@ -156,19 +168,26 @@ func runCmd(t *testing.T, dir string, args ...string) { } } -// Ensures Terraform providers are downloaded and cached locally in a unique directory for this test. +// Each test gets a unique cache dir based on its name and template files. +// This ensures that tests can download providers in parallel and that they +// will redownload providers if the template files change. +func getTestCacheDir(t *testing.T, rootDir string, testName string, templateFiles map[string]string) string { + t.Helper() + + hash := hashTemplateFilesAndTestName(t, testName, templateFiles) + dir := filepath.Join(rootDir, hash[:12]) + return dir +} + +// Ensures Terraform providers are downloaded and cached locally in a unique directory for the test. // Uses `terraform init` then `mirror` to populate the cache if needed. // Returns the cache directory path. -func downloadProviders(t *testing.T, rootDir string, templateFiles map[string]string) string { +func downloadProviders(t *testing.T, rootDir string, testName string, templateFiles map[string]string) string { t.Helper() - // Each test gets a unique cache dir based on its name and template files. - // This ensures that tests can download providers in parallel and that they - // will redownload providers if the template files change. - hash := hashTemplateFilesAndTestName(t, t.Name(), templateFiles) - dir := filepath.Join(rootDir, hash[:12]) + dir := getTestCacheDir(t, rootDir, testName, templateFiles) if _, err := os.Stat(dir); err == nil { - t.Logf("%s: using cached terraform providers", t.Name()) + t.Logf("%s: using cached terraform providers", testName) return dir } filesDir := filepath.Join(dir, cacheTemplateFilesDirName) @@ -182,6 +201,8 @@ func downloadProviders(t *testing.T, rootDir string, templateFiles map[string]st if !t.Failed() { return } + // If `downloadProviders` function failed, clean up the cache dir. + // We don't want to leave it around because it may be incomplete or corrupted. if err := os.RemoveAll(dir); err != nil { t.Logf("failed to remove dir %s: %s", dir, err) } @@ -191,10 +212,8 @@ func downloadProviders(t *testing.T, rootDir string, templateFiles map[string]st for fileName, file := range templateFiles { filePath := filepath.Join(filesDir, fileName) - if _, err := os.Stat(filePath); os.IsNotExist(err) { - require.NoError(t, os.MkdirAll(filepath.Dir(filePath), 0o700)) - require.NoError(t, os.WriteFile(filePath, []byte(file), 0o600)) - } + require.NoError(t, os.MkdirAll(filepath.Dir(filePath), 0o700)) + require.NoError(t, os.WriteFile(filePath, []byte(file), 0o600)) } providersDir := filepath.Join(dir, cacheProvidersDirName) @@ -226,10 +245,10 @@ func downloadProviders(t *testing.T, rootDir string, templateFiles map[string]st // This setup prevents network access for providers during `terraform init`, improving reliability // in subsequent test runs. // Returns the path to the generated CLI config file. -func cacheProviders(t *testing.T, templateFiles map[string]string, rootDir string) string { +func cacheProviders(t *testing.T, rootDir string, testName string, templateFiles map[string]string) string { t.Helper() - providersParentDir := downloadProviders(t, rootDir, templateFiles) + providersParentDir := downloadProviders(t, rootDir, testName, templateFiles) cliConfigPath := writeCliConfig(t, providersParentDir) return cliConfigPath } @@ -991,6 +1010,23 @@ func TestProvision(t *testing.T) { }, } + // Remove unused cache dirs before running tests. + // This cleans up any cache dirs that were created by tests that no longer exist. + cacheRootDir := filepath.Join(testutil.PersistentCacheDir(t), "terraform_provision_test") + expectedCacheDirs := make(map[string]bool) + for _, testCase := range testCases { + cacheDir := getTestCacheDir(t, cacheRootDir, testCase.Name, testCase.Files) + expectedCacheDirs[cacheDir] = true + } + currentCacheDirs, err := filepath.Glob(filepath.Join(cacheRootDir, "*")) + require.NoError(t, err) + for _, cacheDir := range currentCacheDirs { + if _, ok := expectedCacheDirs[cacheDir]; !ok { + t.Logf("removing unused cache dir: %s", cacheDir) + require.NoError(t, os.RemoveAll(cacheDir)) + } + } + for _, testCase := range testCases { testCase := testCase t.Run(testCase.Name, func(t *testing.T) { @@ -1004,8 +1040,9 @@ func TestProvision(t *testing.T) { if !testCase.SkipCacheProviders { cliConfigPath = cacheProviders( t, + cacheRootDir, + testCase.Name, testCase.Files, - filepath.Join(testutil.PersistentCacheDir(t), "terraform_provision_test"), ) } ctx, api := setupProvisioner(t, &provisionerServeOptions{ From d56aaa705da5fd51a0707f99c245c5ea63c0d22a Mon Sep 17 00:00:00 2001 From: Hugo Dutka Date: Thu, 24 Apr 2025 11:43:04 +0000 Subject: [PATCH 4/4] address feedback: use separate save and restore steps, use os-and-arch-specific key prefixes --- .github/actions/setup-test-cache/action.yaml | 55 ----------------- .../actions/test-cache/download/action.yml | 50 ++++++++++++++++ .github/actions/test-cache/upload/action.yml | 20 +++++++ .github/workflows/ci.yaml | 60 ++++++++++++++----- 4 files changed, 115 insertions(+), 70 deletions(-) delete mode 100644 .github/actions/setup-test-cache/action.yaml create mode 100644 .github/actions/test-cache/download/action.yml create mode 100644 .github/actions/test-cache/upload/action.yml diff --git a/.github/actions/setup-test-cache/action.yaml b/.github/actions/setup-test-cache/action.yaml deleted file mode 100644 index c8fd21a67e8c7..0000000000000 --- a/.github/actions/setup-test-cache/action.yaml +++ /dev/null @@ -1,55 +0,0 @@ -name: "Setup Test Cache" -description: | - Downloads the test cache and, if needed, uploads a new cache after the job is complete. - A PR job can use a cache if it was created by its base branch, its current - branch, or the default branch. - https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/caching-dependencies-to-speed-up-workflows#restrictions-for-accessing-a-cache -inputs: - key-prefix: - description: "Prefix for the cache key" - required: true - cache-path: - description: "Path to the cache directory" - required: true - # This path is defined in testutil/cache.go - default: "~/.cache/coderv2-test" -runs: - using: "composite" - steps: - - name: Get date values - id: dates - shell: bash - run: | - echo "year-month=$(date +'%Y-%m')" >> $GITHUB_OUTPUT - echo "prev-year-month=$(date -d 'last month' +'%Y-%m')" >> $GITHUB_OUTPUT - echo "day=$(date +'%d')" >> $GITHUB_OUTPUT - - # Using this particular key/restore-keys combination ensures that: - # 1. The cache is updated at most once a day for a given key prefix. - # 2. The cache is reset once a month for a given key prefix. - # - # TODO: As a cost optimization, we could remove caches that are older than - # a day or two. By default, depot keeps caches for 14 days, which isn't - # necessary for the test cache. - # https://depot.dev/docs/github-actions/overview#cache-retention-policy - - name: Download and optionally upload test cache - # This is a fork of actions/cache that only saves the cache if the current - # job is running on the main branch. - # Without it, PRs would create one-use caches that would linger until - # expiration and we'd be charged for them. I evaluated a couple of options - # for limiting the cache to the main branch, and forking was the simplest. - uses: coder/actions-cache@3857e1bfd93dc0ee8d12968ce41da6dbb749bad7 - with: - path: ${{ inputs.cache-path }} - # The key doesn't need to include an OS name. The action already takes - # that into account: https://github.com/actions/cache/tree/5a3ec84eff668545956fd18022155c47e93e2684?tab=readme-ov-file#cache-version - # Cache entries are immutable. If an entry under the key already exists, - # it will not be overwritten. - key: ${{ inputs.key-prefix }}-${{ steps.dates.outputs.year-month }}-${{ steps.dates.outputs.day }} - # > If there are multiple partial matches for a restore key, the action returns the most recently created cache. - # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/caching-dependencies-to-speed-up-workflows#matching-a-cache-key - # The second restore key allows non-main branches to use the cache from the previous month. - # This prevents PRs from rebuilding the cache on the first day of the month. - restore-keys: | - ${{ inputs.key-prefix }}-${{ steps.dates.outputs.year-month }}- - ${{ github.ref != 'refs/heads/main' && format('{0}-{1}-', inputs.key-prefix, steps.dates.outputs.prev-year-month) || '' }} diff --git a/.github/actions/test-cache/download/action.yml b/.github/actions/test-cache/download/action.yml new file mode 100644 index 0000000000000..06a87fee06d4b --- /dev/null +++ b/.github/actions/test-cache/download/action.yml @@ -0,0 +1,50 @@ +name: "Download Test Cache" +description: | + Downloads the test cache and outputs today's cache key. + A PR job can use a cache if it was created by its base branch, its current + branch, or the default branch. + https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/caching-dependencies-to-speed-up-workflows#restrictions-for-accessing-a-cache +outputs: + cache-key: + description: "Today's cache key" + value: ${{ steps.vars.outputs.cache-key }} +inputs: + key-prefix: + description: "Prefix for the cache key" + required: true + cache-path: + description: "Path to the cache directory" + required: true + # This path is defined in testutil/cache.go + default: "~/.cache/coderv2-test" +runs: + using: "composite" + steps: + - name: Get date values and cache key + id: vars + shell: bash + run: | + export YEAR_MONTH=$(date +'%Y-%m') + export PREV_YEAR_MONTH=$(date -d 'last month' +'%Y-%m') + export DAY=$(date +'%d') + echo "year-month=$YEAR_MONTH" >> $GITHUB_OUTPUT + echo "prev-year-month=$PREV_YEAR_MONTH" >> $GITHUB_OUTPUT + echo "cache-key=${{ inputs.key-prefix }}-${YEAR_MONTH}-${DAY}" >> $GITHUB_OUTPUT + + # TODO: As a cost optimization, we could remove caches that are older than + # a day or two. By default, depot keeps caches for 14 days, which isn't + # necessary for the test cache. + # https://depot.dev/docs/github-actions/overview#cache-retention-policy + - name: Download test cache + uses: actions/cache/restore@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + with: + path: ${{ inputs.cache-path }} + key: ${{ steps.vars.outputs.cache-key }} + # > If there are multiple partial matches for a restore key, the action returns the most recently created cache. + # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/caching-dependencies-to-speed-up-workflows#matching-a-cache-key + # The second restore key allows non-main branches to use the cache from the previous month. + # This prevents PRs from rebuilding the cache on the first day of the month. + # It also makes sure that once a month, the cache is fully reset. + restore-keys: | + ${{ inputs.key-prefix }}-${{ steps.vars.outputs.year-month }}- + ${{ github.ref != 'refs/heads/main' && format('{0}-{1}-', inputs.key-prefix, steps.vars.outputs.prev-year-month) || '' }} diff --git a/.github/actions/test-cache/upload/action.yml b/.github/actions/test-cache/upload/action.yml new file mode 100644 index 0000000000000..a4d524164c74c --- /dev/null +++ b/.github/actions/test-cache/upload/action.yml @@ -0,0 +1,20 @@ +name: "Upload Test Cache" +description: Uploads the test cache. Only works on the main branch. +inputs: + cache-key: + description: "Cache key" + required: true + cache-path: + description: "Path to the cache directory" + required: true + # This path is defined in testutil/cache.go + default: "~/.cache/coderv2-test" +runs: + using: "composite" + steps: + - name: Upload test cache + if: ${{ github.ref == 'refs/heads/main' }} + uses: actions/cache/save@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + with: + path: ${{ inputs.cache-path }} + key: ${{ inputs.cache-key }} diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8d084d5db2373..3c5104fbefa5f 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -346,10 +346,11 @@ jobs: - name: Setup Terraform uses: ./.github/actions/setup-tf - - name: Setup Test Cache - uses: ./.github/actions/setup-test-cache + - name: Download Test Cache + id: download-cache + uses: ./.github/actions/test-cache/download with: - key-prefix: test-go + key-prefix: test-go-${{ runner.os }}-${{ runner.arch }} - name: Test with Mock Database id: test @@ -375,6 +376,11 @@ jobs: gotestsum --junitfile="gotests.xml" --jsonfile="gotests.json" \ --packages="./..." -- $PARALLEL_FLAG -short -failfast + - name: Upload Test Cache + uses: ./.github/actions/test-cache/upload + with: + cache-key: ${{ steps.download-cache.outputs.cache-key }} + - name: Upload test stats to Datadog timeout-minutes: 1 continue-on-error: true @@ -472,10 +478,11 @@ jobs: if: runner.os == 'Windows' uses: ./.github/actions/setup-imdisk - - name: Setup Test Cache - uses: ./.github/actions/setup-test-cache + - name: Download Test Cache + id: download-cache + uses: ./.github/actions/test-cache/download with: - key-prefix: test-go-pg + key-prefix: test-go-pg-${{ runner.os }}-${{ runner.arch }} - name: Test with PostgreSQL Database env: @@ -491,6 +498,11 @@ jobs: make test-postgres + - name: Upload Test Cache + uses: ./.github/actions/test-cache/upload + with: + cache-key: ${{ steps.download-cache.outputs.cache-key }} + - name: Upload test stats to Datadog timeout-minutes: 1 continue-on-error: true @@ -529,10 +541,11 @@ jobs: - name: Setup Terraform uses: ./.github/actions/setup-tf - - name: Setup Test Cache - uses: ./.github/actions/setup-test-cache + - name: Download Test Cache + id: download-cache + uses: ./.github/actions/test-cache/download with: - key-prefix: test-go-pg-16 + key-prefix: test-go-pg-16-${{ runner.os }}-${{ runner.arch }} - name: Test with PostgreSQL Database env: @@ -541,6 +554,11 @@ jobs: run: | make test-postgres + - name: Upload Test Cache + uses: ./.github/actions/test-cache/upload + with: + cache-key: ${{ steps.download-cache.outputs.cache-key }} + - name: Upload test stats to Datadog timeout-minutes: 1 continue-on-error: true @@ -571,10 +589,11 @@ jobs: - name: Setup Terraform uses: ./.github/actions/setup-tf - - name: Setup Test Cache - uses: ./.github/actions/setup-test-cache + - name: Download Test Cache + id: download-cache + uses: ./.github/actions/test-cache/download with: - key-prefix: test-go-race + key-prefix: test-go-race-${{ runner.os }}-${{ runner.arch }} # We run race tests with reduced parallelism because they use more CPU and we were finding # instances where tests appear to hang for multiple seconds, resulting in flaky tests when @@ -584,6 +603,11 @@ jobs: run: | gotestsum --junitfile="gotests.xml" -- -race -parallel 4 -p 4 ./... + - name: Upload Test Cache + uses: ./.github/actions/test-cache/upload + with: + cache-key: ${{ steps.download-cache.outputs.cache-key }} + - name: Upload test stats to Datadog timeout-minutes: 1 continue-on-error: true @@ -614,10 +638,11 @@ jobs: - name: Setup Terraform uses: ./.github/actions/setup-tf - - name: Setup Test Cache - uses: ./.github/actions/setup-test-cache + - name: Download Test Cache + id: download-cache + uses: ./.github/actions/test-cache/download with: - key-prefix: test-go-race-pg + key-prefix: test-go-race-pg-${{ runner.os }}-${{ runner.arch }} # We run race tests with reduced parallelism because they use more CPU and we were finding # instances where tests appear to hang for multiple seconds, resulting in flaky tests when @@ -630,6 +655,11 @@ jobs: make test-postgres-docker DB=ci gotestsum --junitfile="gotests.xml" -- -race -parallel 4 -p 4 ./... + - name: Upload Test Cache + uses: ./.github/actions/test-cache/upload + with: + cache-key: ${{ steps.download-cache.outputs.cache-key }} + - name: Upload test stats to Datadog timeout-minutes: 1 continue-on-error: true