# File: .github/actions/test-cache/download/action.yml
name: "Download Test Cache"
description: |
  Downloads the test cache and outputs today's cache key.
  A PR job can use a cache if it was created by its base branch, its current
  branch, or the default branch.
  https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/caching-dependencies-to-speed-up-workflows#restrictions-for-accessing-a-cache
outputs:
  cache-key:
    description: "Today's cache key"
    value: ${{ steps.vars.outputs.cache-key }}
inputs:
  key-prefix:
    description: "Prefix for the cache key"
    required: true
  cache-path:
    description: "Path to the cache directory"
    # Optional: callers normally rely on the default below.
    required: false
    # This path is defined in testutil/cache.go
    default: "~/.cache/coderv2-test"
runs:
  using: "composite"
  steps:
    - name: Get date values and cache key
      id: vars
      shell: bash
      # The prefix is handed over through the environment instead of being
      # expanded into the script text, so its value is never parsed as shell.
      env:
        KEY_PREFIX: ${{ inputs.key-prefix }}
      run: |
        # Read all three date parts from one `date` call so they cannot
        # straddle midnight.
        read -r YEAR MONTH DAY <<< "$(date +'%Y %m %d')"

        # Compute the previous month arithmetically. `date -d 'last month'` is
        # a GNU extension that is unavailable in BSD date, and it can resolve
        # to the current month near the end of a long month.
        # `10#` forces base 10 so that "08" and "09" are not read as octal.
        PREV_YEAR=$YEAR
        PREV_MONTH=$((10#$MONTH - 1))
        if [ "$PREV_MONTH" -eq 0 ]; then
          PREV_MONTH=12
          PREV_YEAR=$((YEAR - 1))
        fi

        YEAR_MONTH="${YEAR}-${MONTH}"
        PREV_YEAR_MONTH=$(printf '%04d-%02d' "$PREV_YEAR" "$PREV_MONTH")

        {
          echo "year-month=${YEAR_MONTH}"
          echo "prev-year-month=${PREV_YEAR_MONTH}"
          echo "cache-key=${KEY_PREFIX}-${YEAR_MONTH}-${DAY}"
        } >> "$GITHUB_OUTPUT"

    # TODO: As a cost optimization, we could remove caches that are older than
    # a day or two. By default, depot keeps caches for 14 days, which isn't
    # necessary for the test cache.
    # https://depot.dev/docs/github-actions/overview#cache-retention-policy
    - name: Download test cache
      uses: actions/cache/restore@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
      with:
        path: ${{ inputs.cache-path }}
        key: ${{ steps.vars.outputs.cache-key }}
        # > If there are multiple partial matches for a restore key, the action returns the most recently created cache.
        # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/caching-dependencies-to-speed-up-workflows#matching-a-cache-key
        # The second restore key allows non-main branches to use the cache from the previous month.
        # This prevents PRs from rebuilding the cache on the first day of the month.
        # It also makes sure that once a month, the cache is fully reset.
        restore-keys: |
          ${{ inputs.key-prefix }}-${{ steps.vars.outputs.year-month }}-
          ${{ github.ref != 'refs/heads/main' && format('{0}-{1}-', inputs.key-prefix, steps.vars.outputs.prev-year-month) || '' }}
---
# File: .github/actions/test-cache/upload/action.yml
name: "Upload Test Cache"
description: Uploads the test cache. Only works on the main branch.
inputs:
  cache-key:
    description: "Cache key"
    required: true
  cache-path:
    description: "Path to the cache directory"
    # Optional: callers normally rely on the default below.
    required: false
    # This path is defined in testutil/cache.go
    default: "~/.cache/coderv2-test"
runs:
  using: "composite"
  steps:
    - name: Upload test cache
      # Saving is skipped on every ref other than main.
      if: ${{ github.ref == 'refs/heads/main' }}
      uses: actions/cache/save@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
      with:
        path: ${{ inputs.cache-path }}
        key: ${{ inputs.cache-key }}
--packages="./..." -- $PARALLEL_FLAG -short -failfast + - name: Upload Test Cache + uses: ./.github/actions/test-cache/upload + with: + cache-key: ${{ steps.download-cache.outputs.cache-key }} + - name: Upload test stats to Datadog timeout-minutes: 1 continue-on-error: true @@ -467,6 +478,12 @@ jobs: if: runner.os == 'Windows' uses: ./.github/actions/setup-imdisk + - name: Download Test Cache + id: download-cache + uses: ./.github/actions/test-cache/download + with: + key-prefix: test-go-pg-${{ runner.os }}-${{ runner.arch }} + - name: Test with PostgreSQL Database env: POSTGRES_VERSION: "13" @@ -481,6 +498,11 @@ jobs: make test-postgres + - name: Upload Test Cache + uses: ./.github/actions/test-cache/upload + with: + cache-key: ${{ steps.download-cache.outputs.cache-key }} + - name: Upload test stats to Datadog timeout-minutes: 1 continue-on-error: true @@ -519,6 +541,12 @@ jobs: - name: Setup Terraform uses: ./.github/actions/setup-tf + - name: Download Test Cache + id: download-cache + uses: ./.github/actions/test-cache/download + with: + key-prefix: test-go-pg-16-${{ runner.os }}-${{ runner.arch }} + - name: Test with PostgreSQL Database env: POSTGRES_VERSION: "16" @@ -526,6 +554,11 @@ jobs: run: | make test-postgres + - name: Upload Test Cache + uses: ./.github/actions/test-cache/upload + with: + cache-key: ${{ steps.download-cache.outputs.cache-key }} + - name: Upload test stats to Datadog timeout-minutes: 1 continue-on-error: true @@ -556,6 +589,12 @@ jobs: - name: Setup Terraform uses: ./.github/actions/setup-tf + - name: Download Test Cache + id: download-cache + uses: ./.github/actions/test-cache/download + with: + key-prefix: test-go-race-${{ runner.os }}-${{ runner.arch }} + # We run race tests with reduced parallelism because they use more CPU and we were finding # instances where tests appear to hang for multiple seconds, resulting in flaky tests when # short timeouts are used. 
@@ -564,6 +603,11 @@ jobs: run: | gotestsum --junitfile="gotests.xml" -- -race -parallel 4 -p 4 ./... + - name: Upload Test Cache + uses: ./.github/actions/test-cache/upload + with: + cache-key: ${{ steps.download-cache.outputs.cache-key }} + - name: Upload test stats to Datadog timeout-minutes: 1 continue-on-error: true @@ -594,6 +638,12 @@ jobs: - name: Setup Terraform uses: ./.github/actions/setup-tf + - name: Download Test Cache + id: download-cache + uses: ./.github/actions/test-cache/download + with: + key-prefix: test-go-race-pg-${{ runner.os }}-${{ runner.arch }} + # We run race tests with reduced parallelism because they use more CPU and we were finding # instances where tests appear to hang for multiple seconds, resulting in flaky tests when # short timeouts are used. @@ -605,6 +655,11 @@ jobs: make test-postgres-docker DB=ci gotestsum --junitfile="gotests.xml" -- -race -parallel 4 -p 4 ./... + - name: Upload Test Cache + uses: ./.github/actions/test-cache/upload + with: + cache-key: ${{ steps.download-cache.outputs.cache-key }} + - name: Upload test stats to Datadog timeout-minutes: 1 continue-on-error: true diff --git a/provisioner/terraform/executor.go b/provisioner/terraform/executor.go index 150f51e6dd10d..442ed36074eb2 100644 --- a/provisioner/terraform/executor.go +++ b/provisioner/terraform/executor.go @@ -35,8 +35,9 @@ type executor struct { mut *sync.Mutex binaryPath string // cachePath and workdir must not be used by multiple processes at once. 
// hashTemplateFilesAndTestName returns the hex-encoded SHA-256 digest of the
// test name combined with the name and content of every template file.
//
// File names are sorted before hashing, so the result does not depend on map
// iteration order. Every field (file name, file content, test name) is written
// as "<byte length>:<bytes>". The length prefix makes field boundaries
// unambiguous, so moving bytes between a file's content and the next file's
// name, or between any two adjacent fields, always changes the digest.
func hashTemplateFilesAndTestName(t *testing.T, testName string, templateFiles map[string]string) string {
	t.Helper()

	sortedFileNames := make([]string, 0, len(templateFiles))
	for fileName := range templateFiles {
		sortedFileNames = append(sortedFileNames, fileName)
	}
	sort.Strings(sortedFileNames)

	hasher := sha256.New()
	writeField := func(field string) {
		// hash.Hash.Write is documented to never return an error, so the
		// result of writing into the hasher is deliberately ignored.
		_, _ = fmt.Fprintf(hasher, "%d:%s", len(field), field)
	}
	for _, fileName := range sortedFileNames {
		writeField(fileName)
		writeField(templateFiles[fileName])
	}
	writeField(testName)

	return hex.EncodeToString(hasher.Sum(nil))
}

// Names of the entries created inside each per-test cache directory.
const (
	terraformConfigFileName   = "terraform.rc"
	cacheProvidersDirName     = "providers"
	cacheTemplateFilesDirName = "files"
)

// Writes a Terraform CLI config file (`terraform.rc`) in `dir` to enforce using the local provider mirror.
// This blocks network access for providers, forcing Terraform to use only what's cached in `dir`.
// Returns the path to the generated config file.
+func writeCliConfig(t *testing.T, dir string) string { + t.Helper() + + cliConfigPath := filepath.Join(dir, terraformConfigFileName) + require.NoError(t, os.MkdirAll(filepath.Dir(cliConfigPath), 0o700)) + + content := fmt.Sprintf(` + provider_installation { + filesystem_mirror { + path = "%s" + include = ["*/*"] + } + direct { + exclude = ["*/*"] + } + } + `, filepath.Join(dir, cacheProvidersDirName)) + require.NoError(t, os.WriteFile(cliConfigPath, []byte(content), 0o600)) + return cliConfigPath +} + +func runCmd(t *testing.T, dir string, args ...string) { + t.Helper() + + stdout, stderr := bytes.NewBuffer(nil), bytes.NewBuffer(nil) + cmd := exec.Command(args[0], args[1:]...) //#nosec + cmd.Dir = dir + cmd.Stdout = stdout + cmd.Stderr = stderr + if err := cmd.Run(); err != nil { + t.Fatalf("failed to run %s: %s\nstdout: %s\nstderr: %s", strings.Join(args, " "), err, stdout.String(), stderr.String()) + } +} + +// Each test gets a unique cache dir based on its name and template files. +// This ensures that tests can download providers in parallel and that they +// will redownload providers if the template files change. +func getTestCacheDir(t *testing.T, rootDir string, testName string, templateFiles map[string]string) string { + t.Helper() + + hash := hashTemplateFilesAndTestName(t, testName, templateFiles) + dir := filepath.Join(rootDir, hash[:12]) + return dir +} + +// Ensures Terraform providers are downloaded and cached locally in a unique directory for the test. +// Uses `terraform init` then `mirror` to populate the cache if needed. +// Returns the cache directory path. 
+func downloadProviders(t *testing.T, rootDir string, testName string, templateFiles map[string]string) string { + t.Helper() + + dir := getTestCacheDir(t, rootDir, testName, templateFiles) + if _, err := os.Stat(dir); err == nil { + t.Logf("%s: using cached terraform providers", testName) + return dir + } + filesDir := filepath.Join(dir, cacheTemplateFilesDirName) + defer func() { + // The files dir will contain a copy of terraform providers generated + // by the terraform init command. We don't want to persist them since + // we already have a registry mirror in the providers dir. + if err := os.RemoveAll(filesDir); err != nil { + t.Logf("failed to remove files dir %s: %s", filesDir, err) + } + if !t.Failed() { + return + } + // If `downloadProviders` function failed, clean up the cache dir. + // We don't want to leave it around because it may be incomplete or corrupted. + if err := os.RemoveAll(dir); err != nil { + t.Logf("failed to remove dir %s: %s", dir, err) + } + }() + + require.NoError(t, os.MkdirAll(filesDir, 0o700)) + + for fileName, file := range templateFiles { + filePath := filepath.Join(filesDir, fileName) + require.NoError(t, os.MkdirAll(filepath.Dir(filePath), 0o700)) + require.NoError(t, os.WriteFile(filePath, []byte(file), 0o600)) + } + + providersDir := filepath.Join(dir, cacheProvidersDirName) + require.NoError(t, os.MkdirAll(providersDir, 0o700)) + + // We need to run init because if a test uses modules in its template, + // the mirror command will fail without it. + runCmd(t, filesDir, "terraform", "init") + // Now, mirror the providers into `providersDir`. We use this explicit mirror + // instead of relying only on the standard Terraform plugin cache. + // + // Why? Because this mirror, when used with the CLI config from `writeCliConfig`, + // prevents Terraform from hitting the network registry during `plan`. This cuts + // down on network calls, making CI tests less flaky. 
+ // + // In contrast, the standard cache *still* contacts the registry for metadata + // during `init`, even if the plugins are already cached locally - see link below. + // + // Ref: https://developer.hashicorp.com/terraform/cli/config/config-file#provider-plugin-cache + // > When a plugin cache directory is enabled, the terraform init command will + // > still use the configured or implied installation methods to obtain metadata + // > about which plugins are available + runCmd(t, filesDir, "terraform", "providers", "mirror", providersDir) + + return dir +} + +// Caches providers locally and generates a Terraform CLI config to use *only* that cache. +// This setup prevents network access for providers during `terraform init`, improving reliability +// in subsequent test runs. +// Returns the path to the generated CLI config file. +func cacheProviders(t *testing.T, rootDir string, testName string, templateFiles map[string]string) string { + t.Helper() + + providersParentDir := downloadProviders(t, rootDir, testName, templateFiles) + cliConfigPath := writeCliConfig(t, providersParentDir) + return cliConfigPath +} + func readProvisionLog(t *testing.T, response proto.DRPCProvisioner_SessionClient) string { var logBuf strings.Builder for { @@ -352,6 +520,8 @@ func TestProvision(t *testing.T) { Apply bool // Some tests may need to be skipped until the relevant provider version is released. SkipReason string + // If SkipCacheProviders is true, then skip caching the terraform providers for this test. 
+ SkipCacheProviders bool }{ { Name: "missing-variable", @@ -422,16 +592,18 @@ func TestProvision(t *testing.T) { Files: map[string]string{ "main.tf": `a`, }, - ErrorContains: "initialize terraform", - ExpectLogContains: "Argument or block definition required", + ErrorContains: "initialize terraform", + ExpectLogContains: "Argument or block definition required", + SkipCacheProviders: true, }, { Name: "bad-syntax-2", Files: map[string]string{ "main.tf": `;asdf;`, }, - ErrorContains: "initialize terraform", - ExpectLogContains: `The ";" character is not valid.`, + ErrorContains: "initialize terraform", + ExpectLogContains: `The ";" character is not valid.`, + SkipCacheProviders: true, }, { Name: "destroy-no-state", @@ -838,6 +1010,23 @@ func TestProvision(t *testing.T) { }, } + // Remove unused cache dirs before running tests. + // This cleans up any cache dirs that were created by tests that no longer exist. + cacheRootDir := filepath.Join(testutil.PersistentCacheDir(t), "terraform_provision_test") + expectedCacheDirs := make(map[string]bool) + for _, testCase := range testCases { + cacheDir := getTestCacheDir(t, cacheRootDir, testCase.Name, testCase.Files) + expectedCacheDirs[cacheDir] = true + } + currentCacheDirs, err := filepath.Glob(filepath.Join(cacheRootDir, "*")) + require.NoError(t, err) + for _, cacheDir := range currentCacheDirs { + if _, ok := expectedCacheDirs[cacheDir]; !ok { + t.Logf("removing unused cache dir: %s", cacheDir) + require.NoError(t, os.RemoveAll(cacheDir)) + } + } + for _, testCase := range testCases { testCase := testCase t.Run(testCase.Name, func(t *testing.T) { @@ -847,7 +1036,18 @@ func TestProvision(t *testing.T) { t.Skip(testCase.SkipReason) } - ctx, api := setupProvisioner(t, nil) + cliConfigPath := "" + if !testCase.SkipCacheProviders { + cliConfigPath = cacheProviders( + t, + cacheRootDir, + testCase.Name, + testCase.Files, + ) + } + ctx, api := setupProvisioner(t, &provisionerServeOptions{ + cliConfigPath: cliConfigPath, + }) 
sess := configure(ctx, t, api, &proto.Config{ TemplateSourceArchive: testutil.CreateTar(t, testCase.Files), }) diff --git a/provisioner/terraform/serve.go b/provisioner/terraform/serve.go index a84e8caf6b5ab..562946d8ef92e 100644 --- a/provisioner/terraform/serve.go +++ b/provisioner/terraform/serve.go @@ -28,7 +28,9 @@ type ServeOptions struct { BinaryPath string // CachePath must not be used by multiple processes at once. CachePath string - Tracer trace.Tracer + // CliConfigPath is the path to the Terraform CLI config file. + CliConfigPath string + Tracer trace.Tracer // ExitTimeout defines how long we will wait for a running Terraform // command to exit (cleanly) if the provision was stopped. This @@ -132,22 +134,24 @@ func Serve(ctx context.Context, options *ServeOptions) error { options.ExitTimeout = unhanger.HungJobExitTimeout } return provisionersdk.Serve(ctx, &server{ - execMut: &sync.Mutex{}, - binaryPath: options.BinaryPath, - cachePath: options.CachePath, - logger: options.Logger, - tracer: options.Tracer, - exitTimeout: options.ExitTimeout, + execMut: &sync.Mutex{}, + binaryPath: options.BinaryPath, + cachePath: options.CachePath, + cliConfigPath: options.CliConfigPath, + logger: options.Logger, + tracer: options.Tracer, + exitTimeout: options.ExitTimeout, }, options.ServeOptions) } type server struct { - execMut *sync.Mutex - binaryPath string - cachePath string - logger slog.Logger - tracer trace.Tracer - exitTimeout time.Duration + execMut *sync.Mutex + binaryPath string + cachePath string + cliConfigPath string + logger slog.Logger + tracer trace.Tracer + exitTimeout time.Duration } func (s *server) startTrace(ctx context.Context, name string, opts ...trace.SpanStartOption) (context.Context, trace.Span) { @@ -158,12 +162,13 @@ func (s *server) startTrace(ctx context.Context, name string, opts ...trace.Span func (s *server) executor(workdir string, stage database.ProvisionerJobTimingStage) *executor { return &executor{ - server: s, - mut: s.execMut, 
- binaryPath: s.binaryPath, - cachePath: s.cachePath, - workdir: workdir, - logger: s.logger.Named("executor"), - timings: newTimingAggregator(stage), + server: s, + mut: s.execMut, + binaryPath: s.binaryPath, + cachePath: s.cachePath, + cliConfigPath: s.cliConfigPath, + workdir: workdir, + logger: s.logger.Named("executor"), + timings: newTimingAggregator(stage), } } diff --git a/testutil/cache.go b/testutil/cache.go new file mode 100644 index 0000000000000..82d45da3b3322 --- /dev/null +++ b/testutil/cache.go @@ -0,0 +1,25 @@ +package testutil + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/require" +) + +// PersistentCacheDir returns a path to a directory +// that will be cached between test runs in Github Actions. +func PersistentCacheDir(t *testing.T) string { + t.Helper() + + // We don't use os.UserCacheDir() because the path it + // returns is different on different operating systems. + // This would make it harder to specify which cache dir to use + // in Github Actions. + home, err := os.UserHomeDir() + require.NoError(t, err) + dir := filepath.Join(home, ".cache", "coderv2-test") + + return dir +}