From f206cf91ae0862f765cabe2646de9d7cd660e7f3 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Sun, 10 Aug 2025 02:08:29 +0000
Subject: [PATCH] Add universal cloud cred ref ID and orphaned instance cleanup

- Add UniversalCloudCredRefID constant in internal/validation
- Update validation tests to use universal constant
- Add CleanupOrphanedInstances function for 1hr+ old instances
- Create GitHub Action workflow (disabled by default)
- Add cleanup CLI tool for manual/automated cleanup

Co-Authored-By: Alec Fong
---
Reviewer note: amended during review (non-zero exit on failure, one shared
orphan selection for dry-run and real runs, schedule-safe workflow, cleanup
log actually written for the upload step). The blob IDs on the `index` lines
below predate these amendments.

 .../workflows/cleanup-orphaned-instances.yml | 62 +++++++++++++
 cmd/cleanup/main.go                          | 88 +++++++++++++++++++
 internal/lambdalabs/v1/validation_test.go    |  4 +-
 internal/validation/suite.go                 | 67 ++++++++++++++
 4 files changed, 219 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/cleanup-orphaned-instances.yml
 create mode 100644 cmd/cleanup/main.go

diff --git a/.github/workflows/cleanup-orphaned-instances.yml b/.github/workflows/cleanup-orphaned-instances.yml
new file mode 100644
index 0000000..a25b6c2
--- /dev/null
+++ b/.github/workflows/cleanup-orphaned-instances.yml
@@ -0,0 +1,62 @@
+name: Cleanup Orphaned Instances
+
+on:
+  workflow_dispatch:
+    # Manual trigger only - disabled by default
+    inputs:
+      dry_run:
+        description: 'Run in dry-run mode (list instances without deleting)'
+        required: false
+        default: 'false'
+        type: boolean
+  # Uncomment the schedule below to enable automatic hourly cleanup
+  # schedule:
+  #   # Run every hour
+  #   - cron: '0 * * * *'
+
+jobs:
+  cleanup-lambdalabs:
+    name: Cleanup LambdaLabs Orphaned Instances
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Go
+      uses: actions/setup-go@v4
+      with:
+        go-version: '1.23.0'
+
+    - name: Cache Go modules
+      uses: actions/cache@v4
+      with:
+        path: |
+          ~/.cache/go-build
+          ~/go/pkg/mod
+        key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }}
+        restore-keys: |
+          ${{ runner.os }}-go-
+
+    - name: Install dependencies
+      run: make deps
+
+    - name: Run cleanup for LambdaLabs
+      # An explicit bash shell turns on `-o pipefail`, so a failing cleanup is
+      # not masked by the `tee` that captures the log for the upload step.
+      shell: bash
+      env:
+        LAMBDALABS_API_KEY: ${{ secrets.LAMBDALABS_API_KEY }}
+        # Scheduled runs carry no inputs; fall back to a real (non-dry) run.
+        DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
+      run: |
+        cd internal/lambdalabs
+        go run ../../cmd/cleanup/main.go -provider=lambdalabs -dry-run="$DRY_RUN" 2>&1 \
+          | tee "$GITHUB_WORKSPACE/cleanup-lambdalabs.log"
+
+    - name: Upload cleanup results
+      uses: actions/upload-artifact@v4
+      if: always()
+      with:
+        name: cleanup-results
+        path: |
+          cleanup-*.log
diff --git a/cmd/cleanup/main.go b/cmd/cleanup/main.go
new file mode 100644
index 0000000..3851f4d
--- /dev/null
+++ b/cmd/cleanup/main.go
@@ -0,0 +1,88 @@
+// Command cleanup lists or terminates orphaned validation instances.
+package main
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"log"
+	"os"
+	"time"
+
+	lambdalabs "github.com/brevdev/cloud/internal/lambdalabs/v1"
+	"github.com/brevdev/cloud/internal/validation"
+	v1 "github.com/brevdev/cloud/pkg/v1"
+)
+
+func main() {
+	var (
+		provider = flag.String("provider", "", "Cloud provider to clean up (lambdalabs)")
+		dryRun   = flag.Bool("dry-run", false, "List orphaned instances without deleting them")
+	)
+	flag.Parse()
+
+	if *provider == "" {
+		log.Fatal("Provider is required. Use -provider=lambdalabs")
+	}
+
+	if *provider != "lambdalabs" {
+		log.Fatalf("Unsupported provider: %s", *provider)
+	}
+
+	// Exit non-zero on failure so CI marks the run as failed.
+	if err := run(*dryRun); err != nil {
+		log.Fatalf("LambdaLabs cleanup failed: %v", err)
+	}
+}
+
+// run performs the LambdaLabs cleanup under a 10-minute deadline. It is kept
+// out of main so the deferred cancel runs before main exits via log.Fatalf.
+func run(dryRun bool) error {
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+	defer cancel()
+
+	return cleanupLambdaLabs(ctx, dryRun)
+}
+
+// cleanupLambdaLabs builds a LambdaLabs client from LAMBDALABS_API_KEY and
+// either lists (dryRun) or terminates the orphaned validation instances.
+func cleanupLambdaLabs(ctx context.Context, dryRun bool) error {
+	apiKey := os.Getenv("LAMBDALABS_API_KEY")
+	if apiKey == "" {
+		return fmt.Errorf("LAMBDALABS_API_KEY environment variable is required")
+	}
+
+	credential := lambdalabs.NewLambdaLabsCredential(validation.UniversalCloudCredRefID, apiKey)
+	client, err := credential.MakeClient(ctx, "")
+	if err != nil {
+		return fmt.Errorf("failed to create LambdaLabs client: %w", err)
+	}
+
+	if dryRun {
+		return listOrphanedInstances(ctx, client)
+	}
+
+	return validation.CleanupOrphanedInstances(ctx, client)
+}
+
+// listOrphanedInstances prints the instances a real run would terminate. It
+// uses the same selection as validation.CleanupOrphanedInstances, so the
+// dry-run output cannot drift from what actually gets deleted.
+func listOrphanedInstances(ctx context.Context, client v1.CloudCreateTerminateInstance) error {
+	orphanedInstances, err := validation.FindOrphanedInstances(ctx, client)
+	if err != nil {
+		return err
+	}
+
+	fmt.Printf("Found %d orphaned instances with CloudCredRefID: %s\n",
+		len(orphanedInstances), validation.UniversalCloudCredRefID)
+
+	for _, instance := range orphanedInstances {
+		fmt.Printf("- Instance: %s (created: %s, age: %s)\n",
+			instance.CloudID,
+			instance.CreatedAt.Format(time.RFC3339),
+			time.Since(instance.CreatedAt).Round(time.Minute))
+	}
+
+	return nil
+}
diff --git a/internal/lambdalabs/v1/validation_test.go b/internal/lambdalabs/v1/validation_test.go
index 54b8b5b..8b724da 100644
--- a/internal/lambdalabs/v1/validation_test.go
+++ b/internal/lambdalabs/v1/validation_test.go
@@ -13,7 +13,7 @@ func TestValidationFunctions(t *testing.T) {
 	apiKey := getAPIKey()
 
 	config := validation.ProviderConfig{
-		Credential: NewLambdaLabsCredential("validation-test", apiKey),
+		Credential: NewLambdaLabsCredential(validation.UniversalCloudCredRefID, apiKey),
 		StableIDs:  []v1.InstanceTypeID{"us-west-1-noSub-gpu_8x_a100_80gb_sxm4", "us-east-1-noSub-gpu_8x_a100_80gb_sxm4"},
 	}
 
@@ -25,7 +25,7 @@ func TestInstanceLifecycleValidation(t *testing.T) {
 	apiKey := getAPIKey()
 
 	config := validation.ProviderConfig{
-		Credential: NewLambdaLabsCredential("validation-test", apiKey),
+		Credential: NewLambdaLabsCredential(validation.UniversalCloudCredRefID, apiKey),
 	}
 
 	validation.RunInstanceLifecycleValidation(t, config)
diff --git a/internal/validation/suite.go b/internal/validation/suite.go
index ee9b9da..7a81132 100644
--- a/internal/validation/suite.go
+++ b/internal/validation/suite.go
@@ -2,6 +2,8 @@ package validation
 
 import (
 	"context"
+	"errors"
+	"fmt"
 	"testing"
 	"time"
 
@@ -10,6 +12,16 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
+const (
+	// UniversalCloudCredRefID is the credential reference ID shared by the
+	// validation tests; the orphan cleanup selects instances by it.
+	UniversalCloudCredRefID = "brev-validation-test"
+
+	// OrphanedInstanceMaxAge is how old a validation instance must be before
+	// the cleanup treats it as orphaned.
+	OrphanedInstanceMaxAge = time.Hour
+)
+
 type ProviderConfig struct {
 	Location   string
 	StableIDs  []v1.InstanceTypeID
@@ -126,3 +138,58 @@ func RunInstanceLifecycleValidation(t *testing.T, config ProviderConfig) {
 		})
 	})
 }
+
+// FindOrphanedInstances returns the instances whose CloudCredRefID is
+// UniversalCloudCredRefID and that were created over OrphanedInstanceMaxAge ago.
+func FindOrphanedInstances(ctx context.Context, client v1.CloudCreateTerminateInstance) ([]v1.Instance, error) {
+	instances, err := client.ListInstances(ctx, v1.ListInstancesArgs{})
+	if err != nil {
+		return nil, fmt.Errorf("failed to list instances: %w", err)
+	}
+
+	cutoffTime := time.Now().Add(-OrphanedInstanceMaxAge)
+	var orphanedInstances []v1.Instance
+
+	for _, instance := range instances {
+		if instance.CloudCredRefID == UniversalCloudCredRefID && instance.CreatedAt.Before(cutoffTime) {
+			orphanedInstances = append(orphanedInstances, instance)
+		}
+	}
+
+	return orphanedInstances, nil
+}
+
+// CleanupOrphanedInstances terminates every instance FindOrphanedInstances
+// reports. A failed termination does not stop the run: the remaining
+// instances are still attempted and all failures are returned together.
+func CleanupOrphanedInstances(ctx context.Context, client v1.CloudCreateTerminateInstance) error {
+	orphanedInstances, err := FindOrphanedInstances(ctx, client)
+	if err != nil {
+		return err
+	}
+
+	if len(orphanedInstances) == 0 {
+		fmt.Printf("No orphaned instances found with CloudCredRefID: %s\n", UniversalCloudCredRefID)
+		return nil
+	}
+
+	fmt.Printf("Found %d orphaned instances to clean up\n", len(orphanedInstances))
+
+	var cleanupErrors []error
+	for _, instance := range orphanedInstances {
+		fmt.Printf("Terminating orphaned instance: %s (created: %s)\n",
+			instance.CloudID, instance.CreatedAt.Format(time.RFC3339))
+
+		if err := client.TerminateInstance(ctx, instance.CloudID); err != nil {
+			cleanupErrors = append(cleanupErrors, fmt.Errorf("failed to terminate instance %s: %w", instance.CloudID, err))
+		}
+	}
+
+	if len(cleanupErrors) > 0 {
+		// errors.Join keeps each failure reachable through errors.Is/As.
+		return fmt.Errorf("cleanup completed with %d errors: %w", len(cleanupErrors), errors.Join(cleanupErrors...))
+	}
+
+	fmt.Printf("Successfully cleaned up %d orphaned instances\n", len(orphanedInstances))
+	return nil
+}