Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions .github/workflows/cleanup-orphaned-instances.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Cleans up validation-test instances left running on LambdaLabs by invoking
# cmd/cleanup. Manual dispatch only, unless the schedule trigger is enabled.
name: Cleanup Orphaned Instances

on:
  workflow_dispatch:
    # Manual trigger only - disabled by default
    inputs:
      dry_run:
        description: 'Run in dry-run mode (list instances without deleting)'
        required: false
        default: 'false'
        type: boolean
  # Uncomment the schedule below to enable automatic hourly cleanup
  # schedule:
  #   # Run every hour
  #   - cron: '0 * * * *'

jobs:
  cleanup-lambdalabs:
    name: Cleanup LambdaLabs Orphaned Instances
    runs-on: ubuntu-latest
    # Scheduled events are accepted as well, so uncommenting the schedule
    # trigger above is sufficient to enable the hourly cleanup.
    if: github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'

    steps:
      - uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v4
        with:
          go-version: '1.23.0'

      - name: Cache Go modules
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/go-build
            ~/go/pkg/mod
          key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }}
          restore-keys: |
            ${{ runner.os }}-go-

      - name: Install dependencies
        run: make deps

      - name: Run cleanup for LambdaLabs
        # An explicit bash shell runs with `-eo pipefail`, so a failing
        # cleanup command is not masked by the trailing `tee`.
        shell: bash
        env:
          LAMBDALABS_API_KEY: ${{ secrets.LAMBDALABS_API_KEY }}
          # Scheduled runs carry no inputs; fall back to 'false' so the
          # -dry-run flag always receives a parseable boolean.
          DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
        run: |
          cd internal/lambdalabs
          # Capture the output in the workspace root so the upload step
          # below has a cleanup-*.log file to collect.
          go run ../../cmd/cleanup/main.go -provider=lambdalabs -dry-run="$DRY_RUN" 2>&1 \
            | tee "$GITHUB_WORKSPACE/cleanup-lambdalabs.log"

      - name: Upload cleanup results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: cleanup-results
          path: |
            cleanup-*.log
88 changes: 88 additions & 0 deletions cmd/cleanup/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
package main

import (
"context"
"flag"
"fmt"
"log"
"os"
"time"

lambdalabs "github.com/brevdev/cloud/internal/lambdalabs/v1"
"github.com/brevdev/cloud/internal/validation"
v1 "github.com/brevdev/cloud/pkg/v1"
)

// main parses the command-line flags, validates the requested provider, and
// runs the orphaned-instance cleanup under a 10-minute deadline.
//
// Every failure path terminates the process with a non-zero exit status so
// that callers (for example a CI job) can tell that the cleanup did not
// succeed.
func main() {
	var (
		provider = flag.String("provider", "", "Cloud provider to clean up (lambdalabs)")
		dryRun   = flag.Bool("dry-run", false, "List orphaned instances without deleting them")
	)
	flag.Parse()

	if *provider == "" {
		log.Fatal("Provider is required. Use -provider=lambdalabs")
	}

	if *provider != "lambdalabs" {
		log.Fatalf("Unsupported provider: %s", *provider)
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
	err := cleanupLambdaLabs(ctx, *dryRun)
	// Release the context explicitly rather than with defer: log.Fatalf
	// exits through os.Exit, which does not run deferred functions.
	cancel()

	if err != nil {
		log.Fatalf("LambdaLabs cleanup failed: %v", err)
	}
}

// cleanupLambdaLabs builds a LambdaLabs client from the LAMBDALABS_API_KEY
// environment variable and dispatches on dryRun: when true it only lists the
// orphaned instances, otherwise it hands the client to
// validation.CleanupOrphanedInstances.
//
// It returns an error if the API key is missing, if the client cannot be
// created, or if the selected operation fails.
func cleanupLambdaLabs(ctx context.Context, dryRun bool) error {
	key := os.Getenv("LAMBDALABS_API_KEY")
	if key == "" {
		return fmt.Errorf("LAMBDALABS_API_KEY environment variable is required")
	}

	cred := lambdalabs.NewLambdaLabsCredential(validation.UniversalCloudCredRefID, key)

	cloudClient, clientErr := cred.MakeClient(ctx, "")
	if clientErr != nil {
		return fmt.Errorf("failed to create LambdaLabs client: %w", clientErr)
	}

	if !dryRun {
		return validation.CleanupOrphanedInstances(ctx, cloudClient)
	}
	return listOrphanedInstances(ctx, cloudClient)
}

// listOrphanedInstances prints a report of orphaned instances without
// terminating anything. An instance is reported when its CloudCredRefID equals
// validation.UniversalCloudCredRefID and it was created more than one hour
// before the call.
//
// It returns an error only when the instance listing itself fails.
func listOrphanedInstances(ctx context.Context, client v1.CloudClient) error {
	all, err := client.ListInstances(ctx, v1.ListInstancesArgs{})
	if err != nil {
		return fmt.Errorf("failed to list instances: %w", err)
	}

	threshold := time.Now().Add(-time.Hour)

	var stale []v1.Instance
	for _, inst := range all {
		// Skip instances that belong to some other credential.
		if inst.CloudCredRefID != validation.UniversalCloudCredRefID {
			continue
		}
		// Skip instances that are still younger than the cutoff.
		if !inst.CreatedAt.Before(threshold) {
			continue
		}
		stale = append(stale, inst)
	}

	fmt.Printf("Found %d orphaned instances with CloudCredRefID: %s\n",
		len(stale), validation.UniversalCloudCredRefID)

	for _, inst := range stale {
		created := inst.CreatedAt.Format(time.RFC3339)
		age := time.Since(inst.CreatedAt).Round(time.Minute)
		fmt.Printf("- Instance: %s (created: %s, age: %s)\n", inst.CloudID, created, age)
	}

	return nil
}
4 changes: 2 additions & 2 deletions internal/lambdalabs/v1/validation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ func TestValidationFunctions(t *testing.T) {
apiKey := getAPIKey()

config := validation.ProviderConfig{
Credential: NewLambdaLabsCredential("validation-test", apiKey),
Credential: NewLambdaLabsCredential(validation.UniversalCloudCredRefID, apiKey),
StableIDs: []v1.InstanceTypeID{"us-west-1-noSub-gpu_8x_a100_80gb_sxm4", "us-east-1-noSub-gpu_8x_a100_80gb_sxm4"},
}

Expand All @@ -25,7 +25,7 @@ func TestInstanceLifecycleValidation(t *testing.T) {
apiKey := getAPIKey()

config := validation.ProviderConfig{
Credential: NewLambdaLabsCredential("validation-test", apiKey),
Credential: NewLambdaLabsCredential(validation.UniversalCloudCredRefID, apiKey),
}

validation.RunInstanceLifecycleValidation(t, config)
Expand Down
46 changes: 46 additions & 0 deletions internal/validation/suite.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package validation

import (
"context"
"fmt"
"testing"
"time"

Expand All @@ -10,6 +11,8 @@ import (
"github.com/stretchr/testify/require"
)

const UniversalCloudCredRefID = "brev-validation-test"

type ProviderConfig struct {
Location string
StableIDs []v1.InstanceTypeID
Expand Down Expand Up @@ -126,3 +129,46 @@ func RunInstanceLifecycleValidation(t *testing.T, config ProviderConfig) {
})
})
}

// CleanupOrphanedInstances terminates every instance returned by the client
// whose CloudCredRefID equals UniversalCloudCredRefID and which was created
// more than one hour before the call.
//
// Progress is printed to standard output. A failed termination does not stop
// the loop: failures are collected and reported together in a single error
// after all candidates have been attempted. It returns nil when nothing
// needed cleaning up or when every termination succeeded.
func CleanupOrphanedInstances(ctx context.Context, client v1.CloudCreateTerminateInstance) error {
	all, err := client.ListInstances(ctx, v1.ListInstancesArgs{})
	if err != nil {
		return fmt.Errorf("failed to list instances: %w", err)
	}

	threshold := time.Now().Add(-time.Hour)

	var stale []v1.Instance
	for _, inst := range all {
		ours := inst.CloudCredRefID == UniversalCloudCredRefID
		if ours && inst.CreatedAt.Before(threshold) {
			stale = append(stale, inst)
		}
	}

	total := len(stale)
	if total == 0 {
		fmt.Printf("No orphaned instances found with CloudCredRefID: %s\n", UniversalCloudCredRefID)
		return nil
	}

	fmt.Printf("Found %d orphaned instances to clean up\n", total)

	var failures []error
	for _, inst := range stale {
		fmt.Printf("Terminating orphaned instance: %s (created: %s)\n",
			inst.CloudID, inst.CreatedAt.Format(time.RFC3339))

		// Keep going after a failure so one bad instance does not block the rest.
		if termErr := client.TerminateInstance(ctx, inst.CloudID); termErr != nil {
			failures = append(failures, fmt.Errorf("failed to terminate instance %s: %w", inst.CloudID, termErr))
		}
	}

	if n := len(failures); n > 0 {
		return fmt.Errorf("cleanup completed with %d errors: %v", n, failures)
	}

	fmt.Printf("Successfully cleaned up %d orphaned instances\n", total)
	return nil
}
Loading