diff --git a/terraform-aws-github-runner/modules/runners-instances/templates/install-config-runner.sh b/terraform-aws-github-runner/modules/runners-instances/templates/install-config-runner.sh index 3a9eb42174..efd22ca353 100644 --- a/terraform-aws-github-runner/modules/runners-instances/templates/install-config-runner.sh +++ b/terraform-aws-github-runner/modules/runners-instances/templates/install-config-runner.sh @@ -265,10 +265,13 @@ fi ./config.sh --unattended --name $INSTANCE_ID --work "_work" $CONFIG -# Set tag as runner id for scale down later +# Set tag `GithubRunnerID` as runner id for scale down later GH_RUNNER_ID=$(jq '.agentId' .runner) retry aws ec2 create-tags --region $REGION --resource $INSTANCE_ID --tags "Key=GithubRunnerID,Value=$GH_RUNNER_ID" +# Remove tag `Stage` from the instance to indicate that it has finished the previous step and is starting fresh +retry aws ec2 delete-tags --region "$REGION" --resources "$INSTANCE_ID" --tags "Key=Stage" + chown -R $USER_NAME:$USER_NAME . 
OVERWRITE_SERVICE_USER=${run_as_root_user} SERVICE_USER=$${OVERWRITE_SERVICE_USER:-$USER_NAME} diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/refresh-runner.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/refresh-runner.ts new file mode 100644 index 0000000000..f8caacb0ae --- /dev/null +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/refresh-runner.ts @@ -0,0 +1,120 @@ +import { Config } from './config'; +import { getRunnerTypes } from './gh-runners'; +import { ScaleUpMetrics } from './metrics'; +import { getRunner, RunnerInputParameters } from './runners'; +import { innerCreateRunnerConfigArgument } from './scale-up'; +import { tryRefreshRunner } from './scale-up-try-reuse-runner-utils'; +import { getRepoKey, Repo, RunnerInfo } from './utils'; + +export interface ActionRequestMessage { + id: number; + instanceId: string; + awsRegion: string; +} + +class RetryableRefreshError extends Error { + constructor(message: string) { + super(message); + this.name = 'RetryableRefreshError'; + } +} + +/** + * Refreshes a single runner instance in response to an SQS message. + * + * Looks the instance up in EC2, rebuilds its runner parameters from the RunnerInfo fields and the + * matching runner type, then delegates to tryRefreshRunner. Missing data and errors from the EC2 + * lookup or from tryRefreshRunner are logged and the function returns without throwing; errors + * thrown by getRunnerTypes are not caught here. + * + * @param eventSource - event source name; anything other than 'aws:sqs' throws. + * @param payload - message carrying the instanceId and awsRegion of the runner to refresh. + * @param metrics - metrics object passed through to the helper calls. + * @throws Error when eventSource is not 'aws:sqs'. + */ +export async function refreshRunner( + eventSource: string, + payload: ActionRequestMessage, + metrics: ScaleUpMetrics, + ): Promise<void> { + if (eventSource !== 'aws:sqs') { + throw new Error('Cannot handle non-SQS events!'); + } + + const { instanceId, awsRegion } = payload; + if (!instanceId || !awsRegion) { + console.warn(`[Skip] Missing required field(s):${!instanceId ? ' instanceId' : ''}${!awsRegion ? 
' awsRegion' : ''}`); + return; + } + + console.debug(`Refreshing runner: instanceId=${instanceId}, region=${awsRegion}`); + + let runner: RunnerInfo | undefined; + try { + runner = await getRunner(metrics, awsRegion, instanceId); + if (!runner) { + console.warn(`Runner not found in aws: instanceId=${instanceId}, region=${awsRegion}`); + return; + } + } catch (e) { + console.error(`Failed to get runner: ${e}`); + return; + } + + const { runnerType: runnerTypeName, repositoryOwner, repositoryName, org, repo } = runner; + if (!runnerTypeName || !repositoryOwner || !repositoryName) { + console.warn(`[Skip] Missing runner metadata: ${JSON.stringify({ runnerTypeName, repositoryOwner, repositoryName })}`); + return; + } + + if (!org && !repo) { + console.warn(`Runner is missing both org and repo: instanceId=${instanceId}`); + return; + } + + const isOrgRunner = !!org; + const isEphemeral = true; + const ghesUrlHost = Config.Instance.ghesUrlHost; + const repoInfo: Repo = { owner: repositoryOwner, repo: repositoryName }; + + console.debug(`Fetching runner type for: ${runnerTypeName}`); + const runnerTypes = await getRunnerTypes(repoInfo, metrics, awsRegion); + const runnerType = runnerTypes.get(runnerTypeName); + + if (!runnerType) { + console.warn(`Runner type not found: ${runnerTypeName}`); + return; + } + + const createRunnerParams: RunnerInputParameters = { + environment: Config.Instance.environment, + runnerConfig: (awsRegion: string, experimentalRunner: boolean) => + innerCreateRunnerConfigArgument( + runnerTypeName, + repositoryName, + repositoryOwner, + awsRegion, + metrics, + ghesUrlHost, + isOrgRunner, + isEphemeral, + experimentalRunner, + runner.runnerExtraLabels, + runner.runnerTypeLabels, + runner.runnerGroupName, + ), + runnerType, + repositoryOwner, + repositoryName, + ...(Config.Instance.enableOrganizationRunners + ? 
{ orgName: repositoryOwner } + : { repoName: getRepoKey(repoInfo) }), + }; + + try { + await tryRefreshRunner(createRunnerParams, metrics, runner); + console.debug(`Refreshed runner: instanceId=${instanceId}, region=${awsRegion}`); + } catch (e) { + console.error(`Error refreshing runner: ${e}`); + } + } diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.test.ts index 61b3cd6cee..9fd9b9cc85 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.test.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.test.ts @@ -7,7 +7,6 @@ import { listSSMParameters, resetRunnersCaches, terminateRunner, - tryReuseRunner, } from './runners'; import { RunnerInfo } from './utils'; import { ScaleUpMetrics } from './metrics'; @@ -16,6 +15,7 @@ import { Config } from './config'; import nock from 'nock'; import { locallyCached, clearLocalCache, redisLocked } from './cache'; import moment from 'moment'; +import { tryReuseRunner } from './scale-up-try-reuse-runner-utils'; const runnerConfigFn = jest.fn().mockImplementation((awsRegion: string) => { return `${awsRegion}-BLAH`; @@ -89,7 +89,17 @@ function createExpectedRunInstancesLinux( const tags = [ { Key: 'Application', Value: 'github-action-runner' }, { Key: 'RunnerType', Value: runnerParameters.runnerType.runnerTypeName }, + { Key: 'RepositoryOwner', Value: runnerParameters.repositoryOwner }, + { Key: 'RepositoryName', Value: runnerParameters.repositoryName }, ]; + + if (runnerParameters.runnerType.labels) { + tags.push({ + Key: 'RunnerTypeLabels', + Value: runnerParameters.runnerType.labels.join(','), + }); + } + if (enableOrg) { tags.push({ Key: 'Org', @@ -500,6 +510,8 @@ describe('tryReuseRunner', () => { runnerTypeName: 'linuxCpu', is_ephemeral: true, }, + repositoryOwner: 'jeanschmidt', + repositoryName: 
'regularizationTheory', }; it('does not have any runner', async () => { @@ -673,6 +685,75 @@ describe('tryReuseRunner', () => { expect(mockEC2.createReplaceRootVolumeTask).not.toBeCalled(); }); + it('has a runner, but still in replacement volume mode', async () => { + // SSM putParameter + mockSSM.putParameter.mockClear().mockImplementation(() => ({ promise: jest.fn() })); + + //createTags + mockEC2.createTags.mockClear().mockImplementation(() => ({ promise: jest.fn() })); + + //deleteTags + mockEC2.deleteTags.mockClear().mockImplementation(() => ({ promise: jest.fn() })); + + //createReplaceRootVolumeTask + mockEC2.createReplaceRootVolumeTask.mockClear().mockImplementation(() => ({ promise: jest.fn() })); + + // describeInstances + mockEC2.describeInstances.mockClear().mockImplementation(() => mockDescribeInstances); + const ephemeralRunnerFinished = Math.floor( + moment(new Date()) + .subtract(Config.Instance.minimumRunningTimeInMinutes + 10, 'minutes') + .utc() + .toDate() + .getTime() / 1000, + ); + const launchTime = moment(new Date()).subtract(5, 'minutes').utc().toDate(); + const mockRunningInstances: AWS.EC2.DescribeInstancesResult = { + Reservations: [ + { + Instances: [ + { + LaunchTime: launchTime, + InstanceId: 'i-0113', + Placement: { + AvailabilityZone: 'us-east-1a', + }, + Tags: [ + { Key: 'Repo', Value: 'jeanschmidt/regularizationTheory' }, + { Key: 'Application', Value: 'github-action-runner' }, + { Key: 'GithubRunnerID', Value: '1234' }, + { Key: 'EBSVolumeReplacementRequestTm', Value: '1653609600' }, + { Key: 'Stage', Value: 'ReplaceEBSVolume' }, + ], + }, + ], + }, + ], + }; + mockDescribeInstances.promise.mockClear().mockResolvedValue(mockRunningInstances); + + await expect(tryReuseRunner(runnerParameters, metrics)).rejects.toThrowError('No runners available'); + + expect(mockEC2.describeInstances).toBeCalledWith({ + Filters: [ + { Name: 'tag:Application', Values: ['github-action-runner'] }, + { Name: 'instance-state-name', Values: ['running', 
'pending'] }, + { Name: 'instance-type', Values: ['c5.2xlarge'] }, + { Name: 'tag:ApplicationDeployDatetime', Values: ['20201010T144800'] }, + { Name: 'tag:Environment', Values: ['wg113'] }, + { Name: 'tag:Repo', Values: ['jeanschmidt/regularizationTheory'] }, + { Name: 'tag:RunnerType', Values: ['linuxCpu'] }, + { Name: 'tag:GithubRunnerID', Values: ['*'] }, + { Name: 'tag:EphemeralRunnerFinished', Values: ['*'] }, + ], + }); + expect(mockSSM.putParameter).not.toBeCalled(); + expect(mockEC2.createTags).not.toBeCalled(); + expect(mockEC2.deleteTags).not.toBeCalled(); + expect(mockEC2.createReplaceRootVolumeTask).not.toBeCalled(); + }); + + // it('has a runner, and succeeds', async () => { // SSM putParameter mockSSM.putParameter.mockClear().mockImplementation(() => ({ promise: jest.fn() })); @@ -740,7 +821,10 @@ describe('tryReuseRunner', () => { }); expect(mockEC2.createTags).toBeCalledWith({ Resources: ['i-0113'], - Tags: [{ Key: 'EBSVolumeReplacementRequestTm', Value: '1653609600' }], + Tags: [ + { Key: 'EBSVolumeReplacementRequestTm', Value: '1653609600' }, + { Key: 'Stage', Value: 'ReplaceEBSVolume' }, + ], }); expect(mockEC2.deleteTags).toBeCalledWith({ Resources: ['i-0113'], @@ -811,7 +895,10 @@ describe('tryReuseRunner', () => { }); expect(mockEC2.createTags).toBeCalledWith({ Resources: ['i-0113'], - Tags: [{ Key: 'EBSVolumeReplacementRequestTm', Value: '1653609600' }], + Tags: [ + { Key: 'EBSVolumeReplacementRequestTm', Value: '1653609600' }, + { Key: 'Stage', Value: 'ReplaceEBSVolume' }, + ], }); expect(mockEC2.deleteTags).not.toBeCalled(); expect(mockEC2.createReplaceRootVolumeTask).not.toBeCalled(); @@ -831,6 +918,8 @@ describe('tryReuseRunner', () => { runnerTypeName: 'linuxCpu', is_ephemeral: true, }, + repositoryOwner: 'jeanschmidt', + repositoryName: 'test-repo', }; it('does not have any runner', async () => { @@ -942,7 +1031,10 @@ describe('tryReuseRunner', () => { }); expect(mockEC2.createTags).toBeCalledWith({ Resources: ['i-0113'], - Tags: [{ 
Key: 'EBSVolumeReplacementRequestTm', Value: '1653609600' }], + Tags: [ + { Key: 'EBSVolumeReplacementRequestTm', Value: '1653609600' }, + { Key: 'Stage', Value: 'ReplaceEBSVolume' }, + ], }); expect(mockEC2.deleteTags).toBeCalledWith({ Resources: ['i-0113'], @@ -1013,7 +1105,10 @@ describe('tryReuseRunner', () => { }); expect(mockEC2.createTags).toBeCalledWith({ Resources: ['i-0113'], - Tags: [{ Key: 'EBSVolumeReplacementRequestTm', Value: '1653609600' }], + Tags: [ + { Key: 'EBSVolumeReplacementRequestTm', Value: '1653609600' }, + { Key: 'Stage', Value: 'ReplaceEBSVolume' }, + ], }); expect(mockEC2.deleteTags).not.toBeCalled(); expect(mockEC2.createReplaceRootVolumeTask).not.toBeCalled(); @@ -1125,7 +1220,7 @@ describe('createRunner', () => { }); it('calls run instances with the correct config for repo && linux', async () => { - const runnerParameters = { + const runnerParameters: RunnerInputParameters = { runnerConfig: runnerConfigFn, environment: 'wg113', repoName: 'SomeAwesomeCoder/some-amazing-library', @@ -1138,6 +1233,8 @@ describe('createRunner', () => { runnerTypeName: 'linuxCpu', is_ephemeral: true, }, + repositoryOwner: 'SomeAwesomeCoder', + repositoryName: 'some-amazing-library', }; await createRunner(runnerParameters, metrics); @@ -1149,7 +1246,7 @@ describe('createRunner', () => { }); it('calls run instances with the correct config for repo && linux && organization', async () => { - const runnerParameters = { + const runnerParameters: RunnerInputParameters = { runnerConfig: runnerConfigFn, environment: 'wg113', repoName: undefined, @@ -1162,6 +1259,8 @@ describe('createRunner', () => { runnerTypeName: 'linuxCpu.nvidia.gpu', is_ephemeral: true, }, + repositoryOwner: 'SomeAwesomeCoder', + repositoryName: 'test-repo', }; await createRunner(runnerParameters, metrics); @@ -1173,7 +1272,86 @@ describe('createRunner', () => { }); it('calls run instances with the correct config for repo && windows', async () => { - const runnerParameters = { + const 
runnerParameters: RunnerInputParameters = { + runnerConfig: runnerConfigFn, + environment: 'wg113', + repoName: 'SomeAwesomeCoder/some-amazing-library', + orgName: undefined, + runnerType: { + instance_type: 'c5.2xlarge', + os: 'windows', + max_available: 200, + disk_size: 100, + runnerTypeName: 'linuxCpu', + is_ephemeral: true, + }, + repositoryOwner: 'SomeAwesomeCoder', + repositoryName: 'some-amazing-library', + }; + + await createRunner(runnerParameters, metrics); + + expect(runnerConfigFn).toBeCalledTimes(1); + expect(runnerConfigFn).toBeCalledWith(config.awsRegion, false); + expect(mockEC2.runInstances).toHaveBeenCalledTimes(1); + const secGroup = Config.Instance.vpcIdToSecurityGroupIds.get('vpc-agdgaduwg113') || []; + expect(mockEC2.runInstances).toBeCalledWith({ + MaxCount: 1, + MinCount: 1, + LaunchTemplate: { + LaunchTemplateName: Config.Instance.launchTemplateNameWindows, + Version: Config.Instance.launchTemplateVersionWindows, + }, + InstanceType: runnerParameters.runnerType.instance_type, + BlockDeviceMappings: [ + { + DeviceName: '/dev/sda1', + Ebs: { + VolumeSize: runnerParameters.runnerType.disk_size, + VolumeType: 'gp3', + Encrypted: true, + DeleteOnTermination: true, + }, + }, + ], + NetworkInterfaces: [ + { + Ipv6AddressCount: 1, + AssociatePublicIpAddress: true, + SubnetId: 'sub-0113', + Groups: secGroup, + DeviceIndex: 0, + }, + ], + TagSpecifications: [ + { + ResourceType: 'instance', + Tags: [ + { Key: 'Application', Value: 'github-action-runner' }, + { Key: 'RunnerType', Value: runnerParameters.runnerType.runnerTypeName }, + { Key: 'RepositoryOwner', Value: runnerParameters.repositoryOwner }, + { Key: 'RepositoryName', Value: runnerParameters.repositoryName }, + { + Key: 'Repo', + Value: runnerParameters.repoName, + }, + ], + }, + ], + }); + }); + + it('creates tags for extraTypeLabels and runnerGroupName if set', async () => { + // Custom config for this test + const customConfig = { + ...config, + runnerGroupName: 'CustomRunnerGroup', + 
runnersExtraLabels: 'CustomExtraLabels', + }; + // Override the Config.Instance for this test + jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => customConfig as unknown as Config); + + const runnerParameters: RunnerInputParameters = { runnerConfig: runnerConfigFn, environment: 'wg113', repoName: 'SomeAwesomeCoder/some-amazing-library', @@ -1186,6 +1364,8 @@ describe('createRunner', () => { runnerTypeName: 'linuxCpu', is_ephemeral: true, }, + repositoryOwner: 'SomeAwesomeCoder', + repositoryName: 'some-amazing-library', }; await createRunner(runnerParameters, metrics); @@ -1228,10 +1408,20 @@ describe('createRunner', () => { Tags: [ { Key: 'Application', Value: 'github-action-runner' }, { Key: 'RunnerType', Value: runnerParameters.runnerType.runnerTypeName }, + { Key: 'RepositoryOwner', Value: runnerParameters.repositoryOwner }, + { Key: 'RepositoryName', Value: runnerParameters.repositoryName }, { Key: 'Repo', Value: runnerParameters.repoName, }, + { + Key: 'RunnerExtraLabels', + Value: 'CustomExtraLabels', + }, + { + Key: 'RunnerGroupName', + Value: 'CustomRunnerGroup', + }, ], }, ], @@ -1245,6 +1435,8 @@ describe('createRunner', () => { environment: 'wg113', repoName: 'SomeAwesomeCoder/some-amazing-library', orgName: undefined, + repositoryName: 'some-amazing-library', + repositoryOwner: 'SomeAwesomeCoder', runnerType: { instance_type: 'c5.2xlarge', os: 'linux', @@ -1267,7 +1459,7 @@ describe('createRunner', () => { }); it('creates ssm experiment parameters when joining experiment', async () => { - const runnerParameters = { + const runnerParameters: RunnerInputParameters = { runnerConfig: runnerConfigFn, environment: 'wg113', repoName: 'SomeAwesomeCoder/some-amazing-library', @@ -1284,6 +1476,8 @@ describe('createRunner', () => { percentage: 0.1, }, }, + repositoryOwner: 'SomeAwesomeCoder', + repositoryName: 'some-amazing-library', }; jest.spyOn(global.Math, 'random').mockReturnValueOnce(0.0999); @@ -1323,6 +1517,8 @@ describe('createRunner', () => 
{ { runnerConfig: runnerConfigFn, environment: 'wg113', + repositoryName: 'some-amazing-library', + repositoryOwner: 'SomeAwesomeCoder', repoName: 'SomeAwesomeCoder/some-amazing-library', orgName: undefined, runnerType: { @@ -1344,7 +1540,7 @@ describe('createRunner', () => { it('fails to attach to any network and raises exception', async () => { const errorMsg = 'test error msg ASDF'; mockRunInstances.promise.mockClear().mockRejectedValue(new Error(errorMsg)); - const runnerParameters = { + const runnerParameters: RunnerInputParameters = { runnerConfig: runnerConfigFn, environment: 'wg113', repoName: 'SomeAwesomeCoder/some-amazing-library', @@ -1357,6 +1553,8 @@ describe('createRunner', () => { runnerTypeName: 'linuxCpu', is_ephemeral: true, }, + repositoryOwner: 'SomeAwesomeCoder', + repositoryName: 'some-amazing-library', }; await expect(createRunner(runnerParameters, metrics)).rejects.toThrow(); @@ -1495,7 +1693,7 @@ describe('createRunner', () => { }); it('succeed in the first try, first subnet and region', async () => { - const runnerParameters = { + const runnerParameters: RunnerInputParameters = { runnerConfig: runnerConfigFn, environment: 'wg113', repoName: 'SomeAwesomeCoder/some-amazing-library', @@ -1508,6 +1706,8 @@ describe('createRunner', () => { runnerTypeName: 'linuxCpu', is_ephemeral: true, }, + repositoryOwner: 'SomeAwesomeCoder', + repositoryName: 'some-amazing-library', }; expect(await createRunner(runnerParameters, metrics)).toEqual(config.shuffledAwsRegionInstances[0]); @@ -1522,7 +1722,7 @@ describe('createRunner', () => { mockRunInstances.promise.mockClear().mockRejectedValueOnce(new Error('test error msg')); mockRunInstances.promise.mockClear().mockResolvedValueOnce(runInstanceSuccess); - const runnerParameters = { + const runnerParameters: RunnerInputParameters = { runnerConfig: runnerConfigFn, environment: 'wg113', repoName: 'SomeAwesomeCoder/some-amazing-library', @@ -1535,6 +1735,8 @@ describe('createRunner', () => { runnerTypeName: 
'linuxCpu', is_ephemeral: true, }, + repositoryOwner: 'SomeAwesomeCoder', + repositoryName: 'some-amazing-library', }; expect(await createRunner(runnerParameters, metrics)).toEqual(config.shuffledAwsRegionInstances[0]); @@ -1556,7 +1758,7 @@ describe('createRunner', () => { } mockRunInstances.promise.mockClear().mockResolvedValueOnce(runInstanceSuccess); - const runnerParameters = { + const runnerParameters: RunnerInputParameters = { runnerConfig: runnerConfigFn, environment: 'wg113', repoName: 'SomeAwesomeCoder/some-amazing-library', @@ -1569,6 +1771,8 @@ describe('createRunner', () => { runnerTypeName: 'linuxCpu', is_ephemeral: true, }, + repositoryOwner: 'SomeAwesomeCoder', + repositoryName: 'some-amazing-library', }; expect(await createRunner(runnerParameters, metrics)).toEqual(config.shuffledAwsRegionInstances[1]); @@ -1603,7 +1807,7 @@ describe('createRunner', () => { mockRunInstances.promise.mockClear().mockRejectedValueOnce(new Error('test error msg')); } - const runnerParameters = { + const runnerParameters: RunnerInputParameters = { runnerConfig: runnerConfigFn, environment: 'wg113', repoName: 'SomeAwesomeCoder/some-amazing-library', @@ -1616,6 +1820,8 @@ describe('createRunner', () => { runnerTypeName: 'linuxCpu', is_ephemeral: true, }, + repositoryOwner: 'SomeAwesomeCoder', + repositoryName: 'some-amazing-library', }; await expect(createRunner(runnerParameters, metrics)).rejects.toThrow(); diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts index 038a0629c7..5d6bc4d6a7 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts @@ -8,6 +8,7 @@ import { Metrics, ScaleUpMetrics } from './metrics'; import { getJoinedStressTestExperiment, redisCached, redisLocked } from './cache'; 
import moment from 'moment'; import { RetryableScalingError } from './scale-up'; +import { addSSMParameterRunnerConfig } from './scale-up-try-reuse-runner-utils'; export interface ListRunnerFilters { applicationDeployDatetime?: string; @@ -25,6 +26,8 @@ export interface RunnerInputParameters { repoName?: string; orgName?: string; runnerType: RunnerType; + repositoryOwner: string; + repositoryName: string; } export interface AmiExpermient { @@ -207,27 +210,7 @@ export async function listRunners( /* istanbul ignore next */ return ( reservation.Instances?.map((instance) => { - const ebsVolumeReplacementRequestTimestamp = instance.Tags?.find( - (e) => e.Key === 'EBSVolumeReplacementRequestTm', - )?.Value; - const ephemeralRunnerFinished = instance.Tags?.find((e) => e.Key === 'EphemeralRunnerFinished')?.Value; - return { - applicationDeployDatetime: instance.Tags?.find((e) => e.Key === 'ApplicationDeployDatetime')?.Value, - awsRegion: itm.awsRegion, - az: instance.Placement?.AvailabilityZone?.toLocaleLowerCase(), - ebsVolumeReplacementRequestTimestamp: ebsVolumeReplacementRequestTimestamp - ? parseInt(ebsVolumeReplacementRequestTimestamp) - : undefined, - environment: instance.Tags?.find((e) => e.Key === 'Environment')?.Value, - ephemeralRunnerFinished: ephemeralRunnerFinished ? parseInt(ephemeralRunnerFinished) : undefined, - ghRunnerId: instance.Tags?.find((e) => e.Key === 'GithubRunnerID')?.Value, - instanceId: instance.InstanceId as string, - instanceManagement: instance.Tags?.find((e) => e.Key == 'InstanceManagement')?.Value, - launchTime: instance.LaunchTime, - org: instance.Tags?.find((e) => e.Key === 'Org')?.Value, - repo: instance.Tags?.find((e) => e.Key === 'Repo')?.Value, - runnerType: instance.Tags?.find((e) => e.Key === 'RunnerType')?.Value, - }; + return toRunnerInfo(instance, itm.awsRegion); }) ?? [] ); }) ?? 
[] @@ -239,6 +222,66 @@ export async function listRunners( } } +export async function getRunner( + metrics: Metrics, + awsRegion: string, + instanceId: string, +): Promise<RunnerInfo | undefined> { // Fetches one EC2 instance by id and maps it to RunnerInfo; undefined when EC2 returns no instance + try { + const result = await metrics.trackRequestRegion( + awsRegion, + metrics.ec2DescribeInstancesAWSCallSuccess, + metrics.ec2DescribeInstancesAWSCallFailure, + () => { + return new EC2({ region: awsRegion }).describeInstances({ InstanceIds: [instanceId] }).promise(); + }, + ); + const instance = result.Reservations?.[0]?.Instances?.[0]; + if (!instance) return undefined; + + return toRunnerInfo(instance, awsRegion); + } catch (e) { + console.error(`[getRunner]: ${e}`); + throw e; + } +} + +/** + * Converts an EC2 instance description into a RunnerInfo by reading the runner's instance tags. + * @param instance - EC2 instance; tags that are absent from it yield undefined fields. + * @param awsRegion - region copied verbatim into the result. + * @returns RunnerInfo populated from the instance's tags, placement and launch time. + */ +export function toRunnerInfo(instance: AWS.EC2.Instance, awsRegion: string): RunnerInfo { + const getTag = (key: string) => instance.Tags?.find((t) => t.Key === key)?.Value; + return { + applicationDeployDatetime: getTag('ApplicationDeployDatetime'), + awsRegion, + az: instance.Placement?.AvailabilityZone?.toLowerCase(), + ebsVolumeReplacementRequestTimestamp: getTag('EBSVolumeReplacementRequestTm') + ? parseInt(getTag('EBSVolumeReplacementRequestTm')!) + : undefined, + + environment: getTag('Environment'), + ephemeralRunnerFinished: getTag('EphemeralRunnerFinished') + ? parseInt(getTag('EphemeralRunnerFinished')!) + : undefined, + ghRunnerId: getTag('GithubRunnerID'), + instanceId: instance.InstanceId!, + instanceManagement: getTag('InstanceManagement'), + launchTime: instance.LaunchTime, + repositoryName: getTag('RepositoryName'), + repositoryOwner: getTag('RepositoryOwner'), + runnerTypeLabels: getTag('RunnerTypeLabels') ? getTag('RunnerTypeLabels')?.split(',') : undefined, + runnerExtraLabels: getTag('RunnerExtraLabels') ? 
getTag('RunnerExtraLabels')?.split(',') : undefined, + runnerGroupName: getTag('RunnerGroupName'), + org: getTag('Org'), + repo: getTag('Repo'), + runnerType: getTag('RunnerType'), + stage: getTag('Stage'), + }; +} + export function getParameterNameForRunner(environment: string, instanceId: string): string { return `${environment}-${instanceId}`; } @@ -359,51 +402,6 @@ export async function terminateRunner(runner: RunnerInfo, metrics: Metrics): Pro } } -async function addSSMParameterRunnerConfig( - instancesId: string[], - runnerParameters: RunnerInputParameters, - customAmiExperiment: boolean, - ssm: SSM, - metrics: Metrics, - awsRegion: string, -): Promise { - /* istanbul ignore next */ - if (instancesId.length == 0) { - console.debug(`[${awsRegion}] No SSM parameter to be created, empty list of instances`); - return; - } - - let runnerConfig = await runnerParameters.runnerConfig(awsRegion, customAmiExperiment); - if (customAmiExperiment) { - runnerConfig = `${runnerConfig} #ON_AMI_EXPERIMENT`; - } - - const createdSSMParams = await Promise.all( - /* istanbul ignore next */ - instancesId.map(async (instanceId) => { - const parameterName = getParameterNameForRunner(runnerParameters.environment, instanceId); - return await expBackOff(() => { - return metrics.trackRequestRegion( - awsRegion, - metrics.ssmPutParameterAWSCallSuccess, - metrics.ssmPutParameterAWSCallFailure, - async () => { - await ssm - .putParameter({ - Name: parameterName, - Value: runnerConfig, - Type: 'SecureString', - }) - .promise(); - return parameterName; - }, - ); - }); - }) ?? 
[], - ); - console.debug(`[${awsRegion}] Created SSM Parameters(s): ${createdSSMParams.join(',')}`); -} - function getLaunchTemplateName(runnerParameters: RunnerInputParameters): Array { if (runnerParameters.runnerType.os === 'linux') { /* istanbul ignore next */ @@ -460,214 +458,6 @@ async function getCreateRunnerSubnetSequence( .flat(); } -export async function tryReuseRunner( - runnerParameters: RunnerInputParameters, - metrics: ScaleUpMetrics, -): Promise { - const filters: ListRunnerFilters = { - applicationDeployDatetime: Config.Instance.datetimeDeploy, - containsTags: ['GithubRunnerID', 'EphemeralRunnerFinished'], - environment: runnerParameters.environment, - instanceType: runnerParameters.runnerType.instance_type, - orgName: runnerParameters.orgName, - repoName: runnerParameters.repoName, - runnerType: runnerParameters.runnerType.runnerTypeName, - }; - if (await getJoinedStressTestExperiment('stresstest_awsfail', runnerParameters.runnerType.runnerTypeName)) { - console.warn( - `Joining stress test stresstest_awsfail, failing AWS reuse for ${runnerParameters.runnerType.runnerTypeName}`, - ); - throw new RetryableScalingError('Stress test stockout'); - } - - const runners = shuffleArrayInPlace(await listRunners(metrics, filters)); - - /* istanbul ignore next */ - if (runnerParameters.orgName !== undefined) { - metrics.runnersReuseFoundOrg(runners.length, runnerParameters.orgName, runnerParameters.runnerType.runnerTypeName); - } else if (runnerParameters.repoName !== undefined) { - metrics.runnersReuseFoundRepo( - runners.length, - getRepo(runnerParameters.repoName), - runnerParameters.runnerType.runnerTypeName, - ); - } - - const ec2M: Map = new Map(); - const ssmM: Map = new Map(); - - for (const runner of runners) { - if (runner.ghRunnerId === undefined) { - console.debug(`[tryReuseRunner]: Runner ${runner.instanceId} does not have a GithubRunnerID tag`); - continue; - } - if (runner.awsRegion === undefined) { - console.debug(`[tryReuseRunner]: Runner 
${runner.instanceId} does not have a region`); - continue; - } - if (runner.org === undefined && runner.repo === undefined) { - console.debug(`[tryReuseRunner]: Runner ${runner.instanceId} does not have org or repo`); - continue; - } - if (runner.ephemeralRunnerFinished !== undefined) { - const finishedAt = moment.unix(runner.ephemeralRunnerFinished); - - if (finishedAt > moment(new Date()).subtract(1, 'minutes').utc()) { - console.debug(`[tryReuseRunner]: Runner ${runner.instanceId} finished a job less than a minute ago`); - continue; - } - - if (finishedAt.add(Config.Instance.minimumRunningTimeInMinutes, 'minutes') < moment(new Date()).utc()) { - console.debug( - `[tryReuseRunner]: Runner ${runner.instanceId} has been idle for over minimumRunningTimeInMinutes time of ` + - `${Config.Instance.minimumRunningTimeInMinutes} mins, so it's likely to be reclaimed soon and should ` + - `not be reused. It's been idle since ${finishedAt.format()}`, - ); - continue; - } - } - - try { - if (runnerParameters.orgName !== undefined) { - metrics.runnersReuseTryOrg(1, runnerParameters.orgName, runnerParameters.runnerType.runnerTypeName); - } else if (runnerParameters.repoName !== undefined) { - metrics.runnersReuseTryRepo(1, getRepo(runnerParameters.repoName), runnerParameters.runnerType.runnerTypeName); - } - - await redisLocked( - `tryReuseRunner`, - runner.instanceId, - async () => { - // I suspect it will be too many requests against GH API to check if runner is really offline - - if (ssmM.has(runner.awsRegion) === false) { - ssmM.set(runner.awsRegion, new SSM({ region: runner.awsRegion })); - } - const ssm = ssmM.get(runner.awsRegion) as SSM; - if (ec2M.has(runner.awsRegion) === false) { - ec2M.set(runner.awsRegion, new EC2({ region: runner.awsRegion })); - } - const ec2 = ec2M.get(runner.awsRegion) as EC2; - - // should come before removing other tags, this is useful so - // there is always a tag present for scaleDown to know that - // it can/will be reused and avoid 
deleting it - await expBackOff(() => { - return metrics.trackRequestRegion( - runner.awsRegion, - metrics.ec2CreateTagsAWSCallSuccess, - metrics.ec2CreateTagsAWSCallFailure, - () => { - return ec2 - .createTags({ - Resources: [runner.instanceId], - Tags: [{ Key: 'EBSVolumeReplacementRequestTm', Value: `${Math.floor(Date.now() / 1000)}` }], - }) - .promise(); - }, - ); - }); - console.debug(`[tryReuseRunner]: Reuse of runner ${runner.instanceId}: Created reuse tag`); - - await expBackOff(() => { - return metrics.trackRequestRegion( - runner.awsRegion, - metrics.ec2DeleteTagsAWSCallSuccess, - metrics.ec2DeleteTagsAWSCallFailure, - () => { - return ec2 - .deleteTags({ - Resources: [runner.instanceId], - Tags: [{ Key: 'GithubRunnerID' }, { Key: 'EphemeralRunnerFinished' }], - }) - .promise(); - }, - ); - }); - console.debug(`[tryReuseRunner]: Reuse of runner ${runner.instanceId}: Tags deleted`); - - await expBackOff(() => { - return metrics.trackRequestRegion( - runner.awsRegion, - metrics.ec2CreateReplaceRootVolumeTaskSuccess, - metrics.ec2CreateReplaceRootVolumeTaskFailure, - () => { - return ec2 - .createReplaceRootVolumeTask({ - InstanceId: runner.instanceId, - DeleteReplacedRootVolume: true, - }) - .promise(); - }, - ); - }); - console.debug(`[tryReuseRunner]: Reuse of runner ${runner.instanceId}: Replace volume task created`); - - await addSSMParameterRunnerConfig( - [runner.instanceId], - runnerParameters, - false, - ssm, - metrics, - runner.awsRegion, - ); - console.debug(`[tryReuseRunner]: Reuse of runner ${runner.instanceId}: Ssm parameter created`); - }, - undefined, - 180, - 0.05, - ); - - if (runnerParameters.orgName !== undefined) { - metrics.runnersReuseSuccessOrg( - runners.length, - runnerParameters.orgName, - runnerParameters.runnerType.runnerTypeName, - ); - } else if (runnerParameters.repoName !== undefined) { - metrics.runnersReuseSuccessRepo( - runners.length, - getRepo(runnerParameters.repoName), - runnerParameters.runnerType.runnerTypeName, - ); 
- } - - return runner; - } catch (e) { - console.debug( - `[tryReuseRunner]: something happened preventing to reuse runnerid ` + - `${runner.instanceId}, either an error or it is already locked to be reused ${e}`, - ); - - if (runnerParameters.orgName !== undefined) { - metrics.runnersReuseFailureOrg( - runners.length, - runnerParameters.orgName, - runnerParameters.runnerType.runnerTypeName, - ); - } else if (runnerParameters.repoName !== undefined) { - metrics.runnersReuseFailureRepo( - runners.length, - getRepo(runnerParameters.repoName), - runnerParameters.runnerType.runnerTypeName, - ); - } - } - } - - if (runnerParameters.orgName !== undefined) { - metrics.runnersReuseGiveUpOrg(runners.length, runnerParameters.orgName, runnerParameters.runnerType.runnerTypeName); - } else if (runnerParameters.repoName !== undefined) { - metrics.runnersReuseGiveUpRepo( - runners.length, - getRepo(runnerParameters.repoName), - runnerParameters.runnerType.runnerTypeName, - ); - } - - throw new Error('No runners available'); -} - export async function createRunner(runnerParameters: RunnerInputParameters, metrics: Metrics): Promise { try { console.debug('Runner configuration: ' + JSON.stringify(runnerParameters)); @@ -691,7 +481,14 @@ export async function createRunner(runnerParameters: RunnerInputParameters, metr const tags = [ { Key: 'Application', Value: 'github-action-runner' }, { Key: 'RunnerType', Value: runnerParameters.runnerType.runnerTypeName }, + { Key: 'RepositoryOwner', Value: runnerParameters.repositoryOwner }, + { Key: 'RepositoryName', Value: runnerParameters.repositoryName }, ]; + + if (runnerParameters.runnerType.labels) { + tags.push({ Key: 'RunnerTypeLabels', Value: runnerParameters.runnerType.labels.join(',') ?? 
'' }); + } + /* istanbul ignore next */ if (Config.Instance.datetimeDeploy) { tags.push({ Key: 'ApplicationDeployDatetime', Value: Config.Instance.datetimeDeploy }); @@ -702,12 +499,24 @@ export async function createRunner(runnerParameters: RunnerInputParameters, metr Value: runnerParameters.repoName, }); } + if (runnerParameters.orgName !== undefined) { tags.push({ Key: 'Org', Value: runnerParameters.orgName, }); } + + /* istanbul ignore next */ + if (Config.Instance.runnersExtraLabels) { + tags.push({ Key: 'RunnerExtraLabels', Value: Config.Instance.runnersExtraLabels }); + } + + /* istanbul ignore next */ + if (Config.Instance.runnerGroupName) { + tags.push({ Key: 'RunnerGroupName', Value: Config.Instance.runnerGroupName }); + } + let customAmi = runnerParameters.runnerType.ami; let customAmiExperiment = false; if (runnerParameters.runnerType.ami_experiment) { diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-try-reuse-runner-utils.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-try-reuse-runner-utils.ts new file mode 100644 index 0000000000..6ef8621ef4 --- /dev/null +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-try-reuse-runner-utils.ts @@ -0,0 +1,383 @@ +import moment from 'moment'; +import { getJoinedStressTestExperiment, redisLocked } from './cache'; +import { Config } from './config'; +import { Metrics, ScaleUpMetrics } from './metrics'; +import { getParameterNameForRunner, ListRunnerFilters, listRunners, RunnerInputParameters } from './runners'; +import { RetryableScalingError } from './scale-up'; +import { expBackOff, getRepo, RunnerInfo, shuffleArrayInPlace } from './utils'; +import { EC2, SSM } from 'aws-sdk'; + +export async function tryReuseRunner( + runnerParameters: RunnerInputParameters, + metrics: ScaleUpMetrics, +): Promise { + const filters: ListRunnerFilters = buildRunnerFilters(runnerParameters); + + if 
(await getJoinedStressTestExperiment('stresstest_awsfail', runnerParameters.runnerType.runnerTypeName)) { + console.warn( + `Joining stress test stresstest_awsfail, failing AWS reuse for ${runnerParameters.runnerType.runnerTypeName}`, + ); + throw new RetryableScalingError('Stress test stockout'); + } + + const runners = shuffleArrayInPlace(await listRunners(metrics, filters)); + + /* istanbul ignore next */ + if (runnerParameters.orgName !== undefined) { + metrics.runnersReuseFoundOrg(runners.length, runnerParameters.orgName, runnerParameters.runnerType.runnerTypeName); + } else if (runnerParameters.repoName !== undefined) { + metrics.runnersReuseFoundRepo( + runners.length, + getRepo(runnerParameters.repoName), + runnerParameters.runnerType.runnerTypeName, + ); + } + + const ec2M: Map = new Map(); + const ssmM: Map = new Map(); + + for (const runner of runners) { + try { + if (!isRunnerReusable(runner,'tryReuseRunner')) continue; + + // additional check for scale-up + if (runner.ephemeralRunnerFinished !== undefined) { + const finishedAt = moment.unix(runner.ephemeralRunnerFinished); + // when runner.ephemeralRunnerFinished is set, it indicates that the runner is at post-test stage of github, + // there are some left cleanup in the ec2 instancdes, this gives the buffer to make sure we handle it gracefully. 
+        if (finishedAt > moment(new Date()).subtract(1, 'minutes').utc()) {
+          console.debug(`[tryReuseRunner]: Runner ${runner.instanceId} finished a job less than a minute ago`);
+          continue;
+        }
+      }
+      logRunnerScope(runnerParameters, metrics);
+
+      // applies a redis lock to avoid race conditions between multiple scale-up/scale-down pipelines
+      await redisLocked(
+        `tryReuseRunner`,
+        runner.instanceId,
+        async () => {
+          // I suspect it will be too many requests against GH API to check if runner is really offline
+          const ssm = getOrInit(ssmM, runner.awsRegion, () => new SSM({ region: runner.awsRegion }));
+          const ec2 = getOrInit(ec2M, runner.awsRegion, () => new EC2({ region: runner.awsRegion }));
+
+          // Must complete (hence awaited) before the other tags are removed, so
+          // there is always a tag present for scaleDown to know that
+          // it can/will be reused and avoid deleting it.
+          await createTagForReuse(ec2, runner, metrics);
+          console.debug(`[tryReuseRunner]: Reuse of runner ${runner.instanceId}: Created reuse tag`);
+
+          // Delete EphemeralRunnerFinished tag to make sure other pipelines do not
+          // pick this instance up since it's in next stage, in this case, it's in the ReplaceVolume stage.
+          await deleteTagForReuse(ec2, runner, metrics);
+          console.debug(`[tryReuseRunner]: Reuse of runner ${runner.instanceId}: Tags deleted`);
+
+          await replaceRootVolume(ec2, runner, metrics);
+          console.debug(`[tryReuseRunner]: Reuse of runner ${runner.instanceId}: Replace volume task created`);
+
+          await addSSMParameterRunnerConfig(
+            [runner.instanceId],
+            runnerParameters,
+            false,
+            ssm,
+            metrics,
+            runner.awsRegion,
+          );
+          console.debug(`[tryReuseRunner]: Reuse of runner ${runner.instanceId}: Ssm parameter created`);
+        },
+        undefined,
+        180,
+        0.05,
+      );
+
+      logReuseSucces(runnerParameters, metrics, runners.length);
+      return runner;
+    } catch (e) {
+      console.debug(
+        `[tryReuseRunner]: something happened preventing to reuse runnerid ` +
+          `${runner.instanceId}, either an error or it is already locked to be reused ${e}`,
+      );
+      logReuseFailure(runnerParameters, metrics, runners.length);
+    }
+  }
+
+  logReuseGiveup(runnerParameters, metrics, runners.length);
+  throw new Error('No runners available');
+}
+
+/** Refreshes one given runner in place (re-tag, replace root volume, rewrite SSM config); errors are logged, not thrown. */
+export async function tryRefreshRunner(
+  runnerParameters: RunnerInputParameters,
+  metrics: ScaleUpMetrics,
+  runner: RunnerInfo,
+): Promise<void> {
+  try {
+    if (!isRunnerReusable(runner, 'tryRefreshRunner')) {
+      console.debug(`[tryRefreshRunner][Skip]: Runner ${runner.instanceId} is not reusable`);
+      return;
+    }
+
+    logRunnerScope(runnerParameters, metrics);
+
+
+    // applies a redis lock to avoid race conditions between multiple scale-up/scale-down pipelines
+    await redisLocked(
+      `tryRefreshRunner`,
+      runner.instanceId,
+      async () => {
+
+        // set new ssm and ec2 clients
+        const ssm = new SSM({ region: runner.awsRegion });
+        const ec2 = new EC2({ region: runner.awsRegion });
+
+        await createTagForReuse(ec2, runner, metrics);
+        console.debug(`[tryRefreshRunner]: Refrehing runner ${runner.instanceId}: Created reuse tag`);
+
+        // Delete EphemeralRunnerFinished tag to make sure other pipelines do not
+        // pick this instance up since it's in next stage, in this case, it's in the ReplaceVolume stage.
+        await deleteTagForReuse(ec2, runner, metrics);
+        console.debug(`[tryRefreshRunner]: Refrehing runner ${runner.instanceId}: Tags deleted`);
+
+        await replaceRootVolume(ec2, runner, metrics);
+        console.debug(`[tryRefreshRunner]: Reuse of runner ${runner.instanceId}: Replace volume task created`);
+
+        await addSSMParameterRunnerConfig(
+          [runner.instanceId],
+          runnerParameters,
+          false,
+          ssm,
+          metrics,
+          runner.awsRegion,
+        );
+        console.debug(`[tryRefreshRunner]: Refrehing runner ${runner.instanceId}: Ssm parameter created`);
+      },
+      undefined,
+      180,
+      0.05,
+    );
+    logReuseSucces(runnerParameters, metrics, 1);
+  } catch (e) {
+    console.debug(
+      `[tryRefreshRunner]: something happened preventing to reuse runnerid ` +
+        `${runner.instanceId}, either an error or it is already locked to be reused ${e}`,
+    );
+    logReuseFailure(runnerParameters, metrics, 1);
+  }
+}
+
+function buildRunnerFilters(params: RunnerInputParameters): ListRunnerFilters {
+  return {
+    applicationDeployDatetime: Config.Instance.datetimeDeploy,
+    containsTags: ['GithubRunnerID', 'EphemeralRunnerFinished'],
+    environment: params.environment,
+    instanceType: params.runnerType.instance_type,
+    orgName: params.orgName,
+    repoName: params.repoName,
+    runnerType: params.runnerType.runnerTypeName,
+  };
+}
+
+function getOrInit<T>(map: Map<string, T>, key: string, init: () => T): T {
+  if (!map.has(key)) map.set(key, init());
+  return map.get(key)!;
+}
+
+/**
+ *
+ * Create tags for ec2 instance ready for reuse
+ * EBSVolumeReplacementRequestTm: record when was last time the task to replace volume was created.
+ * scale-down pipeline will not delete the runner if the EBSVolumeReplacementRequestTm is present
+ * and it's less than 5 mins.
+ * Stage: record the stage of the runner, in this case, it's in the ReplaceEBSVolume.
+ * Refresh and scaleup pipelines will not reuse the runner if the Stage is present and it's ReplaceEBSVolume.
+ * the stage tag will be removed once the replace volume task is completed at job's startup.sh + * @param ec2 + * @param runner + * @param metrics + */ +async function createTagForReuse(ec2: EC2, runner: RunnerInfo, metrics: ScaleUpMetrics) { + await expBackOff(() => + metrics.trackRequestRegion( + runner.awsRegion, + metrics.ec2CreateTagsAWSCallSuccess, + metrics.ec2CreateTagsAWSCallFailure, + () => + ec2 + .createTags({ + Resources: [runner.instanceId], + Tags: [ + { Key: 'EBSVolumeReplacementRequestTm', Value: `${Math.floor(Date.now() / 1000)}` }, + { Key: 'Stage', Value: 'ReplaceEBSVolume' }, + ], + }) + .promise(), + ), + ); +} + +async function deleteTagForReuse(ec2: EC2, runner: RunnerInfo, metrics: ScaleUpMetrics) { + await expBackOff(() => { + return metrics.trackRequestRegion( + runner.awsRegion, + metrics.ec2DeleteTagsAWSCallSuccess, + metrics.ec2DeleteTagsAWSCallFailure, + () => { + return ec2 + .deleteTags({ + Resources: [runner.instanceId], + Tags: [{ Key: 'GithubRunnerID' }, { Key: 'EphemeralRunnerFinished' }], + }) + .promise(); + }, + ); + }); +} + +async function replaceRootVolume(ec2: EC2, runner: RunnerInfo, metrics: ScaleUpMetrics) { + await expBackOff(() => + metrics.trackRequestRegion( + runner.awsRegion, + metrics.ec2CreateReplaceRootVolumeTaskSuccess, + metrics.ec2CreateReplaceRootVolumeTaskFailure, + () => + ec2 + .createReplaceRootVolumeTask({ + InstanceId: runner.instanceId, + DeleteReplacedRootVolume: true, + }) + .promise(), + ), + ); +} + +// -------------------------------------------helper functions ----------------------------------------------- + +export async function addSSMParameterRunnerConfig( + instancesId: string[], + runnerParameters: RunnerInputParameters, + customAmiExperiment: boolean, + ssm: SSM, + metrics: Metrics, + awsRegion: string, +): Promise { + /* istanbul ignore next */ + if (instancesId.length == 0) { + console.debug(`[${awsRegion}] No SSM parameter to be created, empty list of instances`); + return; + } + + 
let runnerConfig = await runnerParameters.runnerConfig(awsRegion, customAmiExperiment); + if (customAmiExperiment) { + runnerConfig = `${runnerConfig} #ON_AMI_EXPERIMENT`; + } + + const createdSSMParams = await Promise.all( + /* istanbul ignore next */ + instancesId.map(async (instanceId) => { + const parameterName = getParameterNameForRunner(runnerParameters.environment, instanceId); + return await expBackOff(() => { + return metrics.trackRequestRegion( + awsRegion, + metrics.ssmPutParameterAWSCallSuccess, + metrics.ssmPutParameterAWSCallFailure, + async () => { + await ssm + .putParameter({ + Name: parameterName, + Value: runnerConfig, + Type: 'SecureString', + }) + .promise(); + return parameterName; + }, + ); + }); + }) ?? [], + ); + console.debug(`[${awsRegion}] Created SSM Parameters(s): ${createdSSMParams.join(',')}`); +} + +function logRunnerScope(runnerParameters: RunnerInputParameters, metrics: ScaleUpMetrics) { + if (runnerParameters.orgName !== undefined) { + metrics.runnersReuseTryOrg(1, runnerParameters.orgName, runnerParameters.runnerType.runnerTypeName); + } else if (runnerParameters.repoName !== undefined) { + metrics.runnersReuseTryRepo(1, getRepo(runnerParameters.repoName), runnerParameters.runnerType.runnerTypeName); + } +} + +function isRunnerReusable(runner: RunnerInfo, useCase:string): boolean { + if (runner.ghRunnerId === undefined) { + console.debug(`[${useCase}]: Runner ${runner.instanceId} does not have a GithubRunnerID tag`); + return false; + } + if (runner.awsRegion === undefined) { + console.debug(`[${useCase}]: Runner ${runner.instanceId} does not have a region`); + return false; + } + if (runner.org === undefined && runner.repo === undefined) { + console.debug(`[${useCase}]: Runner ${runner.instanceId} does not have org or repo`); + return false; + } + + if (runner.stage !== undefined && runner.stage === 'ReplaceEBSVolume') { + console.debug( + `[${useCase}]: Runner ${runner.instanceId} the runner is in ReplaceEBSVolume stage, skip 
to reuse it`, + ); + return false; + } + + if (runner.ephemeralRunnerFinished !== undefined) { + const finishedAt = moment.unix(runner.ephemeralRunnerFinished); + + // since the runner finshed the previous github job, it's idling for a long time that it is likely to + // be caught in scale-down pipeline, we do not reuse it to avoid the race condition. + if (finishedAt.add(Config.Instance.minimumRunningTimeInMinutes, 'minutes') < moment(new Date()).utc()) { + console.debug( + `[${useCase}]: Runner ${runner.instanceId} has been idle for over minimumRunningTimeInMinutes time of ` + + `${Config.Instance.minimumRunningTimeInMinutes} mins, so it's likely to be reclaimed soon and should ` + + `not be reused. It's been idle since ${finishedAt.format()}`, + ); + return false; + } + } + + return true; +} + +// ------------------------------------------- Metrics loggings ----------------------------------------------- + +function logReuseSucces(runnerParameters: RunnerInputParameters, metrics: ScaleUpMetrics, runnerLength: number) { + if (runnerParameters.orgName !== undefined) { + metrics.runnersReuseSuccessOrg(runnerLength, runnerParameters.orgName, runnerParameters.runnerType.runnerTypeName); + } else if (runnerParameters.repoName !== undefined) { + metrics.runnersReuseSuccessRepo( + runnerLength, + getRepo(runnerParameters.repoName), + runnerParameters.runnerType.runnerTypeName, + ); + } +} + +function logReuseFailure(runnerParameters: RunnerInputParameters, metrics: ScaleUpMetrics, runnerLength: number) { + if (runnerParameters.orgName !== undefined) { + metrics.runnersReuseFailureOrg(runnerLength, runnerParameters.orgName, runnerParameters.runnerType.runnerTypeName); + } else if (runnerParameters.repoName !== undefined) { + metrics.runnersReuseFailureRepo( + runnerLength, + getRepo(runnerParameters.repoName), + runnerParameters.runnerType.runnerTypeName, + ); + } +} + +function logReuseGiveup(runnerParameters: RunnerInputParameters, metrics: ScaleUpMetrics, 
runnerLength: number) { + if (runnerParameters.orgName !== undefined) { + metrics.runnersReuseGiveUpOrg(runnerLength, runnerParameters.orgName, runnerParameters.runnerType.runnerTypeName); + } else if (runnerParameters.repoName !== undefined) { + metrics.runnersReuseGiveUpRepo( + runnerLength, + getRepo(runnerParameters.repoName), + runnerParameters.runnerType.runnerTypeName, + ); + } +} diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts index 0be90e1822..d3c3ed38d6 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts @@ -1,4 +1,4 @@ -import { createRunner, tryReuseRunner } from './runners'; +import { createRunner } from './runners'; import { createRegistrationTokenOrg, createRegistrationTokenRepo, @@ -16,6 +16,7 @@ import { scaleUp, _calculateScaleUpAmount } from './scale-up'; import * as MetricsModule from './metrics'; import { getJoinedStressTestExperiment } from './cache'; import { sleep } from './utils'; +import { tryReuseRunner } from './scale-up-try-reuse-runner-utils'; jest.mock('./cache'); jest.mock('./gh-issues'); @@ -321,6 +322,8 @@ describe('scaleUp', () => { runnerConfig: expect.any(Function), orgName: repo.owner, runnerType: runnerType1, + repositoryName: repo.repo, + repositoryOwner: repo.owner, }, metrics, ); @@ -406,6 +409,8 @@ describe('scaleUp', () => { runnerConfig: expect.any(Function), repoName: 'owner/repo', runnerType: runnerType1, + repositoryName: repo.repo, + repositoryOwner: repo.owner, }, metrics, ); @@ -491,6 +496,8 @@ describe('scaleUp', () => { runnerConfig: expect.any(Function), repoName: 'owner/repo', runnerType: runnerType1, + repositoryName: repo.repo, + repositoryOwner: repo.owner, }, metrics, ); @@ -575,6 +582,8 @@ 
describe('scaleUp', () => { runnerConfig: expect.any(Function), repoName: 'owner/repo', runnerType: runnerType1, + repositoryName: repo.repo, + repositoryOwner: repo.owner, }, metrics, ); @@ -660,6 +669,8 @@ describe('scaleUp', () => { runnerConfig: expect.any(Function), repoName: 'owner/repo', runnerType: runnerType1, + repositoryName: repo.repo, + repositoryOwner: repo.owner, }, metrics, ); @@ -745,6 +756,8 @@ describe('scaleUp', () => { runnerConfig: expect.any(Function), repoName: 'owner/repo', runnerType: runnerType1, + repositoryName: repo.repo, + repositoryOwner: repo.owner, }, metrics, ); @@ -830,6 +843,8 @@ describe('scaleUp', () => { runnerConfig: expect.any(Function), repoName: 'owner/repo', runnerType: runnerType1, + repositoryName: repo.repo, + repositoryOwner: repo.owner, }, metrics, ); @@ -945,10 +960,12 @@ describe('scaleUp', () => { expect(mockedCreateRunner).toBeCalledWith( { environment: config.environment, - // eslint-disable-next-line max-len - runnerConfig: expect.any(Function), repoName: 'owner/repo', runnerType: runnerType1, + repositoryName: 'repo', + repositoryOwner: 'owner', + // eslint-disable-next-line max-len + runnerConfig: expect.any(Function), }, metrics, ); diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts index 65752c9138..bf3bef0e23 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts @@ -1,6 +1,6 @@ import { Metrics, ScaleUpMetrics } from './metrics'; import { Repo, getRepoKey, sleep } from './utils'; -import { RunnerType, RunnerInputParameters, createRunner, tryReuseRunner } from './runners'; +import { RunnerType, RunnerInputParameters, createRunner } from './runners'; import { createRegistrationTokenOrg, createRegistrationTokenRepo, @@ -13,6 
+13,7 @@ import { import { Config } from './config'; import { getRepoIssuesWithLabel } from './gh-issues'; import { getJoinedStressTestExperiment } from './cache'; +import { tryReuseRunner } from './scale-up-try-reuse-runner-utils'; export interface ActionRequestMessage { id: number; @@ -114,6 +115,8 @@ export async function scaleUp( ); }, runnerType: runnerType, + repositoryOwner: repo.owner, + repositoryName: repo.repo, }; if (Config.Instance.enableOrganizationRunners) { createRunnerParams.orgName = repo.owner; @@ -178,32 +181,30 @@ async function createRunnerConfigArgument( ); await sleep(60 * 1000); } - - const ephemeralArgument = runnerType.is_ephemeral || experimentalRunner ? '--ephemeral' : ''; - const labelsArgument = [ - `AWS:${awsRegion}`, - `${runnerType.runnerTypeName}`, - ...(experimentalRunner ? ['experimental.ami'] : []), - ...(Config.Instance.runnersExtraLabels ? Config.Instance.runnersExtraLabels.split(',') : []), - ...(runnerType.labels ?? []), - ].join(','); - - if (Config.Instance.enableOrganizationRunners) { - /* istanbul ignore next */ - const runnerGroupArgument = - Config.Instance.runnerGroupName !== undefined ? `--runnergroup ${Config.Instance.runnerGroupName}` : ''; - const token = await createRegistrationTokenOrg(repo.owner, metrics, installationId); - return ( - `--url ${Config.Instance.ghesUrlHost}/${repo.owner} ` + - `--token ${token} --labels ${labelsArgument} ${ephemeralArgument} ${runnerGroupArgument}` - ); - } else { - const token = await createRegistrationTokenRepo(repo, metrics, installationId); - return ( - `--url ${Config.Instance.ghesUrlHost}/${repo.owner}/${repo.repo} ` + - `--token ${token} --labels ${labelsArgument} ${ephemeralArgument}` - ); - } + const runnerTypeName = runnerType.runnerTypeName; + const isEphemeral = runnerType.is_ephemeral || experimentalRunner; + const runnerTypeLabels = runnerType.labels; + + const extraRunnerLabels = Config.Instance.runnersExtraLabels?.split(',') ?? 
[];
+  const isOrgRunner = Config.Instance.enableOrganizationRunners;
+  const runnerGroupName = Config.Instance.runnerGroupName;
+  const ghesUrlHost = Config.Instance.ghesUrlHost;
+
+  return innerCreateRunnerConfigArgument(
+    runnerTypeName,
+    repo.repo,
+    repo.owner,
+    awsRegion,
+    metrics,
+    ghesUrlHost,
+    isOrgRunner,
+    isEphemeral,
+    experimentalRunner,
+    extraRunnerLabels,
+    runnerTypeLabels,
+    runnerGroupName,
+    installationId,
+  );
 }
 
 async function shouldSkipForRepo(repo: Repo, metrics: Metrics): Promise {
@@ -390,3 +391,65 @@ export function _calculateScaleUpAmount(
 
   return scaleUpAmount;
 }
+
+/**
+ * Builds the runner registration argument string (--url, --token, --labels, --ephemeral, --runnergroup).
+ * @param runnerTypeName runner type name; always added as a label
+ * @param repositoryName repository name; used for the URL and token of repo-level runners
+ * @param repositoryOwner repository owner / organization; used for the URL and the token
+ * @param awsRegion region of the instance; added as the `AWS:<region>` label
+ * @param metrics metrics object handed to the registration-token helpers
+ * @param ghesUrlHost host prefix of the `--url` argument
+ * @param isOrgRunner true: register at org level (org token); false: register at repo level
+ * @param isEphemeral when true, `--ephemeral` is appended
+ * @param experimentalRunner when true, the `experimental.ami` label is added
+ * @param runnersExtraLabels extra labels, placed before runnerLabels in `--labels`
+ * @param runnerLabels labels of the runner type, placed last in `--labels`
+ * @param runnerGroupName org runners only: when defined, `--runnergroup <name>` is appended
+ * @param installationId optional id forwarded to the registration-token helpers
+ * @returns the assembled argument string, containing a freshly created registration token
+ */
+export async function innerCreateRunnerConfigArgument(
+  runnerTypeName: string,
+  repositoryName: string,
+  repositoryOwner: string,
+  awsRegion: string,
+  metrics: Metrics,
+  ghesUrlHost: string,
+  isOrgRunner: boolean,
+  isEphemeral: boolean,
+  experimentalRunner: boolean,
+  runnersExtraLabels?: string[] | undefined,
+  runnerLabels?: string[] | undefined,
+  runnerGroupName?: string | undefined,
+  installationId?: number | undefined,
+): Promise<string> {
+  const ephemeralArgument = isEphemeral ? '--ephemeral' : '';
+  const labelsArgument = [
+    `AWS:${awsRegion}`,
+    runnerTypeName,
+    ...(experimentalRunner ? ['experimental.ami'] : []),
+    ...(runnersExtraLabels ? runnersExtraLabels : []),
+    ...(runnerLabels ?? []),
+  ].join(',');
+
+  if (isOrgRunner) {
+    /* istanbul ignore next */
+    const runnerGroupArgument = runnerGroupName !== undefined ? `--runnergroup ${runnerGroupName}` : '';
+    const token = await createRegistrationTokenOrg(repositoryOwner, metrics, installationId);
+    return (
+      `--url ${ghesUrlHost}/${repositoryOwner} ` +
+      `--token ${token} --labels ${labelsArgument} ${ephemeralArgument} ${runnerGroupArgument}`
+    );
+  } else {
+    const token = await createRegistrationTokenRepo(
+      { repo: repositoryName, owner: repositoryOwner },
+      metrics,
+      installationId,
+    );
+    return (
+      `--url ${ghesUrlHost}/${repositoryOwner}/${repositoryName} ` +
+      `--token ${token} --labels ${labelsArgument} ${ephemeralArgument}`
+    );
+  }
+}
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/utils.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/utils.ts
index ca944a943e..065a979d3a 100644
--- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/utils.ts
+++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/utils.ts
@@ -16,9 +16,15 @@ export interface RunnerInfo {
   instanceId: string;
   instanceManagement?: string;
   launchTime?: Date;
+  repositoryOwner?: string;
+  repositoryName?: string;
+  runnerTypeLabels?: string[];
+  runnerExtraLabels?: string[];
+  runnerGroupName?: string;
   org?: string;
   repo?: string;
   runnerType?: string;
+  stage?: string;
 }
 
 export function getRepoKey(repo: Repo): string {