Skip to content

Commit

Permalink
Add retry mechanism for scaling errors
Browse files Browse the repository at this point in the history
Add retry mechanism for scaling errors

Add retry mechanism for scaling errors

Add retry mechanism for scaling errors
  • Loading branch information
npalm committed Nov 30, 2021
1 parent a537f9a commit 65129c8
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 30 deletions.
18 changes: 12 additions & 6 deletions examples/default/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@ module "runners" {
webhook_secret = random_password.random.result
}

webhook_lambda_zip = "lambdas-download/webhook.zip"
runner_binaries_syncer_lambda_zip = "lambdas-download/runner-binaries-syncer.zip"
runners_lambda_zip = "lambdas-download/runners.zip"
enable_organization_runners = false
runner_extra_labels = "default,example"
# webhook_lambda_zip = "lambdas-download/webhook.zip"
# runner_binaries_syncer_lambda_zip = "lambdas-download/runner-binaries-syncer.zip"
# runners_lambda_zip = "lambdas-download/runners.zip"
enable_organization_runners = true
runner_extra_labels = "default,example"

# enable access to the runners via SSM
enable_ssm_on_runners = true
Expand All @@ -61,8 +61,14 @@ module "runners" {
instance_types = ["m5.large", "c5.large"]

# override delay of events in seconds
delay_webhook_event = 0
delay_webhook_event = 10
//job_queue_retention_in_seconds = 600
//job_queue_retention_in_seconds = 60
runners_maximum_count = 1

# override scaling down
scale_down_schedule_expression = "cron(* * * * ? *)"

enable_ephemeral_runners = true
disable_check_wokflow_job_labels = true
}
26 changes: 17 additions & 9 deletions modules/runners/lambdas/runners/src/lambda.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,29 @@ import { scaleUp } from './scale-runners/scale-up';
import { scaleDown } from './scale-runners/scale-down';
import { SQSEvent, ScheduledEvent, Context, Callback } from 'aws-lambda';
import { logger } from './scale-runners/logger';
import ScaleError from './scale-runners/ScaleError';
import 'source-map-support/register';

export async function scaleUpHandler(event: SQSEvent, context: Context, callback: Callback): Promise<void> {
logger.setSettings({ requestId: context.awsRequestId });
logger.debug(JSON.stringify(event));
try {
for (const e of event.Records) {
await scaleUp(e.eventSource, JSON.parse(e.body));
}

callback(null);
} catch (e) {
logger.error(e);
callback('Failed handling SQS event');
// TODO: find a more elegant way :(
if (event.Records.length != 1) {
logger.warn('Event ignored, only on record at the time can be handled, ensure the lambda batch size is set to 1.');
return new Promise((resolve) => resolve());
}

return new Promise((resolve, reject) => {
scaleUp(event.Records[0].eventSource, JSON.parse(event.Records[0].body))
.then(() => resolve())
.catch((e: Error) => {
if (e instanceof ScaleError) {
reject(e);
} else {
logger.warn('Ignoring error: ', e);
}
});
});
}

export async function scaleDownHandler(event: ScheduledEvent, context: Context, callback: Callback): Promise<void> {
Expand Down
21 changes: 12 additions & 9 deletions modules/runners/lambdas/runners/src/scale-runners/runners.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,16 +85,19 @@ export async function createRunner(runnerParameters: RunnerInputParameters, laun
.runInstances(getInstanceParams(launchTemplateName, runnerParameters))
.promise();
logger.info('Created instance(s): ', runInstancesResponse.Instances?.map((i) => i.InstanceId).join(','));

const ssm = new SSM();
runInstancesResponse.Instances?.forEach(async (i: EC2.Instance) => {
await ssm
.putParameter({
Name: runnerParameters.environment + '-' + (i.InstanceId as string),
Value: runnerParameters.runnerServiceConfig,
Type: 'SecureString',
})
.promise();
});
if (runInstancesResponse.Instances != undefined) {
for (let i = 0; i < runInstancesResponse.Instances?.length; i++) {
await ssm
.putParameter({
Name: runnerParameters.environment + '-' + (runInstancesResponse.Instances[i].InstanceId as string),
Value: runnerParameters.runnerServiceConfig,
Type: 'SecureString',
})
.promise();
}
}
}

function getInstanceParams(
Expand Down
13 changes: 7 additions & 6 deletions modules/runners/lambdas/runners/src/scale-runners/scale-up.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { createOctoClient, createGithubAppAuth, createGithubInstallationAuth } f
import yn from 'yn';
import { Octokit } from '@octokit/rest';
import { logger as rootLogger } from './logger';
import ScaleError from './ScaleError';

const logger = rootLogger.getChildLogger();

Expand Down Expand Up @@ -57,17 +58,14 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
const runnerType = enableOrgLevel ? 'Org' : 'Repo';
const runnerOwner = enableOrgLevel ? payload.repositoryOwner : `${payload.repositoryOwner}/${payload.repositoryName}`;

const isQueued = await getJobStatus(githubInstallationClient, payload);
// ephemeral runners should be created on every event, will only work with `workflow_job` events.
if (ephemeral || isQueued) {
if (ephemeral || (await getJobStatus(githubInstallationClient, payload))) {
const currentRunners = await listEC2Runners({
environment,
runnerType,
runnerOwner,
});
logger.info(`${runnerType} ${runnerOwner} has ${currentRunners.length}/${maximumRunners} runners`);

// TODO: how to handle the event if the max is reached in case of ephemeral runners
if (currentRunners.length < maximumRunners) {
console.info(`Attempting to launch a new runner`);
// create token
Expand All @@ -94,7 +92,10 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
runnerType,
});
} else {
logger.info('No runner will be created, maximum number of runners reached.');
logger.warn('No runner created: maximum number of runners reached.');
if (ephemeral) {
throw new ScaleError('No runners create: maximum of runners reached.');
}
}
}
}
Expand Down Expand Up @@ -139,6 +140,6 @@ export async function createRunnerLoop(runnerParameters: RunnerInputParameters):
}
}
if (launched == false) {
throw Error('All launch templates failed');
throw new ScaleError('All launch templates failed');
}
}
1 change: 1 addition & 0 deletions modules/runners/scale-up.tf
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ resource "aws_cloudwatch_log_group" "scale_up" {
resource "aws_lambda_event_source_mapping" "scale_up" {
event_source_arn = var.sqs_build_queue.arn
function_name = aws_lambda_function.scale_up.arn
batch_size = 1
}

resource "aws_lambda_permission" "scale_runners_lambda" {
Expand Down

0 comments on commit 65129c8

Please sign in to comment.