feat: added changes to enable tracing in lambdas. (#3554)

This PR addresses the need to enable tracing for the lambdas used in the runners architecture: # Highlights: - This feature enables tracing in all lambdas, which makes it possible to debug and investigate any issues that arise from day-to-day use of the runners infrastructure. - If the user enables all the features provided in this PR, they should be able to find the complete linked trace, from the moment a webhook is triggered by a workflow job event, through the API gateway endpoint, to the execution of the scale-up lambda that creates a new runner to meet the demand, and also find the relevant logs linked to the trace in **AWS CloudWatch ServiceLens**. As a result, the user need not navigate through various log groups to find an issue in any given service. - Please see the X-Ray pricing at this [link](https://aws.amazon.com/xray/pricing/), which details the cost involved in enabling this feature. # Additions: - [x] Provide an option to enable traces in the EC2 bash script, which makes it possible to find any issues that arise while starting the runner and to see this information linked in the trace created by this feature. # Options: The CloudWatch agent, which now supports capturing traces ([link](https://aws.amazon.com/about-aws/whats-new/2023/08/amazon-cloudwatch-agent-opentelemetry-traces-x-ray/)), can be used to capture traces and link them to the log groups. --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: Niek Palm <npalm@users.noreply.github.com>
philips-labs · Nov 8, 2023 · 970e8a6 · 970e8a6
1 parent d85511a
commit 970e8a6
Show file tree

Hide file tree

Showing 63 changed files with 531 additions and 197 deletions.
diff --git a/README.md b/README.md
@@ -32,6 +32,7 @@ This [Terraform](https://www.terraform.io/) module creates the required infrastr
 - [Examples](#examples)
 - [Sub modules](#sub-modules)
 - [Logging](#logging)
+- [Tracing](#tracing)
 - [Debugging](#debugging)
 - [Security Considerations](#security-considerations)
 - [Requirements](#requirements)
@@ -427,6 +428,17 @@ An example log message of the scale-up function:
     }
 }
 ```
+## Tracing
+Because of the distributed architecture of this application, it can be difficult to troubleshoot.
+We support the option to enable tracing for all the lambda functions created by this application. To enable tracing, simply provide the `tracing_config` option in the root module or the inner modules.
+
+This tracing config generates timelines for the following events:
+- Basic lifecycle of the lambda function
+- Traces for GitHub API calls (can be configured via `capture_http_requests`).
+- Traces for all AWS SDK calls
+
+This feature is disabled by default.
+
 
 ## Debugging
 
@@ -543,7 +555,7 @@ We welcome any improvement to the standard module to make the default as secure
 | <a name="input_lambda_s3_bucket"></a> [lambda\_s3\_bucket](#input\_lambda\_s3\_bucket) | S3 bucket from which to specify lambda functions. This is an alternative to providing local files directly. | `string` | `null` | no |
 | <a name="input_lambda_security_group_ids"></a> [lambda\_security\_group\_ids](#input\_lambda\_security\_group\_ids) | List of security group IDs associated with the Lambda function. | `list(string)` | `[]` | no |
 | <a name="input_lambda_subnet_ids"></a> [lambda\_subnet\_ids](#input\_lambda\_subnet\_ids) | List of subnets in which the action runners will be launched, the subnets needs to be subnets in the `vpc_id`. | `list(string)` | `[]` | no |
-| <a name="input_lambda_tracing_mode"></a> [lambda\_tracing\_mode](#input\_lambda\_tracing\_mode) | Enable X-Ray tracing for the lambda functions. | `string` | `null` | no |
+| <a name="input_lambda_tracing_mode"></a> [lambda\_tracing\_mode](#input\_lambda\_tracing\_mode) | DEPRECATED: Replaced by `tracing_config`. | `string` | `null` | no |
 | <a name="input_log_level"></a> [log\_level](#input\_log\_level) | Logging level for lambda logging. Valid values are  'silly', 'trace', 'debug', 'info', 'warn', 'error', 'fatal'. | `string` | `"info"` | no |
 | <a name="input_logging_kms_key_id"></a> [logging\_kms\_key\_id](#input\_logging\_kms\_key\_id) | Specifies the kms key id to encrypt the logs with. | `string` | `null` | no |
 | <a name="input_logging_retention_in_days"></a> [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Specifies the number of days you want to retain log events for the lambda log group. Possible values are: 0, 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1827, and 3653. | `number` | `180` | no |
@@ -593,6 +605,7 @@ We welcome any improvement to the standard module to make the default as secure
 | <a name="input_syncer_lambda_s3_key"></a> [syncer\_lambda\_s3\_key](#input\_syncer\_lambda\_s3\_key) | S3 key for syncer lambda function. Required if using an S3 bucket to specify lambdas. | `string` | `null` | no |
 | <a name="input_syncer_lambda_s3_object_version"></a> [syncer\_lambda\_s3\_object\_version](#input\_syncer\_lambda\_s3\_object\_version) | S3 object version for syncer lambda function. Useful if S3 versioning is enabled on source bucket. | `string` | `null` | no |
 | <a name="input_tags"></a> [tags](#input\_tags) | Map of tags that will be added to created resources. By default resources will be tagged with name and environment. | `map(string)` | `{}` | no |
+| <a name="input_tracing_config"></a> [tracing\_config](#input\_tracing\_config) | Configuration for lambda tracing. | <pre>object({<br>    mode                  = optional(string, null)<br>    capture_http_requests = optional(bool, false)<br>    capture_error         = optional(bool, false)<br>  })</pre> | `{}` | no |
 | <a name="input_userdata_post_install"></a> [userdata\_post\_install](#input\_userdata\_post\_install) | Script to be ran after the GitHub Actions runner is installed on the EC2 instances | `string` | `""` | no |
 | <a name="input_userdata_pre_install"></a> [userdata\_pre\_install](#input\_userdata\_pre\_install) | Script to be ran before the GitHub Actions runner is installed on the EC2 instances | `string` | `""` | no |
 | <a name="input_userdata_template"></a> [userdata\_template](#input\_userdata\_template) | Alternative user-data template, replacing the default template. By providing your own user\_data you have to take care of installing all required software, including the action runner. Variables userdata\_pre/post\_install are ignored. | `string` | `null` | no |

diff --git a/examples/ephemeral/main.tf b/examples/ephemeral/main.tf
@@ -69,11 +69,20 @@ module "runners" {
   #
   enable_job_queued_check = true
 
+  # tracing_config = {
+  #   mode                  = "Active"
+  #   capture_error         = true
+  #   capture_http_requests = true
+  # }
+
+
   # configure your pre-built AMI
   # enable_userdata = false
-  # ami_filter       = { name = ["github-runner-al2023-x86_64-*"], state = ["available"] }
-  # data "aws_caller_identity" "current" {}
-  # ami_owners       = [data.aws_caller_identity.current.account_id]
+  # ami_filter      = { name = ["github-runner-al2023-x86_64-*"], state = ["available"] }
+  # ami_owners      = [data.aws_caller_identity.current.account_id]
+
+  # or use the default AMI
+  # enable_userdata = true
 
   # Enable debug logging for the lambda functions
   # log_level = "debug"

diff --git a/examples/multi-runner/main.tf b/examples/multi-runner/main.tf
@@ -57,6 +57,12 @@ module "runners" {
     id             = var.github_app.id
     webhook_secret = random_id.random.hex
   }
+  # enable this section for tracing
+  # tracing_config = {
+  #   mode                  = "Active"
+  #   capture_error         = true
+  #   capture_http_requests = true
+  # }
   # Assuming local build lambda's to use pre build ones, uncomment the lines below and download the
   # lambda zip files lambda_download
   # webhook_lambda_zip                = "../lambdas-download/webhook.zip"

diff --git a/lambdas/functions/ami-housekeeper/src/ami.ts b/lambdas/functions/ami-housekeeper/src/ami.ts
@@ -10,6 +10,7 @@ import {
 } from '@aws-sdk/client-ec2';
 import { DescribeParametersCommand, GetParameterCommand, SSMClient } from '@aws-sdk/client-ssm';
 import { createChildLogger } from '@terraform-aws-github-runner/aws-powertools-util';
+import { getTracedAWSV3Client } from '@terraform-aws-github-runner/aws-powertools-util';
 
 const logger = createChildLogger('ami');
 
@@ -82,7 +83,7 @@ async function getAmisNotInUse(options: AmiCleanupOptions) {
   const amiIdsInSSM = await getAmisReferedInSSM(options);
   const amiIdsInTemplates = await getAmiInLatestTemplates(options);
 
-  const ec2Client = new EC2Client({});
+  const ec2Client = getTracedAWSV3Client(new EC2Client({}));
   logger.debug('Getting all AMIs from ec2 with filters', { filters: options.amiFilters });
   const amiEc2 = await ec2Client.send(
     new DescribeImagesCommand({
@@ -133,7 +134,7 @@ async function deleteAmi(amiDetails: Image, options: AmiCleanupOptionsInternal):
 
   try {
     logger.info(`deleting ami ${amiDetails.Name || amiDetails.ImageId} created at ${amiDetails.CreationDate}`);
-    const ec2Client = new EC2Client({});
+    const ec2Client = getTracedAWSV3Client(new EC2Client({}));
     await ec2Client.send(new DeregisterImageCommand({ ImageId: amiDetails.ImageId, DryRun: options.dryRun }));
     await deleteSnapshot(options, amiDetails, ec2Client);
   } catch (error) {
@@ -158,7 +159,7 @@ async function deleteSnapshot(options: AmiCleanupOptions, amiDetails: Image, ec2
 }
 
 async function getAmiInLatestTemplates(options: AmiCleanupOptions): Promise<(string | undefined)[]> {
-  const ec2Client = new EC2Client({});
+  const ec2Client = getTracedAWSV3Client(new EC2Client({}));
   const launnchTemplates = await ec2Client.send(
     new DescribeLaunchTemplatesCommand({
       LaunchTemplateNames: options.launchTemplateNames,
@@ -188,7 +189,7 @@ async function getAmisReferedInSSM(options: AmiCleanupOptions): Promise<(string
     return [];
   }
 
-  const ssmClient = new SSMClient({});
+  const ssmClient = getTracedAWSV3Client(new SSMClient({}));
   const ssmParams = await ssmClient.send(
     new DescribeParametersCommand({
       ParameterFilters: [

diff --git a/lambdas/functions/control-plane/package.json b/lambdas/functions/control-plane/package.json
@@ -40,11 +40,13 @@
   "dependencies": {
     "@aws-sdk/client-ec2": "^3.436.0",
     "@aws-sdk/types": "^3.433.0",
+    "@middy/core": "^3.6.2",
     "@octokit/auth-app": "6.0.1",
     "@octokit/rest": "20.0.2",
     "@octokit/types": "^12.1.1",
     "@terraform-aws-github-runner/aws-powertools-util": "*",
     "@terraform-aws-github-runner/aws-ssm-util": "*",
+    "axios": "^1.5.1",
     "cron-parser": "^4.8.1",
     "typescript": "^5.0.4"
   }

diff --git a/lambdas/functions/control-plane/src/aws/runners.d.ts b/lambdas/functions/control-plane/src/aws/runners.d.ts
@@ -39,4 +39,5 @@ export interface RunnerInputParameters {
   };
   numberOfRunners?: number;
   amiIdSsmParameterName?: string;
+  tracingEnabled?: boolean;
 }
diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts
@@ -10,6 +10,7 @@ import {
   TerminateInstancesCommand,
 } from '@aws-sdk/client-ec2';
 import { GetParameterCommand, GetParameterResult, PutParameterCommand, SSMClient } from '@aws-sdk/client-ssm';
+import { tracer } from '@terraform-aws-github-runner/aws-powertools-util';
 import { mockClient } from 'aws-sdk-client-mock';
 import 'aws-sdk-client-mock-jest';
 
@@ -236,6 +237,15 @@ describe('create runner', () => {
       Name: 'my-ami-id-param',
     });
   });
+  it('calls create fleet of 1 instance with runner tracing enabled', async () => {
+    tracer.getRootXrayTraceId = jest.fn().mockReturnValue('123');
+
+    await createRunner(createRunnerConfig({ ...defaultRunnerConfig, tracingEnabled: true }));
+
+    expect(mockEC2Client).toHaveReceivedCommandWith(CreateFleetCommand, {
+      ...expectedCreateFleetRequest({ ...defaultExpectedFleetRequestValues, tracingEnabled: true }),
+    });
+  });
 });
 
 describe('create runner with errors', () => {
@@ -350,6 +360,7 @@ interface RunnerConfig {
   allocationStrategy: SpotAllocationStrategy;
   maxSpotPrice?: string;
   amiIdSsmParameterName?: string;
+  tracingEnabled?: boolean;
 }
 
 function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
@@ -366,6 +377,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
     },
     subnets: ['subnet-123', 'subnet-456'],
     amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName,
+    tracingEnabled: runnerConfig.tracingEnabled,
   };
 }
 
@@ -376,6 +388,7 @@ interface ExpectedFleetRequestValues {
   maxSpotPrice?: string;
   totalTargetCapacity: number;
   imageId?: string;
+  tracingEnabled?: boolean;
 }
 
 function expectedCreateFleetRequest(expectedValues: ExpectedFleetRequestValues): CreateFleetCommandInput {
@@ -385,6 +398,10 @@ function expectedCreateFleetRequest(expectedValues: ExpectedFleetRequestValues):
     { Key: 'ghr:Type', Value: expectedValues.type },
     { Key: 'ghr:Owner', Value: REPO_NAME },
   ];
+  if (expectedValues.tracingEnabled) {
+    const traceId = tracer.getRootXrayTraceId();
+    tags.push({ Key: 'ghr:trace_id', Value: traceId! });
+  }
   const request: CreateFleetCommandInput = {
     LaunchTemplateConfigs: [
       {

diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts
@@ -9,6 +9,7 @@ import {
   _InstanceType,
 } from '@aws-sdk/client-ec2';
 import { createChildLogger } from '@terraform-aws-github-runner/aws-powertools-util';
+import { getTracedAWSV3Client, tracer } from '@terraform-aws-github-runner/aws-powertools-util';
 import { getParameter } from '@terraform-aws-github-runner/aws-ssm-util';
 import moment from 'moment';
 
@@ -56,7 +57,7 @@ function constructFilters(filters?: Runners.ListRunnerFilters): Ec2Filter[][] {
 }
 
 async function getRunners(ec2Filters: Ec2Filter[]): Promise<Runners.RunnerList[]> {
-  const ec2 = new EC2Client({ region: process.env.AWS_REGION });
+  const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION }));
   const runners: Runners.RunnerList[] = [];
   let nextToken;
   let hasNext = true;
@@ -93,7 +94,7 @@ function getRunnerInfo(runningInstances: DescribeInstancesResult) {
 }
 
 export async function terminateRunner(instanceId: string): Promise<void> {
-  const ec2 = new EC2Client({ region: process.env.AWS_REGION });
+  const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION }));
   await ec2.send(new TerminateInstancesCommand({ InstanceIds: [instanceId] }));
   logger.info(`Runner ${instanceId} has been terminated.`);
 }
@@ -126,7 +127,7 @@ export async function createRunner(runnerParameters: Runners.RunnerInputParamete
     },
   });
 
-  const ec2Client = new EC2Client({ region: process.env.AWS_REGION });
+  const ec2Client = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION }));
 
   let amiIdOverride = undefined;
 
@@ -145,13 +146,19 @@ export async function createRunner(runnerParameters: Runners.RunnerInputParamete
   }
 
   const numberOfRunners = runnerParameters.numberOfRunners ? runnerParameters.numberOfRunners : 1;
+
   const tags = [
     { Key: 'ghr:Application', Value: 'github-action-runner' },
     { Key: 'ghr:created_by', Value: numberOfRunners === 1 ? 'scale-up-lambda' : 'pool-lambda' },
     { Key: 'ghr:Type', Value: runnerParameters.runnerType },
     { Key: 'ghr:Owner', Value: runnerParameters.runnerOwner },
   ];
 
+  if (runnerParameters.tracingEnabled) {
+    const traceId = tracer.getRootXrayTraceId();
+    tags.push({ Key: 'ghr:trace_id', Value: traceId! });
+  }
+
   let fleet: CreateFleetResult;
   try {
     // see for spec https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_CreateFleet.html

diff --git a/lambdas/functions/control-plane/src/axios/fetch-override.test.ts b/lambdas/functions/control-plane/src/axios/fetch-override.test.ts
@@ -0,0 +1,31 @@
+import axios, { AxiosResponse } from 'axios';
+
+import { axiosFetch } from './fetch-override';
+
+jest.mock('axios');
+type FetchResponse = AxiosResponse & { json: () => string };
+
+describe('axiosFetch', () => {
+  beforeEach(() => {
+    jest.clearAllMocks();
+  });
+  it('should return a promise that resolves with the response data', async () => {
+    // Arrange
+    const url = 'https://example.com';
+    const options = { body: { foo: 'bar' } };
+    const responseData = { data: { baz: 'qux' } };
+    const mockedAxios = axios as unknown as jest.Mock;
+    mockedAxios.mockResolvedValue(responseData);
+
+    // Act
+    const result = (await axiosFetch(url, options)) as FetchResponse;
+
+    // Assert
+    expect(axios).toHaveBeenCalledWith(url, { ...options, data: options.body });
+    expect(result).toEqual({
+      ...responseData,
+      json: expect.any(Function),
+    });
+    expect(result.json()).toEqual(responseData.data);
+  });
+});
diff --git a/lambdas/functions/control-plane/src/axios/fetch-override.ts b/lambdas/functions/control-plane/src/axios/fetch-override.ts
@@ -0,0 +1,19 @@
+import axios, { AxiosRequestConfig, AxiosResponse } from 'axios';
+
+type FetchResponse = AxiosResponse & { json: () => string };
+
+type FetchOptions = AxiosRequestConfig & { body?: object };
+
+// Requests made with fetch are not traced by X-Ray, so we override fetch with axios
+// https://github.com/aws/aws-xray-sdk-node/issues/531
+export const axiosFetch = async (url: string, options: FetchOptions): Promise<FetchResponse> => {
+  const response = await axios(url, { ...options, data: options.body });
+  return new Promise((resolve) => {
+    resolve({
+      ...response,
+      json: () => {
+        return response.data;
+      },
+    });
+  });
+};
diff --git a/lambdas/functions/control-plane/src/gh-auth/gh-auth.test.ts b/lambdas/functions/control-plane/src/gh-auth/gh-auth.test.ts
@@ -95,7 +95,7 @@ ${decryptedValue}`,
 
     // Assert
     expect(mockedCreatAppAuth).toBeCalledTimes(1);
-    expect(mockedCreatAppAuth).toBeCalledWith(authOptions);
+    expect(mockedCreatAppAuth).toBeCalledWith({ ...authOptions, request: expect.anything() });
   });
 
   test('Creates auth object for public GitHub', async () => {
@@ -121,7 +121,7 @@ ${decryptedValue}`,
     expect(getParameter).toBeCalledWith(PARAMETER_GITHUB_APP_KEY_BASE64_NAME);
 
     expect(mockedCreatAppAuth).toBeCalledTimes(1);
-    expect(mockedCreatAppAuth).toBeCalledWith(authOptions);
+    expect(mockedCreatAppAuth).toBeCalledWith({ ...authOptions, request: expect.anything() });
     expect(mockedAuth).toBeCalledWith({ type: authType });
     expect(result.token).toBe(token);
   });

diff --git a/lambdas/functions/control-plane/src/gh-auth/gh-auth.ts b/lambdas/functions/control-plane/src/gh-auth/gh-auth.ts
@@ -13,11 +13,13 @@ import { Octokit } from '@octokit/rest';
 import { createChildLogger } from '@terraform-aws-github-runner/aws-powertools-util';
 import { getParameter } from '@terraform-aws-github-runner/aws-ssm-util';
 
-const logger = createChildLogger('gh-auth');
+import { axiosFetch } from '../axios/fetch-override';
 
+const logger = createChildLogger('gh-auth');
 export async function createOctoClient(token: string, ghesApiUrl = ''): Promise<Octokit> {
   const ocktokitOptions: OctokitOptions = {
     auth: token,
+    request: { fetch: axiosFetch },
   };
   if (ghesApiUrl) {
     ocktokitOptions.baseUrl = ghesApiUrl;
@@ -64,7 +66,12 @@ async function createAuth(installationId: number | undefined, ghesApiUrl: string
   if (ghesApiUrl) {
     authOptions.request = request.defaults({
       baseUrl: ghesApiUrl,
+      request: {
+        fetch: axiosFetch,
+      },
     });
+  } else {
+    authOptions.request = request.defaults({ request: { fetch: axiosFetch } });
   }
   return createAppAuth(authOptions);
 }
diff --git a/lambdas/functions/control-plane/src/lambda.test.ts b/lambdas/functions/control-plane/src/lambda.test.ts
@@ -1,8 +1,8 @@
-import { logger } from '@terraform-aws-github-runner/aws-powertools-util';
+import { captureLambdaHandler, logger } from '@terraform-aws-github-runner/aws-powertools-util';
 import { Context, SQSEvent, SQSRecord } from 'aws-lambda';
 import { mocked } from 'jest-mock';
 
-import { adjustPool, scaleDownHandler, scaleUpHandler, ssmHousekeeper } from './lambda';
+import { addMiddleware, adjustPool, scaleDownHandler, scaleUpHandler, ssmHousekeeper } from './lambda';
 import { adjust } from './pool/pool';
 import ScaleError from './scale-runners/ScaleError';
 import { scaleDown } from './scale-runners/scale-down';
@@ -161,6 +161,14 @@ describe('Adjust pool.', () => {
   });
 });
 
+describe('Test middleware', () => {
+  it('Should have a working middleware', async () => {
+    const mockedLambdaHandler = captureLambdaHandler as unknown as jest.Mock;
+    mockedLambdaHandler.mockReturnValue({ before: jest.fn(), after: jest.fn(), onError: jest.fn() });
+    expect(addMiddleware).not.toThrowError();
+  });
+});
+
 describe('Test ssm housekeeper lambda wrapper.', () => {
   it('Invoke without errors.', async () => {
     const mock = mocked(cleanSSMTokens);