Commit dee9b64

Merge branch 'main' into rlim/add-output-csv-short-read

rzlim08 authored Sep 8, 2023
2 parents 6481544 + 814e255
Showing 94 changed files with 5,169 additions and 1,329 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/wdl-ci-integration.yml
@@ -10,14 +10,14 @@ env:

 jobs:
   cancel-previous:
-    runs-on: self-hosted
+    runs-on: [self-hosted, idseq-dev]
     steps:
       - uses: styfle/cancel-workflow-action@0.9.0
         with:
           access_token: ${{ github.token }}
 
   list-workflow-dirs:
-    runs-on: self-hosted
+    runs-on: [self-hosted, idseq-dev]
     outputs:
       matrix: ${{steps.list_dirs.outputs.matrix}}
     steps:
@@ -29,7 +29,7 @@ jobs:
         run: echo "matrix=$(./scripts/diff-workflows.sh |jq -cnR '[inputs|select(length>0)]')" >> $GITHUB_OUTPUT
 
   wdl-ci:
-    runs-on: self-hosted
+    runs-on: [self-hosted, idseq-dev]
     needs: list-workflow-dirs
     if: ${{ needs['list-workflow-dirs'].outputs.matrix != '[]' && needs['list-workflow-dirs'].outputs.matrix != '' }}
     strategy:
6 changes: 6 additions & 0 deletions .gitignore
@@ -1,2 +1,8 @@
 __pycache__
 pyrightconfig.json
+
+
+# Makefile kludge
+.venv
+.build-*
+.python_dependencies_installed
112 changes: 103 additions & 9 deletions Makefile
@@ -1,18 +1,112 @@
-lint:
+# Makefile for czid-workflows
+
+## OPTIONAL VARIABLES
+WORKFLOW?=short-read-mngs# default needed to build dag-test
+VERSION?=latest
+EXTRA_INPUTS?=
+TASK?=
+
+ifeq ($(WORKFLOW),short-read-mngs)
+TEST_FILE?=workflows/$(WORKFLOW)/test/local_test_viral.yml
+else
+TEST_FILE?=workflows/$(WORKFLOW)/test/local_test.yml
+endif
+
+ifeq ($(TASK),)
+ifneq ($(wildcard $(TEST_FILE)), )
+INPUT?=-i $(TEST_FILE)
+endif
+endif
+
+TASK_CMD := $(if $(TASK), --task $(TASK),)
+
+
+.PHONY: help
+help:
+	@grep -E '^[a-zA-Z_-]+%?:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+
+.PHONY: build
+build: .build-$(WORKFLOW) ## Build docker images for a given workflow. eg. make build WORKFLOW=short-read-mngs
+
+.build-%:
+	./scripts/docker-build.sh workflows/$* -t czid-$*
+	touch .build-$*
+
+.PHONY: rebuild
+rebuild: ## Rebuild docker images for a given workflow. If you change anything in the lib/ directory you will likely need to rebuild.
+	rm .build-$(WORKFLOW) | true
+	$(MAKE) build WORKFLOW=$(WORKFLOW)
+
+.PHONY: pull
+pull: .pull-$(WORKFLOW) ## Pull docker image from public github repository. Faster than build. Possibly less accurate. Defaults to latest eg. make pull WORKFLOW=long-read-mngs
+
+.pull-%:
+	docker pull ghcr.io/chanzuckerberg/czid-workflows/czid-$(WORKFLOW)-public:$(VERSION)
+	docker tag ghcr.io/chanzuckerberg/czid-workflows/czid-$(WORKFLOW)-public:$(VERSION) czid-$(WORKFLOW)
+	touch .build-$*
+
+.PHONY: lint
+lint: ## lint files
 	pre-commit run --all-files
 	flake8 .
 	flake8 --ignore E302,F401,E501,W504,E711,E712,E722,E741 lib/idseq-dag/idseq_dag
 
-publish:
-	scripts/publish.sh
+.PHONY: release
+release:
+	scripts/release.sh
 
-test-%:
-	pytest -v -n `python3 -c 'import multiprocessing as mp; print(max(1,mp.cpu_count()-1))'` --durations=0 --tb=short --log-cli-level=11 workflows/$*/test
+test-%: ## run miniwdl step tests eg. make test-short-read-mngs
+	pytest -v -n $$(( $$(nproc) - 1)) --durations=0 --tb=short --log-cli-level=11 workflows/$*/test
 
-integration-test-%:
-	pytest -v -n `python3 -c 'import multiprocessing as mp; print(max(1,mp.cpu_count()-1))'` --durations=0 --tb=short --log-cli-level=11 workflows/$*/integration_test
+integration-test-%: ## run miniwdl integration tests eg. make integration-test-short-read-mngs
+	pytest -v -n $$(( $$(nproc) - 1)) --durations=0 --tb=short --log-cli-level=11 workflows/$*/integration_test
 
-test:
+.PHONY: test
+test: ## run miniwdl step tests for all workflows eg. make test
 	for i in $$(ls workflows); do $(MAKE) test-$$i; done
 
-.PHONY: lint publish test
+.PHONY: dag-test
+dag-test: build ## run tests for idseq-dag
+	docker run -it -v ${PWD}/lib/idseq-dag:/work -w /work czid-$(WORKFLOW) pytest -s
+
+.PHONY: python-dependencies
+python-dependencies: .python_dependencies_installed # install python dependencies
+
+.python_dependencies_installed:
+	virtualenv -p python3 .venv
+	.venv/bin/pip install -r requirements-dev.txt
+	echo "Run: source .venv/bin/activate"
+	touch .python_dependencies_installed
+
+# TODO: Break this up into 2 functions, one to resolve the RUNFILE and one to run the workflow
+.PHONY: run
+run: build python-dependencies ## run a miniwdl workflow. eg. make run WORKFLOW=consensus-genome. args: WORKFLOW,EXTRA_INPUTS,INPUT,TASK
+	if [ "$(WORKFLOW)" = "short-read-mngs" ]; then \
+		RUNFILE="local_driver.wdl"; \
+	elif [ -f "workflows/$(WORKFLOW)/$(WORKFLOW).wdl" ]; then \
+		RUNFILE="$(WORKFLOW).wdl"; \
+	else \
+		RUNFILE="run.wdl"; \
+	fi; \
+	.venv/bin/miniwdl run workflows/$(WORKFLOW)/$$RUNFILE docker_image_id=czid-$(WORKFLOW) $(EXTRA_INPUTS) $(INPUT) $(TASK_CMD)
+
+.PHONY: miniwdl-explore
+miniwdl-explore: ## !ADVANCED! explore a previous miniwdl workflow run in the cli. eg. make miniwdl-explore OUTPATH='/mnt/path/to/output/'
+	cat $(OUTPATH)/inputs.json | jq ' [values[]] | flatten | .[] | tostring | select(startswith("s3"))' | xargs -I {} aws s3 cp "{}" $(OUTPATH)/work/_miniwdl_inputs/0/
+	cat $(OUTPATH)/inputs.json | jq ' [values[]] | flatten | .[] | tostring | select(startswith("/"))' | xargs -I {} cp "{}" $(OUTPATH)/work/_miniwdl_inputs/0/
+	docker run -it --entrypoint bash -w /mnt/miniwdl_task_container/work -v$(OUTPATH):/mnt/miniwdl_task_container czid-$(WORKFLOW)
+
+.PHONY: ls
+ls: ## list workflows
+	@ls -1 workflows/
+
+.PHONY: check
+check: python-dependencies ## run miniwdl check on the given workflow
+	if [ "$(WORKFLOW)" = "short-read-mngs" ]; then \
+		RUNFILE="local_driver.wdl"; \
+	elif [ -f "workflows/$(WORKFLOW)/$(WORKFLOW).wdl" ]; then \
+		RUNFILE="$(WORKFLOW).wdl"; \
+	else \
+		RUNFILE="run.wdl"; \
+	fi; \
+	.venv/bin/miniwdl check workflows/$(WORKFLOW)/$$RUNFILE
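
A note on the new `run` target: setting `TASK` turns a full-pipeline run into a single-task run via the derived `TASK_CMD`, and it also skips the default `INPUT` test file, so inputs must be supplied explicitly. A sketch (the task name and input path are illustrative assumptions, not from this commit):

```
# run a single WDL task instead of the whole pipeline
make run WORKFLOW=short-read-mngs TASK=RunValidateInput \
    EXTRA_INPUTS='input_fastq=/path/to/input.fastq'
```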
57 changes: 55 additions & 2 deletions README.md
@@ -17,14 +17,67 @@ Currently we have 5 main workflows. The details of each pipeline are in a README
* [short-read-mngs](workflows/short-read-mngs/README.md)
* [consensus-genome](workflows/consensus-genome/README.md)
* [phylotree-ng](workflows/phylotree-ng/README.md)
-* long-read-mngs (Beta)
-* amr (Beta)
+* [long-read-mngs](workflows/long-read-mngs/README.md)
+* [amr](workflows/amr/README.md)

## Running these workflows
This repository contains [WDL](https://openwdl.org/) workflows that the [CZ ID](https://czid.org) platform uses in
production. See [Running WDL workflows locally](https://github.com/chanzuckerberg/czid-workflows/wiki/Running-WDL-workflows-locally)
to get started with them.

### System Requirements
The system requirements differ for each workflow and depend on the databases being used. For example, running the short-read-mngs workflow with the full NT and NR databases requires an instance with >1TB of disk space and >100GB of memory. Running other workflows (e.g. consensus-genome, amr) requires much less space.

If you are using an ARM Mac, try following [these setup instructions](RunningCZIDWorkflowsOnARMMacs.md).

### Software requirements
* docker with buildx support (version >= 19.03)
* python3
* virtualenv
* requirements-dev.txt (to install these automatically, run `make python-dependencies`)

### Quick Setup
To get set up, first set the workflow you want to run with

```export WORKFLOW=<workflow-name>``` e.g.

```export WORKFLOW=amr```

You can see the available workflows with `make ls`.
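
For reference, the listing should resemble the workflow set above (illustrative output; the directory may contain additional entries):

```
$ make ls
amr
consensus-genome
long-read-mngs
phylotree-ng
short-read-mngs
```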

Either `build` or `pull` the workflow docker image with

```make pull ## The faster option``` or

```make build ## The slower option, but necessary if you're modifying the docker container```

### Running a workflow
Run a workflow with

```make run```

This simply runs the ```miniwdl run path_to_wdl.wdl``` command with some defaults.
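
For the consensus-genome workflow, for instance, the resolved command looks roughly like this (a sketch, assuming its entrypoint is `run.wdl` and the default test inputs exist):

```
.venv/bin/miniwdl run workflows/consensus-genome/run.wdl \
    docker_image_id=czid-consensus-genome \
    -i workflows/consensus-genome/test/local_test.yml
```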

Each workflow has a number of required and optional inputs, and all require at least an input file (usually a fastq). Default inputs are set from the `workflows/<workflow-name>/test/local_test.yml` file. These may or may not be accurate for every analysis. You can override these defaults and add your own with:

```make run INPUT='-i your_file_here.yml'```

If you're happy with the defaults, you can pass additional inputs through to the `miniwdl` command using

```make run EXTRA_INPUTS='input_fastq=/path/to/input.fastq' ```

### An example
Let's say I want to run the consensus-genome workflow. I would run the following:

```
export WORKFLOW=consensus-genome
make pull # pull the latest docker container from github packages
make python-dependencies # create a .venv and install the requirements-dev.txt dependencies
make run EXTRA_INPUTS='fastqs_0=workflows/consensus-genome/test/sample_sars-cov-2_paired_r1.fastq.gz \
fastqs_1=workflows/consensus-genome/test/sample_sars-cov-2_paired_r2.fastq.gz \
technology="Illumina" \
sample="my_sample_name"'
```
## CI/CD

We use GitHub Actions for CI/CD. Lint and unit tests run on GitHub from jobs in `.github/workflows/wdl-ci.yml`.
39 changes: 39 additions & 0 deletions RunningCZIDWorkflowsOnARMMacs.md
@@ -0,0 +1,39 @@
## Running czid-workflows on Apple Silicon (M1/M2) Macs

Building the docker containers for the workflows is challenging on machines that don't use the `x86` architecture. Many of the dependencies we currently use don't offer builds for other architectures.

The following is how to set up a virtual machine using `colima` in which you can build and run docker containers locally on machines that use the ARM CPU architecture.


### Install Colima

`brew install colima`

Go to the czid-workflows repo

`cd ~/czid-workflows`

### Boot and mount the colima VM
```
# uses 4GB of memory & 4 CPUs, change this to suit your needs
colima start --mount $PWD:/work:w -m 4 -c 4
```

### Set up the VM

SSH into the VM

`colima ssh`

In the VM run:

`sudo apk add py3-virtualenv gcc python3-dev musl-dev linux-headers make`

Go to where the `czid-workflows` repo is mounted

`cd /work`

Then you should be able to build and run the docker containers as normal:

`make build WORKFLOW=minimap2`
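
As a quick sanity check (a sketch; this assumes colima's default docker runtime inside its Alpine-based VM):

```
# run inside `colima ssh`
uname -m                                  # expect aarch64 on an M1/M2 host
docker info --format '{{.Architecture}}'  # docker should report the same
```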
101 changes: 101 additions & 0 deletions lib/bin/filter_clipped_alignments.py
@@ -0,0 +1,101 @@
import csv
import re
import argparse
import sys

# the SAM file input has the full sequence which can be very long
# increase the csv field size limit to ensure they will fit
_field_size_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_size_limit)
        break
    # if the limit is too large for the platform's C long, halve it and try again
    except OverflowError:
        _field_size_limit = int(_field_size_limit >> 1)

front_pattern = re.compile(r"^\d+[SH]")
end_pattern = re.compile(r"\d+[SH]$")


def split_cigar(cigar_str):
    """splits the count from the type in a cigar string"""
    return int(cigar_str[:-1]), cigar_str[-1]


def get_clipping(cigar, alength, pattern):
    """Get the amount of clipping from the front or end of the cigar string"""
    clipping = 0

    match = re.search(pattern, cigar)
    if match:
        clip_length, clip_type = split_cigar(match.group())
        clipping += clip_length
        if clip_type == "H":
            alength += clip_length

    return clipping, alength


UNMAPPED_READ = [
    "read_id",  # QNAME
    "4",  # FLAG
    "*",  # RNAME
    "0",  # POS
    "0",  # MAPQ
    "*",  # CIGAR
    "*",  # MRNM
    "0",  # MPOS
    "0",  # ISIZE
    "AAAA",  # SEQ
    "????",  # QUAL
]


def main(reads_to_contig_sam, output_file, max_percent):
    with open(reads_to_contig_sam, "r") as csv_file, open(
        output_file, mode="w", newline=""
    ) as output_file:
        csv_reader = csv.reader(csv_file, delimiter="\t", quotechar="\t")
        output = csv.writer(
            output_file,
            delimiter="\t",
            lineterminator="\n",
            quotechar="\t",
            quoting=csv.QUOTE_NONE,
        )

        # Loop through each row in the CSV file
        for row in csv_reader:
            if len(row) > 1 and len(row[0]) > 1 and row[0][0] == "@":
                output.writerow(row)
            elif row[5] != "*":
                cigar = row[5]
                alength = len(row[9])
                front_clipping, alength = get_clipping(cigar, alength, front_pattern)
                end_clipping, alength = get_clipping(cigar, alength, end_pattern)
                percent_clipped = ((front_clipping + end_clipping) / alength) * 100
                if percent_clipped < max_percent:
                    output.writerow(row)
                else:
                    unmapped_read = UNMAPPED_READ.copy()
                    unmapped_read[0], unmapped_read[9], unmapped_read[10] = (
                        row[0],
                        row[9],
                        row[10],
                    )
                    output.writerow(unmapped_read)
            else:
                output.writerow(row)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="Filter clipped alignments",
        description="Filter out alignments from minimap2 that are too heavily clipped",
    )
    parser.add_argument("input_file")
    parser.add_argument("output_file")
    parser.add_argument("-m", "--max_percent", default=50.0, type=float)
    args = parser.parse_args()
    main(args.input_file, args.output_file, args.max_percent)
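
To make the clipping rule concrete: a read with CIGAR `10S80M10S` and a 100 nt SEQ is (10+10)/100 = 20% clipped, so it survives the default 50% cutoff; hard-clipped bases (`H`) are added back into the denominator because they are trimmed out of SEQ. A sketch of an invocation (file names are illustrative, not from this commit):

```
# replace alignments clipped >= 30% with unmapped placeholder records
python3 lib/bin/filter_clipped_alignments.py \
    reads_to_contigs.sam filtered.sam --max_percent 30
```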