Commit dee9b64

Merge branch 'main' into rlim/add-output-csv-short-read

rzlim08 authored Sep 8, 2023
2 parents 6481544 + 814e255
Showing 94 changed files with 5,169 additions and 1,329 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/wdl-ci-integration.yml
@@ -10,14 +10,14 @@ env:

 jobs:
   cancel-previous:
-    runs-on: self-hosted
+    runs-on: [self-hosted, idseq-dev]
     steps:
       - uses: styfle/cancel-workflow-action@0.9.0
         with:
           access_token: ${{ github.token }}
 
   list-workflow-dirs:
-    runs-on: self-hosted
+    runs-on: [self-hosted, idseq-dev]
     outputs:
       matrix: ${{steps.list_dirs.outputs.matrix}}
     steps:
@@ -29,7 +29,7 @@ jobs:
         run: echo "matrix=$(./scripts/diff-workflows.sh |jq -cnR '[inputs|select(length>0)]')" >> $GITHUB_OUTPUT
 
   wdl-ci:
-    runs-on: self-hosted
+    runs-on: [self-hosted, idseq-dev]
     needs: list-workflow-dirs
     if: ${{ needs['list-workflow-dirs'].outputs.matrix != '[]' && needs['list-workflow-dirs'].outputs.matrix != '' }}
     strategy:
6 changes: 6 additions & 0 deletions .gitignore
@@ -1,2 +1,8 @@
 __pycache__
 pyrightconfig.json
+
+
+# Makefile kludge
+.venv
+.build-*
+.python_dependencies_installed
112 changes: 103 additions & 9 deletions Makefile
@@ -1,18 +1,112 @@
-lint:
+# Makefile for czid-workflows
+
+## OPTIONAL VARIABLES
+WORKFLOW?=short-read-mngs# default needed to build dag-test
+VERSION?=latest
+EXTRA_INPUTS?=
+TASK?=
+
+ifeq ($(WORKFLOW),short-read-mngs)
+TEST_FILE?=workflows/$(WORKFLOW)/test/local_test_viral.yml
+else
+TEST_FILE?=workflows/$(WORKFLOW)/test/local_test.yml
+endif
+
+ifeq ($(TASK),)
+ifneq ($(wildcard $(TEST_FILE)), )
+INPUT?=-i $(TEST_FILE)
+endif
+endif
+
+TASK_CMD := $(if $(TASK), --task $(TASK),)
+
+
+.PHONY: help
+help:
+	@grep -E '^[a-zA-Z_-]+%?:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+
+.PHONY: build
+build: .build-$(WORKFLOW) ## Build docker images for a given workflow. eg. make build WORKFLOW=short-read-mngs
+
+.build-%:
+	./scripts/docker-build.sh workflows/$* -t czid-$*
+	touch .build-$*
+
+.PHONY: rebuild
+rebuild: ## Rebuild docker images for a given workflow. If you change anything in the lib/ directory you will likely need to rebuild.
+	rm .build-$(WORKFLOW) | true
+	$(MAKE) build WORKFLOW=$(WORKFLOW)
+
+.PHONY: pull
+pull: .pull-$(WORKFLOW) ## Pull docker image from public github repository. Faster than build. Possibly less accurate. Defaults to latest eg. make pull WORKFLOW=long-read-mngs
+
+.pull-%:
+	docker pull ghcr.io/chanzuckerberg/czid-workflows/czid-$(WORKFLOW)-public:$(VERSION)
+	docker tag ghcr.io/chanzuckerberg/czid-workflows/czid-$(WORKFLOW)-public:$(VERSION) czid-$(WORKFLOW)
+	touch .build-$*
+
+.PHONY: lint
+lint: ## lint files
 	pre-commit run --all-files
 	flake8 .
 	flake8 --ignore E302,F401,E501,W504,E711,E712,E722,E741 lib/idseq-dag/idseq_dag
 
-publish:
-	scripts/publish.sh
+.PHONY: release
+release:
+	scripts/release.sh
 
-test-%:
-	pytest -v -n `python3 -c 'import multiprocessing as mp; print(max(1,mp.cpu_count()-1))'` --durations=0 --tb=short --log-cli-level=11 workflows/$*/test
+test-%: ## run miniwdl step tests eg. make test-short-read-mngs
+	pytest -v -n $$(( $$(nproc) - 1)) --durations=0 --tb=short --log-cli-level=11 workflows/$*/test
 
-integration-test-%:
-	pytest -v -n `python3 -c 'import multiprocessing as mp; print(max(1,mp.cpu_count()-1))'` --durations=0 --tb=short --log-cli-level=11 workflows/$*/integration_test
+integration-test-%: ## run miniwdl integration tests eg. make integration-test-short-read-mngs
+	pytest -v -n $$(( $$(nproc) - 1)) --durations=0 --tb=short --log-cli-level=11 workflows/$*/integration_test
 
-test:
+.PHONY: test
+test: ## run miniwdl step tests for all workflows eg. make test
 	for i in $$(ls workflows); do $(MAKE) test-$$i; done
 
-.PHONY: lint publish test
+.PHONY: dag-test
+dag-test: build ## run tests for idseq-dag
+	docker run -it -v ${PWD}/lib/idseq-dag:/work -w /work czid-$(WORKFLOW) pytest -s
+
+.PHONY: python-dependencies
+python-dependencies: .python_dependencies_installed # install python dependencies
+
+.python_dependencies_installed:
+	virtualenv -p python3 .venv
+	.venv/bin/pip install -r requirements-dev.txt
+	echo "Run: source .venv/bin/activate"
+	touch .python_dependencies_installed
+
+# TODO: Break this up into 2 functions, one to resolve the RUNFILE and one to run the workflow
+.PHONY: run
+run: build python-dependencies ## run a miniwdl workflow. eg. make run WORKFLOW=consensus-genome. args: WORKFLOW,EXTRA_INPUTS,INPUT,TASK
+	if [ "$(WORKFLOW)" = "short-read-mngs" ]; then \
+		RUNFILE="local_driver.wdl"; \
+	elif [ -f "workflows/$(WORKFLOW)/$(WORKFLOW).wdl" ]; then \
+		RUNFILE="$(WORKFLOW).wdl"; \
+	else \
+		RUNFILE="run.wdl"; \
+	fi; \
+	.venv/bin/miniwdl run workflows/$(WORKFLOW)/$$RUNFILE docker_image_id=czid-$(WORKFLOW) $(EXTRA_INPUTS) $(INPUT) $(TASK_CMD)
+
+.PHONY: miniwdl-explore
+miniwdl-explore: ## !ADVANCED! explore a previous miniwdl workflow run in the cli. eg. make miniwdl-explore OUTPATH='/mnt/path/to/output/'
+	cat $(OUTPATH)/inputs.json | jq ' [values[]] | flatten | .[] | tostring | select(startswith("s3"))' | xargs -I {} aws s3 cp "{}" $(OUTPATH)/work/_miniwdl_inputs/0/
+	cat $(OUTPATH)/inputs.json | jq ' [values[]] | flatten | .[] | tostring | select(startswith("/"))' | xargs -I {} cp "{}" $(OUTPATH)/work/_miniwdl_inputs/0/
+	docker run -it --entrypoint bash -w /mnt/miniwdl_task_container/work -v$(OUTPATH):/mnt/miniwdl_task_container czid-$(WORKFLOW)
+
+.PHONY: ls
+ls: ## list workflows
+	@ls -1 workflows/
+
+.PHONY: check
+check: python-dependencies ## run miniwdl check on the given workflow
+	if [ "$(WORKFLOW)" = "short-read-mngs" ]; then \
+		RUNFILE="local_driver.wdl"; \
+	elif [ -f "workflows/$(WORKFLOW)/$(WORKFLOW).wdl" ]; then \
+		RUNFILE="$(WORKFLOW).wdl"; \
+	else \
+		RUNFILE="run.wdl"; \
+	fi; \
+	.venv/bin/miniwdl check workflows/$(WORKFLOW)/$$RUNFILE
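
A note on the new `run` target: setting `TASK` turns a full-pipeline run into a single-task run via the derived `TASK_CMD`, and it also skips the default `INPUT` test file, so inputs must be supplied explicitly. A sketch (the task name and input path are illustrative assumptions, not from this commit):

```
# run a single WDL task instead of the whole pipeline
make run WORKFLOW=short-read-mngs TASK=RunValidateInput \
    EXTRA_INPUTS='input_fastq=/path/to/input.fastq'
```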
57 changes: 55 additions & 2 deletions README.md
@@ -17,14 +17,67 @@ Currently we have 5 main workflows. The details of each pipeline are in a README
* [short-read-mngs](workflows/short-read-mngs/README.md)
* [consensus-genome](workflows/consensus-genome/README.md)
* [phylotree-ng](workflows/phylotree-ng/README.md)
-* long-read-mngs (Beta)
-* amr (Beta)
+* [long-read-mngs](workflows/long-read-mngs/README.md)
+* [amr](workflows/amr/README.md)

## Running these workflows
This repository contains [WDL](https://openwdl.org/) workflows that the [CZ ID](https://czid.org) platform uses in
production. See [Running WDL workflows locally](https://github.com/chanzuckerberg/czid-workflows/wiki/Running-WDL-workflows-locally)
to get started with them.

### System Requirements
The system requirements differ for each workflow and depend on the databases being used. For example, running the short-read-mngs workflow with the full NT and NR databases requires an instance with >1TB of disk space and >100GB of memory. Running other workflows (e.g. consensus-genome, amr) requires much less space.

If you are using an ARM Mac, try following [these setup instructions](RunningCZIDWorkflowsOnARMMacs.md).

### Software requirements
* docker with buildx support (version >= 19.03)
* python3
* virtualenv
* requirements-dev.txt (to install these automatically, run `make python-dependencies`)

### Quick Setup
To get set up, first set the workflow you want to run with

```export WORKFLOW=<workflow-name>``` e.g.

```export WORKFLOW=amr```

You can see the available workflows with `make ls`.
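
For reference, the listing should resemble the workflow set above (illustrative output; the directory may contain additional entries):

```
$ make ls
amr
consensus-genome
long-read-mngs
phylotree-ng
short-read-mngs
```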

Either `build` or `pull` the workflow docker image with

```make pull ## The faster option``` or

```make build ## The slower option, but necessary if you're modifying the docker container```

### Running a workflow
Run a workflow with

```make run```

This simply runs the ```miniwdl run path_to_wdl.wdl``` command with some defaults.
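
For the consensus-genome workflow, for instance, the resolved command looks roughly like this (a sketch, assuming its entrypoint is `run.wdl` and the default test inputs exist):

```
.venv/bin/miniwdl run workflows/consensus-genome/run.wdl \
    docker_image_id=czid-consensus-genome \
    -i workflows/consensus-genome/test/local_test.yml
```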

Each workflow has a number of required and optional inputs, and all require at least an input file (usually a fastq). Default inputs are set from the `workflows/<workflow-name>/test/local_test.yml` file. These may or may not be accurate for every analysis. You can override these defaults and add your own with:

```make run INPUT='-i your_file_here.yml'```

If you're happy with the defaults, you can pass additional inputs through to the `miniwdl` command using

```make run EXTRA_INPUTS='input_fastq=/path/to/input.fastq' ```

### An example
Let's say I want to run the consensus-genome workflow. I would run the following:

```
export WORKFLOW=consensus-genome
make pull # pull the latest docker container from github packages
make python-dependencies # create a .venv and install the requirements-dev.txt dependencies
make run EXTRA_INPUTS='fastqs_0=workflows/consensus-genome/test/sample_sars-cov-2_paired_r1.fastq.gz \
fastqs_1=workflows/consensus-genome/test/sample_sars-cov-2_paired_r2.fastq.gz \
technology="Illumina" \
sample="my_sample_name"'
```
## CI/CD

We use GitHub Actions for CI/CD. Lint and unit tests run on GitHub from jobs in `.github/workflows/wdl-ci.yml`.
39 changes: 39 additions & 0 deletions RunningCZIDWorkflowsOnARMMacs.md
@@ -0,0 +1,39 @@
## Running czid-workflows on Apple Silicon (M1/M2) Macs

Building the docker containers for the workflows is challenging on machines that don't use the `x86` architecture. Many of the dependencies we currently use don't offer builds for other architectures.

The following is how to set up a virtual machine using `colima` in which you can build and run docker containers locally on machines that use the ARM CPU architecture.


### Install Colima

`brew install colima`

Go to the czid-workflows repo

`cd ~/czid-workflows`

### Boot and mount the colima VM
```
# uses 4GB of memory & 4 CPUs, change this to suit your needs
colima start --mount $PWD:/work:w -m 4 -c 4
```

### Set up the VM

SSH into the VM

`colima ssh`

In the VM run:

`sudo apk add py3-virtualenv gcc python3-dev musl-dev linux-headers make`

Go to where the `czid-workflows` repo is mounted

`cd /work`

Then you should be able to build and run the docker containers as normal:

`make build WORKFLOW=minimap2`
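
As a quick sanity check (a sketch; this assumes colima's default docker runtime inside its Alpine-based VM):

```
# run inside `colima ssh`
uname -m                                  # expect aarch64 on an M1/M2 host
docker info --format '{{.Architecture}}'  # docker should report the same
```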
101 changes: 101 additions & 0 deletions lib/bin/filter_clipped_alignments.py
@@ -0,0 +1,101 @@
import csv
import re
import argparse
import sys

# the SAM file input has the full sequence which can be very long
# increase the csv field size limit to ensure they will fit
_field_size_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_size_limit)
        break
    # if the limit is too large for the platform's C long, halve it and try again
    except OverflowError:
        _field_size_limit = int(_field_size_limit >> 1)

front_pattern = re.compile(r"^\d+[SH]")
end_pattern = re.compile(r"\d+[SH]$")


def split_cigar(cigar_str):
    """splits the count from the type in a cigar string"""
    return int(cigar_str[:-1]), cigar_str[-1]


def get_clipping(cigar, alength, pattern):
    """Get the amount of clipping from the front or end of the cigar string"""
    clipping = 0

    match = re.search(pattern, cigar)
    if match:
        clip_length, clip_type = split_cigar(match.group())
        clipping += clip_length
        if clip_type == "H":
            alength += clip_length

    return clipping, alength


UNMAPPED_READ = [
    "read_id",  # QNAME
    "4",  # FLAG
    "*",  # RNAME
    "0",  # POS
    "0",  # MAPQ
    "*",  # CIGAR
    "*",  # MRNM
    "0",  # MPOS
    "0",  # ISIZE
    "AAAA",  # SEQ
    "????",  # QUAL
]


def main(reads_to_contig_sam, output_file, max_percent):
    with open(reads_to_contig_sam, "r") as csv_file, open(
        output_file, mode="w", newline=""
    ) as output_file:
        csv_reader = csv.reader(csv_file, delimiter="\t", quotechar="\t")
        output = csv.writer(
            output_file,
            delimiter="\t",
            lineterminator="\n",
            quotechar="\t",
            quoting=csv.QUOTE_NONE,
        )

        # Loop through each row in the CSV file
        for row in csv_reader:
            if len(row) > 1 and len(row[0]) > 1 and row[0][0] == "@":
                output.writerow(row)
            elif row[5] != "*":
                cigar = row[5]
                alength = len(row[9])
                front_clipping, alength = get_clipping(cigar, alength, front_pattern)
                end_clipping, alength = get_clipping(cigar, alength, end_pattern)
                percent_clipped = ((front_clipping + end_clipping) / alength) * 100
                if percent_clipped < max_percent:
                    output.writerow(row)
                else:
                    unmapped_read = UNMAPPED_READ.copy()
                    unmapped_read[0], unmapped_read[9], unmapped_read[10] = (
                        row[0],
                        row[9],
                        row[10],
                    )
                    output.writerow(unmapped_read)
            else:
                output.writerow(row)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="Filter clipped alignments",
        description="Filter out alignments from minimap2 that are too heavily clipped",
    )
    parser.add_argument("input_file")
    parser.add_argument("output_file")
    parser.add_argument("-m", "--max_percent", default=50.0, type=float)
    args = parser.parse_args()
    main(args.input_file, args.output_file, args.max_percent)
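
To make the clipping rule concrete: a read with CIGAR `10S80M10S` and a 100 nt SEQ is (10+10)/100 = 20% clipped, so it survives the default 50% cutoff; hard-clipped bases (`H`) are added back into the denominator because they are trimmed out of SEQ. A sketch of an invocation (file names are illustrative, not from this commit):

```
# replace alignments clipped >= 30% with unmapped placeholder records
python3 lib/bin/filter_clipped_alignments.py \
    reads_to_contigs.sam filtered.sam --max_percent 30
```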