Commit: Merge branch 'main' into rlim/add-output-csv-short-read
Showing 94 changed files with 5,169 additions and 1,329 deletions.
@@ -1,2 +1,8 @@
__pycache__
pyrightconfig.json

# Makefile kludge
.venv
.build-*
.python_dependencies_installed
@@ -1,18 +1,112 @@
lint:
# Makefile for czid-workflows

## OPTIONAL VARIABLES
WORKFLOW?=short-read-mngs# default needed to build dag-test
VERSION?=latest
EXTRA_INPUTS?=
TASK?=

ifeq ($(WORKFLOW),short-read-mngs)
TEST_FILE?=workflows/$(WORKFLOW)/test/local_test_viral.yml
else
TEST_FILE?=workflows/$(WORKFLOW)/test/local_test.yml
endif

ifeq ($(TASK),)
ifneq ($(wildcard $(TEST_FILE)), )
INPUT?=-i $(TEST_FILE)
endif
endif

TASK_CMD := $(if $(TASK), --task $(TASK),)


.PHONY: help
help:
	@grep -E '^[a-zA-Z_-]+%?:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'

.PHONY: build
build: .build-$(WORKFLOW) ## Build docker images for a given workflow. eg. make build WORKFLOW=short-read-mngs

.build-%:
	./scripts/docker-build.sh workflows/$* -t czid-$*
	touch .build-$*

.PHONY: rebuild
rebuild: ## Rebuild docker images for a given workflow. If you change anything in the lib/ directory you will likely need to rebuild.
	rm .build-$(WORKFLOW) | true
	$(MAKE) build WORKFLOW=$(WORKFLOW)

.PHONY: pull
pull: .pull-$(WORKFLOW) ## Pull docker image from public github repository. Faster than build. Possibly less accurate. Defaults to latest eg. make pull WORKFLOW=long-read-mngs

.pull-%:
	docker pull ghcr.io/chanzuckerberg/czid-workflows/czid-$(WORKFLOW)-public:$(VERSION)
	docker tag ghcr.io/chanzuckerberg/czid-workflows/czid-$(WORKFLOW)-public:$(VERSION) czid-$(WORKFLOW)
	touch .build-$*

.PHONY: lint
lint: ## lint files
	pre-commit run --all-files
	flake8 .
	flake8 --ignore E302,F401,E501,W504,E711,E712,E722,E741 lib/idseq-dag/idseq_dag

publish:
	scripts/publish.sh
.PHONY: release
release:
	scripts/release.sh

test-%:
	pytest -v -n `python3 -c 'import multiprocessing as mp; print(max(1,mp.cpu_count()-1))'` --durations=0 --tb=short --log-cli-level=11 workflows/$*/test
test-%: ## run miniwdl step tests eg. make test-short-read-mngs
	pytest -v -n $$(( $$(nproc) - 1)) --durations=0 --tb=short --log-cli-level=11 workflows/$*/test

integration-test-%:
	pytest -v -n `python3 -c 'import multiprocessing as mp; print(max(1,mp.cpu_count()-1))'` --durations=0 --tb=short --log-cli-level=11 workflows/$*/integration_test
integration-test-%: ## run miniwdl integration tests eg. make integration-test-short-read-mngs
	pytest -v -n $$(( $$(nproc) - 1)) --durations=0 --tb=short --log-cli-level=11 workflows/$*/integration_test

test:
.PHONY: test
test: ## run miniwdl step tests for all workflows eg. make test
	for i in $$(ls workflows); do $(MAKE) test-$$i; done

.PHONY: lint publish test
.PHONY: dag-test
dag-test: build ## run tests for idseq-dag
	docker run -it -v ${PWD}/lib/idseq-dag:/work -w /work czid-$(WORKFLOW) pytest -s

.PHONY: python-dependencies
python-dependencies: .python_dependencies_installed # install python dependencies

.python_dependencies_installed:
	virtualenv -p python3 .venv
	.venv/bin/pip install -r requirements-dev.txt
	echo "Run: source .venv/bin/activate"
	touch .python_dependencies_installed

# TODO: Break this up into 2 functions, one to resolve the RUNFILE and one to run the workflow
.PHONY: run
run: build python-dependencies ## run a miniwdl workflow. eg. make run WORKFLOW=consensus-genome. args: WORKFLOW,EXTRA_INPUT,INPUT,TASK_CMD
	if [ "$(WORKFLOW)" = "short-read-mngs" ]; then \
		RUNFILE="local_driver.wdl"; \
	elif [ -f "workflows/$(WORKFLOW)/$(WORKFLOW).wdl" ]; then \
		RUNFILE="$(WORKFLOW).wdl"; \
	else \
		RUNFILE="run.wdl"; \
	fi; \
	.venv/bin/miniwdl run workflows/$(WORKFLOW)/$$RUNFILE docker_image_id=czid-$(WORKFLOW) $(EXTRA_INPUTS) $(INPUT) $(TASK_CMD)

.PHONY: miniwdl-explore
miniwdl-explore: ## !ADVANCED! explore a previous miniwdl workflow run in the cli. eg. make miniwdl-explore OUTPATH='/mnt/path/to/output/'
	cat $(OUTPATH)/inputs.json | jq ' [values[]] | flatten | .[] | tostring | select(startswith("s3"))' | xargs -I {} aws s3 cp "{}" $(OUTPATH)/work/_miniwdl_inputs/0/
	cat $(OUTPATH)/inputs.json | jq ' [values[]] | flatten | .[] | tostring | select(startswith("/"))' | xargs -I {} cp "{}" $(OUTPATH)/work/_miniwdl_inputs/0/
	docker run -it --entrypoint bash -w /mnt/miniwdl_task_container/work -v$(OUTPATH):/mnt/miniwdl_task_container czid-$(WORKFLOW)

.PHONY: ls
ls: ## list workflows
	@ls -1 workflows/

.PHONY: check
check: python-dependencies ## run miniwdl check on the given workflow
	if [ "$(WORKFLOW)" = "short-read-mngs" ]; then \
		RUNFILE="local_driver.wdl"; \
	elif [ -f "workflows/$(WORKFLOW)/$(WORKFLOW).wdl" ]; then \
		RUNFILE="$(WORKFLOW).wdl"; \
	else \
		RUNFILE="run.wdl"; \
	fi; \
	.venv/bin/miniwdl check workflows/$(WORKFLOW)/$$RUNFILE
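For orientation, the help annotations in the Makefile above imply invocations along these lines (a sketch based only on the targets and example workflow names given in the Makefile's own `##` comments):

```
# list available workflows and annotated targets
make ls
make help

# build (or pull) the Docker image for one workflow
make build WORKFLOW=short-read-mngs
make pull WORKFLOW=long-read-mngs VERSION=latest

# run a workflow locally with miniwdl, lint, and run step tests
make run WORKFLOW=consensus-genome
make lint
make test-short-read-mngs

# validate a workflow's WDL without running it
make check WORKFLOW=consensus-genome
```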
@@ -0,0 +1,39 @@
## Running czid-workflows on Apple Silicon (M1/M2) Macs

Building the Docker containers for the workflows is challenging on machines that don't use the `x86` architecture, because many of the dependencies we currently rely on cannot be built for other architectures.

The following shows how to set up a virtual machine with `colima` in which you can build and run the Docker containers locally on machines with an ARM CPU.

### Install Colima

`brew install colima`

Go to the czid-workflows repo:

`cd ~/czid-workflows`

### Boot and mount the colima VM
```
# uses 4GB of memory & 4 CPUs, change this to suit your needs
colima start --mount $PWD:/work:w -m 4 -c 4
```

### Set up the VM

SSH into the VM:

`colima ssh`

In the VM run:

`sudo apk add py3-virtualenv gcc python3-dev musl-dev linux-headers make`

Go to where the `czid-workflows` repo is mounted:

`cd /work`

Then you should be able to build and run the Docker containers as usual:

`make build WORKFLOW=minimap2`
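Putting the steps above together, the end-to-end setup looks roughly like this (a sketch; the memory/CPU sizes and the `minimap2` workflow are just the examples used in the doc):

```
# on the Mac host
brew install colima
cd ~/czid-workflows
colima start --mount $PWD:/work:w -m 4 -c 4
colima ssh

# inside the colima VM
sudo apk add py3-virtualenv gcc python3-dev musl-dev linux-headers make
cd /work
make build WORKFLOW=minimap2
```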
@@ -0,0 +1,101 @@
import csv
import re
import argparse
import sys

# the SAM file input has the full sequence which can be very long
# increase the csv field size limit to ensure they will fit
_field_size_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_size_limit)
        break
    # if the size limit is too high make it one bit smaller and try again
    except OverflowError:
        _field_size_limit = int(_field_size_limit >> 1)

front_pattern = re.compile(r"^\d+[SH]")
end_pattern = re.compile(r"\d+[SH]$")


def split_cigar(cigar_str):
    """splits the count from type in a cigar string"""
    return int(cigar_str[:-1]), cigar_str[-1]


def get_clipping(cigar, alength, pattern):
    """Get the amount of clipping from the front or end of the cigar string"""
    clipping = 0

    match = re.search(pattern, cigar)
    if match:
        clip_length, clip_type = split_cigar(match.group())
        clipping += clip_length
        if clip_type == "H":
            alength += clip_length

    return clipping, alength


UNMAPPED_READ = [
    "read_id",  # QNAME
    "4",  # FLAG
    "*",  # RNAME
    "0",  # POS
    "0",  # MAPQ
    "*",  # CIGAR
    "*",  # MRNM
    "0",  # MPOS
    "0",  # ISIZE
    "AAAA",  # SEQ
    "????",  # QUAL
]


def main(reads_to_contig_sam, output_file, max_percent):
    with open(reads_to_contig_sam, "r") as csv_file, open(
        output_file, mode="w", newline=""
    ) as output_file:
        csv_reader = csv.reader(csv_file, delimiter="\t", quotechar="\t")
        output = csv.writer(
            output_file,
            delimiter="\t",
            lineterminator="\n",
            quotechar="\t",
            quoting=csv.QUOTE_NONE,
        )

        # Loop through each row in the CSV file
        for row in csv_reader:
            if len(row) > 1 and len(row[0]) > 1 and row[0][0] == "@":
                output.writerow(row)
            elif row[5] != "*":
                cigar = row[5]
                alength = len(row[9])
                front_clipping, alength = get_clipping(cigar, alength, front_pattern)
                end_clipping, alength = get_clipping(cigar, alength, end_pattern)
                percent_clipped = ((front_clipping + end_clipping) / alength) * 100
                if percent_clipped < max_percent:
                    output.writerow(row)
                else:
                    unmapped_read = UNMAPPED_READ.copy()
                    unmapped_read[0], unmapped_read[9], unmapped_read[10] = (
                        row[0],
                        row[9],
                        row[10],
                    )
                    output.writerow(unmapped_read)
            else:
                output.writerow(row)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="Filter clipped alignments",
        description="Filter out alignments from minimap2 that are too heavily clipped",
    )
    parser.add_argument("input_file")
    parser.add_argument("output_file")
    parser.add_argument("-m", "--max_percent", default=50.0, type=float)
    args = parser.parse_args()
    main(args.input_file, args.output_file, args.max_percent)
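The script takes the reads-to-contigs SAM file and an output path; the only option is the clipping threshold (default 50%). A minimal usage sketch follows; the filename `filter_clipped_alignments.py` is an assumption here, as the commit listing does not show the script's actual path:

```
# keep alignments with less than 40% of the read soft/hard-clipped;
# heavier-clipped alignments are rewritten as unmapped records that keep QNAME, SEQ, QUAL
python3 filter_clipped_alignments.py reads_to_contigs.sam filtered.sam -m 40

# with no -m flag the threshold defaults to 50%, so a 100 bp read with CIGAR 30S70M
# (30% clipped) is kept, while one with 60S40M (60% clipped) is marked unmapped
python3 filter_clipped_alignments.py reads_to_contigs.sam filtered.sam
```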