feat: add tax_id parameter (#147)
* feat: add org param

* refactor: avoid duplicate mappings (#131)

Co-authored-by: Boris Jurič <499542@mail.muni.cz>
Co-authored-by: Alex Kanitz <alexander.kanitz@alumni.ethz.ch>

* fix typo, update pylint config

* feat: add org_id param #108

* refactor: get_library_source.py #108

* test: add org param tests #108

* fix: update Pydantic version (#146)

* fix pydantic issues

* fix: update pydantic version in envs

* fix: pin sphinx-rtd-theme into env

* fix: update readthedocs config

* update readme, gitignore

* feat: infer org source if id not in dict #108

* replace json with model_dump

* feat: add org_id param #108

* feat: add org_id param #108

* refactor: replace org with tax-id

* refactor get_library_source

* refactor get_library_source tests

* refactor: update models.py

* refactor: fix typos

---------

Co-authored-by: Boris Jurič <74237898+BorisYourich@users.noreply.github.com>
Co-authored-by: Boris Jurič <499542@mail.muni.cz>
Co-authored-by: Alex Kanitz <alexander.kanitz@alumni.ethz.ch>
4 people committed Nov 15, 2023
1 parent 5ef3322 commit 8443c05
Showing 10 changed files with 238 additions and 18 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -117,3 +117,4 @@ tests/.DS_Store
results_htsinfer
.snakemake
tests/cluster_tests/results_sra_downloads
*.out
1 change: 1 addition & 0 deletions README.md
@@ -109,6 +109,7 @@ htsinfer [--output-directory PATH]
[--library-type-mates-cutoff FLOAT]
[--read-orientation-min-mapped-reads INT]
[--read-orientation-min-fraction FLOAT]
[--tax-id INT]
[--verbosity {DEBUG,INFO,WARN,ERROR,CRITICAL}]
[-h] [--version]
PATH [PATH]
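For illustration only (the file names are placeholders; 9606 is the NCBI taxonomy ID for Homo sapiens), the new flag slots in alongside the existing positional arguments:

htsinfer --tax-id 9606 mate_1.fastq mate_2.fastq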
11 changes: 11 additions & 0 deletions htsinfer/cli.py
@@ -262,6 +262,17 @@ def __call__(
"be reported. Must be above 0.5"
)
)
parser.add_argument(
    '--tax-id',
    dest="tax_id",
    metavar="INT",
    type=int,
    default=None,
    help=(
        "NCBI taxonomic identifier of source organism of the library; "
        "if provided, will not be inferred by the application"
    )
)
parser.add_argument(
    "--verbosity",
    choices=[e.name for e in LogLevels],
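As a minimal sketch of the option's behavior with standalone argparse (not the full HTSinfer parser), the value is stored as an int under args.tax_id and defaults to None, which is what later makes the library source get inferred instead:

import argparse

# Stand-in for the fragment above; only the --tax-id option is wired up.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--tax-id',
    dest="tax_id",
    metavar="INT",
    type=int,
    default=None,
)

print(parser.parse_args([]).tax_id)                    # None -> source will be inferred
print(parser.parse_args(["--tax-id", "9606"]).tax_id)  # 9606 (int)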
4 changes: 4 additions & 0 deletions htsinfer/exceptions.py
@@ -44,3 +44,7 @@ class TranscriptsFastaProblem(Exception):

class CutadaptProblem(Exception):
    """Exception raised when running cutadapt commands."""


class UnsupportedSampleSourceException(Exception):
    """Exception raised when taxonomy ID is not supported."""
83 changes: 75 additions & 8 deletions htsinfer/get_library_source.py
@@ -5,13 +5,15 @@
import subprocess as sp
import tempfile

from Bio import SeqIO # type: ignore
import pandas as pd # type: ignore
from pandas import DataFrame # type: ignore

from htsinfer.exceptions import (
    FileProblem,
    KallistoProblem,
    TranscriptsFastaProblem,
    UnsupportedSampleSourceException,
)
from htsinfer.models import (
    ResultsSource,
@@ -50,6 +52,7 @@ class GetLibSource:
min_freq_ratio: Minimum frequency ratio between the first and second
most frequent source in order for the former to be considered the
library's source.
tax_id: Taxonomy ID of the sample source.
"""
def __init__( # pylint: disable=E1101
self,
@@ -63,6 +66,7 @@ def __init__( # pylint: disable=E1101
self.tmp_dir = config.args.tmp_dir
self.min_match_pct = config.args.lib_source_min_match_pct
self.min_freq_ratio = config.args.lib_source_min_freq_ratio
self.tax_id = config.args.tax_id

def evaluate(self) -> ResultsSource:
"""Infer read source.
@@ -71,16 +75,36 @@ def evaluate(self) -> ResultsSource:
Source results object.
"""
source = ResultsSource()
index = self.create_kallisto_index()
source.file_1 = self.get_source(
fastq=self.paths[0],
index=index,
)
if self.paths[1] is not None:
source.file_2 = self.get_source(
fastq=self.paths[1],
# Check if library_source is provided, otherwise infer it
if self.tax_id is not None:
source.file_1.taxon_id = self.tax_id
src_name = self.get_source_name(
self.tax_id,
self.transcripts_file
)
source.file_1.short_name = src_name

if self.paths[1] is not None:
source.file_2.taxon_id = self.tax_id
source.file_2.short_name = source.file_1.short_name

else:
index = self.create_kallisto_index()
library_source = self.get_source(
fastq=self.paths[0],
index=index,
)
source.file_1.short_name = library_source.short_name
source.file_1.taxon_id = library_source.taxon_id

if self.paths[1] is not None:
library_source = self.get_source(
fastq=self.paths[1],
index=index,
)
source.file_2.short_name = library_source.short_name
source.file_2.taxon_id = library_source.taxon_id

return source

def create_kallisto_index(self) -> Path:
@@ -281,3 +305,46 @@ def get_source_expression(

# return as dictionary
return dat_agg.sort_values(["tpm"], ascending=False)

@staticmethod
def get_source_name(
    taxon_id: int,
    transcripts_file: Path,
) -> str:
    """Return name of the source organism, based on tax ID.
    Args:
        taxon_id: Taxonomy ID of a given organism.
        transcripts_file: Path to FASTA file containing transcripts.
    Returns:
        Short name of the organism belonging to the given tax ID.
    Raises:
        FileProblem: Could not process input FASTA file.
        UnsupportedSampleSourceException: Taxon ID is not supported.
    """
    src_dict = {}

    try:
        for record in list(SeqIO.parse(
            handle=transcripts_file,
            format='fasta',
        )):
            tax_id = int(record.description.split("|")[4])
            src_name = record.description.split("|")[3]

            src_dict[tax_id] = src_name

    except OSError as exc:
        raise FileProblem(
            f"Could not process file '{transcripts_file}'"
        ) from exc

    try:
        return src_dict[taxon_id]

    except KeyError as exc:
        raise UnsupportedSampleSourceException(
            f'Taxon ID "{taxon_id}" is not supported by HTSinfer.'
        ) from exc
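A sketch of the lookup get_source_name performs, assuming pipe-delimited transcript FASTA headers with the organism short name in field 3 and the taxonomy ID in field 4 (the header and the 'hsapiens' label below are made up for illustration):

from io import StringIO

from Bio import SeqIO

# Hypothetical header laid out as ...|<short_name>|<taxon_id>|...
fasta = StringIO(
    ">ENST00000000001|ENSG00000000001|protein_coding|hsapiens|9606|1\n"
    "ACGT\n"
)

src_dict = {}
for record in SeqIO.parse(fasta, "fasta"):
    fields = record.description.split("|")
    src_dict[int(fields[4])] = fields[3]  # taxon ID -> short name

print(src_dict.get(9606))   # "hsapiens"
print(src_dict.get(10090))  # None; in HTSinfer a missing ID raises UnsupportedSampleSourceException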
2 changes: 1 addition & 1 deletion htsinfer/get_read_layout.py
@@ -221,7 +221,7 @@ def evaluate(self) -> None:
try:
with open(self.path, encoding="utf-8") as _f: # type: ignore

LOGGER.debug("Procecssing Reads")
LOGGER.debug("Processing Reads")
try:
for record in FastqGeneralIterator(source=_f):
read = record[1]
12 changes: 6 additions & 6 deletions htsinfer/htsinfer.py
@@ -85,15 +85,15 @@ def evaluate(self):
self.get_library_stats()
LOGGER.info(
"Library stats determined: "
f"{self.config.results.library_stats.json()}"
f"{self.config.results.library_stats.model_dump_json()}"
)

# determine library source
LOGGER.info("Determining library source...")
self.config.results.library_source = self.get_library_source()
LOGGER.info(
"Library source determined: "
f"{self.config.results.library_source.json()}"
f"{self.config.results.library_source.model_dump_json()}"
)

# determine library type
@@ -106,7 +106,7 @@
LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
LOGGER.info(
"Library type determined: "
f"{self.config.results.library_type.json()}"
f"{self.config.results.library_type.model_dump_json()}"
)

# determine read orientation
@@ -119,7 +119,7 @@
LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
LOGGER.info(
"Read orientation determined: "
f"{self.config.results.read_orientation.json()}"
f"{self.config.results.read_orientation.model_dump_json()}"
)

# determine read layout
@@ -132,7 +132,7 @@
LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
LOGGER.info(
"Read layout determined: "
f"{self.config.results.read_layout.json()}"
f"{self.config.results.read_layout.model_dump_json()}"
)

except FileProblem as exc:
@@ -148,7 +148,7 @@
LOGGER.error(f"{type(exc).__name__}: {str(exc)}")

# log results
LOGGER.info(f"Results: {self.config.results.json()}")
LOGGER.info(f"Results: {self.config.results.model_dump_json()}")

def prepare_env(self):
"""Set up work environment."""
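The .json() -> .model_dump_json() replacements above follow the Pydantic v2 API, where BaseModel.json() is deprecated in favor of model_dump_json(). A minimal sketch with a made-up model (not one of the actual HTSinfer results classes):

from typing import Optional

from pydantic import BaseModel


class Demo(BaseModel):
    short_name: Optional[str] = None
    taxon_id: Optional[int] = None


demo = Demo(short_name="hsapiens", taxon_id=9606)
# Pydantic v1 style (deprecated in v2): demo.json()
print(demo.model_dump_json())  # {"short_name":"hsapiens","taxon_id":9606}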
2 changes: 2 additions & 0 deletions htsinfer/models.py
@@ -356,6 +356,7 @@ class Args(BaseModel):
records: Number of input file records to process; set to `0` to
process all records.
threads: Number of threads to run STAR with.
tax_id: Taxonomy ID of the sample source.
transcripts_file: File path to transcripts FASTA file.
read_layout_adapter_file: Path to text file containing 3' adapter
sequences to scan for (one sequence per line).
@@ -429,6 +430,7 @@ class Args(BaseModel):
CleanupRegimes.DEFAULT
records: int = 1000000
threads: int = 1
tax_id: Optional[int] = None
transcripts_file: Path = Path()
read_layout_adapter_file: Path = Path()
read_layout_min_match_pct: float = 0.1
4 changes: 2 additions & 2 deletions pylint.cfg
@@ -1,4 +1,4 @@
[MESSAGES CONTROL]
disable=C0330,I1101,R0801,R0902,R0903,R0913,R0914,W1202,W1203,W1510
extension-pkg-white-list=pysam,ahocorasick
disable=I1101,R0801,R0902,R0903,R0913,R0914,W1202,W1203,W1510
extension-pkg-whitelist=pysam,ahocorasick
ignored-classes=pysam