feat: add tax_id parameter (#147)
* feat: add org param

* refactor: avoid duplicate mappings (#131)

Co-authored-by: Boris Jurič <499542@mail.muni.cz>
Co-authored-by: Alex Kanitz <alexander.kanitz@alumni.ethz.ch>

* fix typo, update pylint config

* feat: add org_id param #108

* refactor: get_library_source.py #108

* test: add org param tests #108

* fix: update Pydantic version (#146)

* fix pydantic issues

* fix: update pydantic version in envs

* fix: pin sphinx-rtd-theme into env

* fix: update readthedocs config

* update readme, gitignore

* feat: infer org source if id not in dict #108

* replace json with model_dump

* feat: add org_id param #108

* feat: add org_id param #108

* refactor: replace org with tax-id

* refactor get_library_source

* refactor get_library_source tests

* refactor: update models.py

* refactor: fix typos

---------

Co-authored-by: Boris Jurič <74237898+BorisYourich@users.noreply.github.com>
Co-authored-by: Boris Jurič <499542@mail.muni.cz>
Co-authored-by: Alex Kanitz <alexander.kanitz@alumni.ethz.ch>
4 people committed Nov 15, 2023
1 parent 5ef3322 commit 8443c05
Showing 10 changed files with 238 additions and 18 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -117,3 +117,4 @@ tests/.DS_Store
results_htsinfer
.snakemake
tests/cluster_tests/results_sra_downloads
*.out
1 change: 1 addition & 0 deletions README.md
@@ -109,6 +109,7 @@ htsinfer [--output-directory PATH]
[--library-type-mates-cutoff FLOAT]
[--read-orientation-min-mapped-reads INT]
[--read-orientation-min-fraction FLOAT]
[--tax-id INT]
[--verbosity {DEBUG,INFO,WARN,ERROR,CRITICAL}]
[-h] [--version]
PATH [PATH]
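For illustration only (the file names are placeholders; 9606 is the NCBI taxonomy ID for Homo sapiens), the new flag slots in alongside the existing positional arguments:

htsinfer --tax-id 9606 mate_1.fastq mate_2.fastq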
11 changes: 11 additions & 0 deletions htsinfer/cli.py
@@ -262,6 +262,17 @@ def __call__(
"be reported. Must be above 0.5"
)
)
parser.add_argument(
    '--tax-id',
    dest="tax_id",
    metavar="INT",
    type=int,
    default=None,
    help=(
        "NCBI taxonomic identifier of source organism of the library; "
        "if provided, will not be inferred by the application"
    )
)
parser.add_argument(
    "--verbosity",
    choices=[e.name for e in LogLevels],
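As a minimal sketch of the option's behavior with standalone argparse (not the full HTSinfer parser), the value is stored as an int under args.tax_id and defaults to None, which is what later makes the library source get inferred instead:

import argparse

# Stand-in for the fragment above; only the --tax-id option is wired up.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--tax-id',
    dest="tax_id",
    metavar="INT",
    type=int,
    default=None,
)

print(parser.parse_args([]).tax_id)                    # None -> source will be inferred
print(parser.parse_args(["--tax-id", "9606"]).tax_id)  # 9606 (int)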
4 changes: 4 additions & 0 deletions htsinfer/exceptions.py
@@ -44,3 +44,7 @@ class TranscriptsFastaProblem(Exception):

class CutadaptProblem(Exception):
    """Exception raised when running cutadapt commands."""


class UnsupportedSampleSourceException(Exception):
    """Exception raised when taxonomy ID is not supported."""
83 changes: 75 additions & 8 deletions htsinfer/get_library_source.py
@@ -5,13 +5,15 @@
import subprocess as sp
import tempfile

from Bio import SeqIO # type: ignore
import pandas as pd # type: ignore
from pandas import DataFrame # type: ignore

from htsinfer.exceptions import (
    FileProblem,
    KallistoProblem,
    TranscriptsFastaProblem,
    UnsupportedSampleSourceException,
)
from htsinfer.models import (
    ResultsSource,
@@ -50,6 +52,7 @@ class GetLibSource:
min_freq_ratio: Minimum frequency ratio between the first and second
most frequent source in order for the former to be considered the
library's source.
tax_id: Taxonomy ID of the sample source.
"""
def __init__( # pylint: disable=E1101
self,
@@ -63,6 +66,7 @@ def __init__( # pylint: disable=E1101
self.tmp_dir = config.args.tmp_dir
self.min_match_pct = config.args.lib_source_min_match_pct
self.min_freq_ratio = config.args.lib_source_min_freq_ratio
self.tax_id = config.args.tax_id

def evaluate(self) -> ResultsSource:
"""Infer read source.
@@ -71,16 +75,36 @@ def evaluate(self) -> ResultsSource:
Source results object.
"""
source = ResultsSource()
index = self.create_kallisto_index()
source.file_1 = self.get_source(
fastq=self.paths[0],
index=index,
)
if self.paths[1] is not None:
source.file_2 = self.get_source(
fastq=self.paths[1],
# Check if library_source is provided, otherwise infer it
if self.tax_id is not None:
source.file_1.taxon_id = self.tax_id
src_name = self.get_source_name(
self.tax_id,
self.transcripts_file
)
source.file_1.short_name = src_name

if self.paths[1] is not None:
source.file_2.taxon_id = self.tax_id
source.file_2.short_name = source.file_1.short_name

else:
index = self.create_kallisto_index()
library_source = self.get_source(
fastq=self.paths[0],
index=index,
)
source.file_1.short_name = library_source.short_name
source.file_1.taxon_id = library_source.taxon_id

if self.paths[1] is not None:
library_source = self.get_source(
fastq=self.paths[1],
index=index,
)
source.file_2.short_name = library_source.short_name
source.file_2.taxon_id = library_source.taxon_id

return source

def create_kallisto_index(self) -> Path:
@@ -281,3 +305,46 @@ def get_source_expression(

# return as dictionary
return dat_agg.sort_values(["tpm"], ascending=False)

@staticmethod
def get_source_name(
    taxon_id: int,
    transcripts_file: Path,
) -> str:
    """Return name of the source organism, based on tax ID.
    Args:
        taxon_id: Taxonomy ID of a given organism.
        transcripts_file: Path to FASTA file containing transcripts.
    Returns:
        Short name of the organism belonging to the given tax ID.
    Raises:
        FileProblem: Could not process input FASTA file.
        UnsupportedSampleSourceException: Taxon ID is not supported.
    """
    src_dict = {}

    try:
        for record in list(SeqIO.parse(
            handle=transcripts_file,
            format='fasta',
        )):
            tax_id = int(record.description.split("|")[4])
            src_name = record.description.split("|")[3]

            src_dict[tax_id] = src_name

    except OSError as exc:
        raise FileProblem(
            f"Could not process file '{transcripts_file}'"
        ) from exc

    try:
        return src_dict[taxon_id]

    except KeyError as exc:
        raise UnsupportedSampleSourceException(
            f'Taxon ID "{taxon_id}" is not supported by HTSinfer.'
        ) from exc
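A sketch of the lookup get_source_name performs, assuming pipe-delimited transcript FASTA headers with the organism short name in field 3 and the taxonomy ID in field 4 (the header and the 'hsapiens' label below are made up for illustration):

from io import StringIO

from Bio import SeqIO

# Hypothetical header laid out as ...|<short_name>|<taxon_id>|...
fasta = StringIO(
    ">ENST00000000001|ENSG00000000001|protein_coding|hsapiens|9606|1\n"
    "ACGT\n"
)

src_dict = {}
for record in SeqIO.parse(fasta, "fasta"):
    fields = record.description.split("|")
    src_dict[int(fields[4])] = fields[3]  # taxon ID -> short name

print(src_dict.get(9606))   # "hsapiens"
print(src_dict.get(10090))  # None; in HTSinfer a missing ID raises UnsupportedSampleSourceException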
2 changes: 1 addition & 1 deletion htsinfer/get_read_layout.py
@@ -221,7 +221,7 @@ def evaluate(self) -> None:
try:
with open(self.path, encoding="utf-8") as _f: # type: ignore

LOGGER.debug("Procecssing Reads")
LOGGER.debug("Processing Reads")
try:
for record in FastqGeneralIterator(source=_f):
read = record[1]
12 changes: 6 additions & 6 deletions htsinfer/htsinfer.py
@@ -85,15 +85,15 @@ def evaluate(self):
self.get_library_stats()
LOGGER.info(
"Library stats determined: "
f"{self.config.results.library_stats.json()}"
f"{self.config.results.library_stats.model_dump_json()}"
)

# determine library source
LOGGER.info("Determining library source...")
self.config.results.library_source = self.get_library_source()
LOGGER.info(
"Library source determined: "
f"{self.config.results.library_source.json()}"
f"{self.config.results.library_source.model_dump_json()}"
)

# determine library type
@@ -106,7 +106,7 @@
LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
LOGGER.info(
"Library type determined: "
f"{self.config.results.library_type.json()}"
f"{self.config.results.library_type.model_dump_json()}"
)

# determine read orientation
@@ -119,7 +119,7 @@
LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
LOGGER.info(
"Read orientation determined: "
f"{self.config.results.read_orientation.json()}"
f"{self.config.results.read_orientation.model_dump_json()}"
)

# determine read layout
@@ -132,7 +132,7 @@
LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
LOGGER.info(
"Read layout determined: "
f"{self.config.results.read_layout.json()}"
f"{self.config.results.read_layout.model_dump_json()}"
)

except FileProblem as exc:
@@ -148,7 +148,7 @@
LOGGER.error(f"{type(exc).__name__}: {str(exc)}")

# log results
LOGGER.info(f"Results: {self.config.results.json()}")
LOGGER.info(f"Results: {self.config.results.model_dump_json()}")

def prepare_env(self):
"""Set up work environment."""
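The .json() -> .model_dump_json() replacements above follow the Pydantic v2 API, where BaseModel.json() is deprecated in favor of model_dump_json(). A minimal sketch with a made-up model (not one of the actual HTSinfer results classes):

from typing import Optional

from pydantic import BaseModel


class Demo(BaseModel):
    short_name: Optional[str] = None
    taxon_id: Optional[int] = None


demo = Demo(short_name="hsapiens", taxon_id=9606)
# Pydantic v1 style (deprecated in v2): demo.json()
print(demo.model_dump_json())  # {"short_name":"hsapiens","taxon_id":9606}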
2 changes: 2 additions & 0 deletions htsinfer/models.py
@@ -356,6 +356,7 @@ class Args(BaseModel):
records: Number of input file records to process; set to `0` to
process all records.
threads: Number of threads to run STAR with.
tax_id: Taxonomy ID of the sample source.
transcripts_file: File path to transcripts FASTA file.
read_layout_adapter_file: Path to text file containing 3' adapter
sequences to scan for (one sequence per line).
@@ -429,6 +430,7 @@ class Args(BaseModel):
CleanupRegimes.DEFAULT
records: int = 1000000
threads: int = 1
tax_id: Optional[int] = None
transcripts_file: Path = Path()
read_layout_adapter_file: Path = Path()
read_layout_min_match_pct: float = 0.1
4 changes: 2 additions & 2 deletions pylint.cfg
@@ -1,4 +1,4 @@
[MESSAGES CONTROL]
disable=C0330,I1101,R0801,R0902,R0903,R0913,R0914,W1202,W1203,W1510
extension-pkg-white-list=pysam,ahocorasick
disable=I1101,R0801,R0902,R0903,R0913,R0914,W1202,W1203,W1510
extension-pkg-whitelist=pysam,ahocorasick
ignored-classes=pysam