feat: add org_id param #108

zavolanlab · Nov 9, 2023 · 00c0d6b · 00c0d6b
1 parent 11f7e08
commit 00c0d6b
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 21 deletions.
diff --git a/htsinfer/cli.py b/htsinfer/cli.py
@@ -273,17 +273,6 @@ def __call__(
             "not be inferred by the application"
         )
     )
-    parser.add_argument(
-        '--org-name',
-        dest="org_name",
-        metavar="STR",
-        type=str,
-        default=None,
-        help=(
-            "source organism of the sequencing library; if provided, will not "
-            "not be inferred by the application"
-        )
-    )
     parser.add_argument(
         "--verbosity",
         choices=[e.name for e in LogLevels],

diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py
@@ -2,10 +2,11 @@
 
 import logging
 from pathlib import Path
-from typing import Optional
 import subprocess as sp
 import tempfile
 
+from typing import Optional
+from Bio import SeqIO  # type: ignore
 import pandas as pd  # type: ignore
 from pandas import DataFrame  # type: ignore
 
@@ -51,6 +52,7 @@ class GetLibSource:
         min_freq_ratio: Minimum frequency ratio between the first and second
             most frequent source in order for the former to be considered the
             library's source.
+        org_id: Taxonomy ID of the organism.
     """
     def __init__(  # pylint: disable=E1101
         self,
@@ -64,7 +66,6 @@ def __init__(  # pylint: disable=E1101
         self.tmp_dir = config.args.tmp_dir
         self.min_match_pct = config.args.lib_source_min_match_pct
         self.min_freq_ratio = config.args.lib_source_min_freq_ratio
-        self.org_name = config.args.org_name
         self.org_id = config.args.org_id
 
     def evaluate(self) -> ResultsSource:
@@ -75,9 +76,12 @@ def evaluate(self) -> ResultsSource:
         """
         source = ResultsSource()
         # Check if library_source is provided, otherwise infer it
-        if self.org_name is not None:
-            source.file_1.short_name = self.org_name
+        if self.org_id is not None:
             source.file_1.taxon_id = self.org_id
+            source.file_1.short_name = self.get_organism_name(
+                self.org_id,
+                self.transcripts_file
+            )
         else:
             # Infer library source here and set it to source.library_source
             index = self.create_kallisto_index()
@@ -89,10 +93,11 @@ def evaluate(self) -> ResultsSource:
             source.file_1.taxon_id = library_source.taxon_id
 
         if self.paths[1] is not None:
-            # Check if library_source is provided for file_2, otherwise infer it
-            if self.org_name is not None:
-                source.file_2.short_name = self.org_name
+            # Check if library_source is provided for file_2,
+            # otherwise infer it
+            if self.org_id is not None:
                 source.file_2.taxon_id = self.org_id
+                source.file_2.short_name = source.file_1.short_name
             else:
                 library_source = self.get_source(
                     fastq=self.paths[1],
@@ -301,3 +306,48 @@ def get_source_expression(
 
         # return as dictionary
         return dat_agg.sort_values(["tpm"], ascending=False)
+
+    @staticmethod
+    def get_organism_name(
+        taxon_id: int,
+        transcripts_file: Path,
+    ) -> Optional[str]:
+        """Return name of the organism, based on tax ID.
+
+        Args:
+            taxon_id: Taxonomy ID of a given organism (int).
+            transcripts_file: Path to fasta file containing transcripts.
+
+        Returns:
+            Short name of the organism belonging to the given tax ID.
+
+        Raises:
+            Could not process input FASTA file.
+        """
+        org_dict = {}
+        # Construct dictionary of organism ID's and names
+        try:
+            for record in SeqIO.parse(
+                    handle=transcripts_file,
+                    format='fasta',
+            ):
+                org_id = int(record.description.split("|")[4])
+                org_name = record.description.split("|")[3]
+
+                if org_id not in org_dict:
+                    org_dict[org_id] = org_name
+                else:
+                    org_dict[org_id] = org_name
+
+        except OSError as exc:
+            raise FileProblem(
+                f"Could not process file '{transcripts_file}'"
+            ) from exc
+
+        if taxon_id in org_dict:
+            return org_dict[taxon_id]
+        LOGGER.warning(
+            f"Taxon ID '{taxon_id}' not found in organism dictionary, "
+            "using default 'None' for organism."
+        )
+        return None
diff --git a/htsinfer/models.py b/htsinfer/models.py
@@ -6,7 +6,7 @@
 )
 import logging
 import re
-from typing import Optional, Union
+from typing import Optional
 from pathlib import Path
 import tempfile
 
@@ -356,7 +356,6 @@ class Args(BaseModel):
         records: Number of input file records to process; set to `0` to
             process all records.
         threads: Number of threads to run STAR with.
-        org_name: Organism name.
         org_id: Organism ID.
         transcripts_file: File path to transcripts FASTA file.
         read_layout_adapter_file: Path to text file containing 3' adapter
@@ -431,7 +430,6 @@ class Args(BaseModel):
         CleanupRegimes.DEFAULT
     records: int = 0
     threads: int = 1
-    org_name: Optional[str] = None
     org_id: Optional[int] = None
     transcripts_file: Path = Path()
     read_layout_adapter_file: Path = Path()