Skip to content

Commit

Permalink
feat: improve lib type inference for SRA PE samples (#161)
Browse files Browse the repository at this point in the history
* feat: add asumed mates

* update get lib type tests

* lower lib_type_mate_cutoff
  • Loading branch information
balajtimate committed Jan 23, 2024
1 parent 7b65c43 commit 69ea74c
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 8 deletions.
2 changes: 1 addition & 1 deletion htsinfer/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def __call__(
dest="lib_type_mates_cutoff",
metavar="FLOAT",
type=float,
default=0.95,
default=0.85,
help=(
"minimum fraction of mates that can be mapped to compatible loci "
"and are considered concordant pairs / all mates"
Expand Down
33 changes: 30 additions & 3 deletions htsinfer/get_library_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,21 @@ def _evaluate_mate_relationship(
self.mapping.library_type.relationship = (
StatesTypeRelationship.split_mates
)
# Infer mate relationship, even when assumed to be single
elif (
self.results.file_1 == StatesType.single and
self.results.file_2 == StatesType.single
) and (
self.library_source.file_1.short_name is not None or
self.library_source.file_2.short_name is not None
):
LOGGER.debug("Determining mate relationship by alignment...")
self.mapping.library_type.relationship \
= StatesTypeRelationship.not_available
self.mapping.library_source = self.library_source
self.mapping.paths = self.path_1, self.path_2
self.mapping.evaluate()
self._align_mates()
elif (
self.library_source.file_1.short_name is not None or
self.library_source.file_2.short_name is not None
Expand Down Expand Up @@ -209,15 +224,15 @@ def _align_mates(self):
LOGGER.debug(f"Number of aligned reads file 2: {aligned_mate2}")
LOGGER.debug(f"Number of concordant reads: {concordant}")

self._update_relationship(
self._update_relationship_type(
concordant, min(aligned_mate1, aligned_mate2)
)

samfile1.close()
samfile2.close()

def _update_relationship(self, concordant, aligned_reads):
"""Helper function to update relationship based on alignment."""
def _update_relationship_type(self, concordant, aligned_reads):
"""Helper function to update relationship and type."""
try:
ratio = concordant / aligned_reads
except ZeroDivisionError:
Expand All @@ -238,6 +253,18 @@ def _update_relationship(self, concordant, aligned_reads):
self.results.relationship = (
StatesTypeRelationship.not_mates
)
if self.results.relationship == (
StatesTypeRelationship.split_mates
) and (
self.results.file_1 == StatesType.single and
self.results.file_2 == StatesType.single
) or (
self.results.file_1 == StatesType.not_available and
self.results.file_2 == StatesType.not_available
):
# Update first and second relationship
self.results.file_1 = StatesType.first_mate_assumed
self.results.file_2 = StatesType.second_mate_assumed

class AlignedSegment:
"""Placeholder class for mypy "Missing attribute"
Expand Down
4 changes: 3 additions & 1 deletion htsinfer/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,9 +155,11 @@ class StatesType(Enum):
that the library represents a single-end library.
"""
first_mate = "first_mate"
first_mate_assumed = "first_mate_assumed"
mixed_mates = "mixed_mates"
not_available = None
second_mate = "second_mate"
second_mate_assumed = "second_mate_assumed"
single = "single"


Expand Down Expand Up @@ -438,7 +440,7 @@ class Args(BaseModel):
lib_source_min_match_pct: float = 2
lib_source_min_freq_ratio: float = 2
lib_type_max_distance: int = 1000
lib_type_mates_cutoff: float = 0.95
lib_type_mates_cutoff: float = 0.85
read_orientation_min_mapped_reads: int = 20
read_orientation_min_fraction: float = 0.75
path_1_processed: Path = Path()
Expand Down
34 changes: 31 additions & 3 deletions tests/test_get_library_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,34 @@ def test_evaluate_mate_relationship_split_mates(self):
StatesTypeRelationship.split_mates
)

def test_evaluate_mate_relationship_assumed_single(self, tmpdir):
"""Test mate relationship evaluation logic with input files being
mates of a paired-end library but assumed single based on seq_ids.
"""
CONFIG.args.path_1_processed = FILE_MATE_1
CONFIG.args.path_2_processed = FILE_MATE_2
CONFIG.args.t_file_processed = FILE_TRANSCRIPTS
CONFIG.args.tmp_dir = tmpdir
CONFIG.results.library_source = ResultsSource(
file_1=Source(short_name="hsapiens", taxon_id=9606),
file_2=Source(short_name="hsapiens", taxon_id=9606),
)
MAPPING.paths = (FILE_MATE_1, FILE_MATE_2)
MAPPING.transcripts_file = FILE_TRANSCRIPTS
MAPPING.tmp_dir = tmpdir
test_instance = GetLibType(config=CONFIG,
mapping=MAPPING)
test_instance.results.file_1 = StatesType.single
test_instance.results.file_2 = StatesType.single
test_instance._evaluate_mate_relationship(
ids_1=["A", "B", "C"],
ids_2=["A", "B", "C"],
)
assert (
test_instance.results.relationship ==
StatesTypeRelationship.not_available
)

def test_evaluate_mate_relationship_not_mates(self, tmpdir):
"""Test mate relationship evaluation logic with input files that are
mates, but the relationship is not enough to trigger split_mates.
Expand Down Expand Up @@ -167,7 +195,7 @@ def test_evaluate_mate_relationship_not_available(self, tmpdir):
StatesTypeRelationship.not_available
)

def test_update_relationship_not_mates(self, tmpdir):
def test_update_relationship_type_not_mates(self, tmpdir):
"""Test update_relationship logic."""
CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1
CONFIG.args.path_2_processed = FILE_MATE_2
Expand All @@ -185,8 +213,8 @@ def test_update_relationship_not_mates(self, tmpdir):
concordant = 0
read_counter = 20

# Call the _update_relationship method
test_instance._update_relationship(concordant, read_counter)
# Call the _update_relationship_type method
test_instance._update_relationship_type(concordant, read_counter)

assert (
test_instance.results.relationship ==
Expand Down

0 comments on commit 69ea74c

Please sign in to comment.