From e100d720a4ce5d644d13aa2ad842f62a9b170a03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <51365402+balajtimate@users.noreply.github.com> Date: Wed, 31 Jan 2024 21:16:16 +0100 Subject: [PATCH] refactor: pre-release cleanup (#162) --- .gitignore | 2 - README.md | 30 +- htsinfer/get_library_type.py | 26 +- htsinfer/models.py | 2 +- tests/cluster_tests/cluster_test_strategy.py | 283 ------------------- tests/cluster_tests/mined_test_data.tsv | 59 ---- tests/test_get_library_type.py | 20 +- tests/test_htsinfer.py | 10 +- 8 files changed, 52 insertions(+), 380 deletions(-) delete mode 100644 tests/cluster_tests/cluster_test_strategy.py delete mode 100644 tests/cluster_tests/mined_test_data.tsv diff --git a/.gitignore b/.gitignore index c53c6f18..3fae8b9c 100644 --- a/.gitignore +++ b/.gitignore @@ -111,10 +111,8 @@ dmypy.json # Custom additions .vscode .idea -results .DS_Store tests/.DS_Store results_htsinfer .snakemake -tests/cluster_tests/results_sra_downloads *.out diff --git a/README.md b/README.md index 53e7267d..391892b5 100644 --- a/README.md +++ b/README.md @@ -32,16 +32,6 @@ example library: ```json { - "library_source": { - "file_1": { - "short_name": "hsapiens", - "taxon_id": "9606" - }, - "file_2": { - "short_name": "hsapiens", - "taxon_id": "9606" - } - }, "library_stats": { "file_1": { "read_length": { @@ -62,11 +52,26 @@ example library: } } }, + "library_source": { + "file_1": { + "short_name": "hsapiens", + "taxon_id": "9606" + }, + "file_2": { + "short_name": "hsapiens", + "taxon_id": "9606" + } + }, "library_type": { "file_1": "first_mate", "file_2": "second_mate", "relationship": "split_mates" }, + "read_orientation": { + "file_1": "SF", + "file_2": "SR", + "relationship": "ISF" + }, "read_layout": { "file_1": { "adapt_3": "AATGATACGGCGACC", @@ -76,11 +81,6 @@ example library: "adapt_3": "AATGATACGGCGACC", "polyA_frac": 10.0 } - }, - "read_orientation": { - "file_1": "SF", - "file_2": "SR", - "relationship": "ISF" } } ``` diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py index e4aea818..8e4166b3 100644 --- a/htsinfer/get_library_type.py +++ b/htsinfer/get_library_type.py @@ -253,18 +253,20 @@ def _update_relationship_type(self, concordant, aligned_reads): self.results.relationship = ( StatesTypeRelationship.not_mates ) - if self.results.relationship == ( - StatesTypeRelationship.split_mates - ) and ( - self.results.file_1 == StatesType.single and - self.results.file_2 == StatesType.single - ) or ( - self.results.file_1 == StatesType.not_available and - self.results.file_2 == StatesType.not_available - ): - # Update first and second relationship - self.results.file_1 = StatesType.first_mate_assumed - self.results.file_2 = StatesType.second_mate_assumed + if ( + self.results.relationship == ( + StatesTypeRelationship.split_mates + ) + and ( + (self.results.file_1 == StatesType.single and + self.results.file_2 == StatesType.single) or + (self.results.file_1 == StatesType.not_available and + self.results.file_2 == StatesType.not_available) + ) + ): + # Update first and second relationship + self.results.file_1 = StatesType.first_mate_assumed + self.results.file_2 = StatesType.second_mate_assumed class AlignedSegment: """Placeholder class for mypy "Missing attribute" diff --git a/htsinfer/models.py b/htsinfer/models.py index e8e27781..f8ab629c 100644 --- a/htsinfer/models.py +++ b/htsinfer/models.py @@ -339,8 +339,8 @@ class Results(BaseModel): read_layout: Read layout inference results. """ library_stats: ResultsStats = ResultsStats() - library_type: ResultsType = ResultsType() library_source: ResultsSource = ResultsSource() + library_type: ResultsType = ResultsType() read_orientation: ResultsOrientation = ResultsOrientation() read_layout: ResultsLayout = ResultsLayout() diff --git a/tests/cluster_tests/cluster_test_strategy.py b/tests/cluster_tests/cluster_test_strategy.py deleted file mode 100644 index 400d3f5e..00000000 --- a/tests/cluster_tests/cluster_test_strategy.py +++ /dev/null @@ -1,283 +0,0 @@ -"""Testing strategy for HTSinfer.""" - -import json -import subprocess as sp -import pandas as pd -import numpy as np -import seaborn as sns -import matplotlib.pyplot as plt -from pathlib import Path -from htsinfer.models import Results - -records = 100000 -min_match = 2 -min_freq = 0.3 - -ZARP_CMD = 'TEST_PATH=tests/cluster_tests; \ - ZARP_PATH=../zarp; \ - conda run -n zarp snakemake --unlock \ - --snakefile="$ZARP_PATH/workflow/rules/sra_download.smk" \ - --profile="$ZARP_PATH/profiles/local-conda" \ - --config samples="$TEST_PATH/mined_test_data.tsv" \ - outdir="$TEST_PATH/results_sra_downloads" \ - samples_out="$TEST_PATH/results_sra_downloads/mined_data.out.tsv" \ - log_dir="$ZARP_PATH/logs" \ - cluster_log_dir="$ZARP_PATH/logs/cluster_log"' - -FOLDER_CMD = 'FOLDER=$(date +%m%d_%H%M%S); \ - TEST_PATH=tests/cluster_tests; \ - mkdir $TEST_PATH/results_htsinfer/$FOLDER; \ - echo "$FOLDER" > $TEST_PATH/results_htsinfer/temp.txt' - -HTS_CMD = 'TEST_PATH=tests/cluster_tests; \ - FOLDER=`cat $TEST_PATH/results_htsinfer/temp.txt`; \ - conda run -n htsinfer htsinfer \ - --output-directory $TEST_PATH/results_htsinfer/$FOLDER \ - --cleanup-regime KEEP_ALL \ - --verbosity DEBUG \ - --read-layout-min-match-percentage ' + str(min_match) + '\ - --read-layout-min-frequency-ratio ' + str(min_freq) + '\ - --threads 7 \ - --records ' + str(records) - -result_data = {} -graph_data = {} - -# test files -PACKAGE_DIR = Path(__file__).resolve().parents[3] / "htsinfer" -ZARP_DIR = Path(__file__).resolve().parent / "zarp" -RESULTS_SRA_DIR = Path(__file__).resolve().parent / "results_sra_downloads" -RESULTS_HTS_DIR = Path(__file__).resolve().parent / "results_htsinfer" -MINED_DATA = Path(__file__).resolve().parent / "mined_test_data.tsv" - - -class TestDownloadSRASamples: - """Download the SRA Samples.""" - - def test_zarp_download(self): - sp.Popen(ZARP_CMD, shell=True, - executable='/bin/bash').communicate() - - -class TestHTSinfer: - """Test HTSinfer on downloaded samples.""" - - def test_htsinfer_se_pe(self): - with open(MINED_DATA, encoding="utf-8") as tsv_file: - source = pd.read_csv(tsv_file, sep='\t') - sp.Popen(FOLDER_CMD, shell=True, - executable='/bin/bash').communicate() - for index, row in source.iterrows(): - if row['layout'] == 'SE': - sample_se = str(RESULTS_SRA_DIR) + '/' + row['sample'] \ - + '/' + row['sample'] + '.fastq.gz' - sp.Popen(HTS_CMD + ' ' + sample_se + ' > ' - + str(RESULTS_HTS_DIR) + '/$FOLDER/' - + row['sample'] + '_result.json', - shell=True, - executable='/bin/bash').communicate() - else: - sample_1 = str(RESULTS_SRA_DIR) + '/' + row['sample'] \ - + '/' + row['sample'] + '_1.fastq.gz' - sample_2 = str(RESULTS_SRA_DIR) + '/' + row['sample'] \ - + '/' + row['sample'] + '_2.fastq.gz' - sp.Popen(HTS_CMD + ' ' + sample_1 + ' ' - + sample_2 + ' > ' + str(RESULTS_HTS_DIR) - + '/$FOLDER/' + row['sample'] + '_result.json', - shell=True, - executable='/bin/bash').communicate() - - def test_write_results(self): - with open(MINED_DATA, encoding="utf-8") as tsv_file: - source = pd.read_csv(tsv_file, sep='\t') - results_folder = ( - sp.check_output('TEST_P=tests/cluster_tests/results_htsinfer; \ - FOLDER=`cat $TEST_P/temp.txt`; \ - echo $FOLDER', shell=True, - executable='/bin/bash').strip().decode( - 'ascii')) - for index, row in source.iterrows(): - if row['layout'] == 'SE': - with open(str(RESULTS_HTS_DIR) + '/' + results_folder + '/' - + row['sample'] + '_result.json', - encoding="utf-8") as json_file: - data_model = Results(**json.load(json_file)) - result_data[index] = { - 'pred_org': - data_model.library_source.file_1.short_name, - 'pred_orient': - data_model.read_orientation.file_1, - 'pred_adapter': - data_model.read_layout.file_1.adapt_3, - 'pred_length_min_1': - data_model.library_stats.file_1.read_length.min, - 'pred_length_max_1': - data_model.library_stats.file_1.read_length.max} - result = source.fillna( - pd.DataFrame.from_dict(result_data, - orient='index')) - else: - with open(str(RESULTS_HTS_DIR) + '/' + results_folder + '/' - + row['sample'] + '_result.json', - encoding="utf-8") as json_file: - data_model = Results(**json.load(json_file)) - result_data[index] = { - 'pred_org': - data_model.library_source.file_1.short_name, - 'pred_orient': - data_model.read_orientation.file_1, - 'pred_adapter': - data_model.read_layout.file_1.adapt_3, - 'pred_length_min_1': - data_model.library_stats.file_1.read_length.min, - 'pred_length_max_1': - data_model.library_stats.file_1.read_length.max, - 'pred_length_min_2': - data_model.library_stats.file_2.read_length.min, - 'pred_length_max_2': - data_model.library_stats.file_2.read_length.max} - result = source.fillna( - pd.DataFrame.from_dict(result_data, - orient='index')) - - # Comparison of results - result['match_org'] = np.where( - result['org'] == result['pred_org'], True, False) - result['match_length'] = np.where( - result['pred_length_max_1'] == - result['length_max_1'], True, False) - result['match_adapter'] = result.apply( - lambda x: str(x.pred_adapter) in str(x.adapter), axis=1) - pd.DataFrame.to_csv( - result, 'tests/cluster_tests/results_htsinfer/' - + results_folder + '/mined_test_data_result.tsv', - sep='\t', index=False) - - with open(MINED_DATA, encoding="utf-8") as tsv_file: - source = pd.read_csv(tsv_file, sep='\t') - results_folder = ( - sp.check_output('TEST_P=tests/cluster_tests/results_htsinfer; \ - FOLDER=`cat $TEST_P/temp.txt`; \ - echo $FOLDER', shell=True, - executable='/bin/bash').strip().decode( - 'ascii')) - for index, row in source.iterrows(): - if row['layout'] == 'SE': - with open(str(RESULTS_HTS_DIR) + '/' + results_folder + '/' - + 'read_layout_' + row['sample'] + '.fastq.json', - encoding="utf-8") as json_file: - graph_model = json.load(json_file) - graph_data[index] = { - '1_adapt_1': graph_model['data'][0][0], - '1_percent_1': graph_model['data'][0][1], - '1_adapt_2': graph_model['data'][1][0], - '1_percent_2': graph_model['data'][1][1]} - else: - with open(str(RESULTS_HTS_DIR) + '/' + results_folder + '/' - + 'read_layout_' + row['sample'] - + '_1.fastq.json', - encoding="utf-8") as json_file_1, open( - str(RESULTS_HTS_DIR) + '/' + results_folder + '/' - + 'read_layout_' + row['sample'] + - '_2.fastq.json', - encoding="utf-8") as json_file_2: - graph_model_1 = json.load(json_file_1) - graph_model_2 = json.load(json_file_2) - graph_data[index] = { - '1_adapt_1': graph_model_1['data'][0][0], - '1_percent_1': graph_model_1['data'][0][1], - '1_adapt_2': graph_model_1['data'][1][0], - '1_percent_2': graph_model_1['data'][1][1], - '2_adapt_1': graph_model_2['data'][0][0], - '2_percent_1': graph_model_2['data'][0][1], - '2_adapt_2': graph_model_2['data'][1][0], - '2_percent_2': graph_model_2['data'][1][1]} - graph_result = pd.DataFrame.from_dict(graph_data, orient='index') - result_final = pd.concat([result, graph_result], - axis=1, join="inner") - # print(result_final) - pd.DataFrame.to_csv(result_final, - 'tests/cluster_tests/results_htsinfer/' - + results_folder + '/graph_result.tsv', - sep='\t', index=False) - - # Barplot - result_final.loc[result_final['pred_adapter'].notnull(), - 'pred_adapter'] = 1 - result_final['pred_adapter'] = ( - result_final['pred_adapter'].fillna(0)) - fig, axs = plt.subplots(1, figsize=[8, 8]) - ax = sns.barplot(x='pred_adapter', y='pred_adapter', - data=result_final, - estimator=lambda x: len(x) / - len(result_final) * 100) - ax.set(ylabel="Percent") - ax.bar_label(ax.containers[0], fmt='%.f%%') - ax.set(xticklabels=["Not identified", "Identified"]) - ax.set(title='Number and fraction of identified adapters\nID: ' - + str(results_folder) - + '\nNo. of records: ' + str(records) - + '\nRead layout min-match percentage: ' + str(min_match) - + '\nRead layout min frequency ratio: ' + str(min_freq)) - plt.savefig(str(RESULTS_HTS_DIR) + '/' + str(results_folder) - + '/1_Barplot_predicted_adapters', dpi=100) - - # Histogram of 1st predicted adapter percent #1 - # # Drop 0 percentages of both SE and PE reads - result_final_n = result_final.drop(result_final[ - (result_final['1_percent_1'] == 0) & - (result_final['2_percent_1'] == 0) | - (result_final['1_percent_1'] == 0) & - result_final['2_percent_1'].isna()].index) - all_percent = pd.concat([result_final_n['1_percent_1'], - result_final_n['2_percent_1']]) - all_percent = all_percent[all_percent != 0] - fig, axs = plt.subplots(1, figsize=[8, 8]) - sns.histplot(data=all_percent, binwidth=2).set( - title='Fraction of reads containing most ' - + 'prevalent adapter\nID: ' + str(results_folder) - + '\nNo. of records: ' + str(records) - + '\nRead layout min-match percentage: ' + str(min_match) - + '\nRead layout min frequency ratio: ' + str(min_freq)) - plt.xlim(0, 100) - plt.savefig(str(RESULTS_HTS_DIR) + '/' + str(results_folder) - + '/2_Hist_1st_pred_adapter_full.png', dpi=100) - - # Histogram of 1st predicted adapter percent #2 - fig, axs = plt.subplots(1, figsize=[8, 8]) - sns.histplot(data=all_percent, binwidth=0.2).set( - title='Fraction of reads containing most ' - + 'prevalent adapter\nID: ' - + str(results_folder) - + '\nNo. of records: ' + str(records) - + '\nRead layout min-match percentage: ' + str(min_match) - + '\nRead layout min frequency ratio: ' + str(min_freq)) - plt.xlim(0, 10) - plt.savefig(str(RESULTS_HTS_DIR) + '/' + str(results_folder) - + '/3_Hist_1st_pred_adapter_10.png', dpi=100) - - # Histogram of 1st vs 2nd predicted adapter ratio - result_final_n = result_final.drop(result_final[ - (result_final['1_percent_1'] == 0) & - (result_final['2_percent_1'] == 0) | - (result_final['1_percent_1'] == 0) & - result_final['2_percent_1'].isna()].index) - result_final_n['1_ratio'] = ( - result_final_n['1_percent_1'] / ( - result_final_n['1_percent_2'] + 0.01)) - result_final_n['2_ratio'] = ( - result_final_n['2_percent_1'] / ( - result_final_n['2_percent_2'] + 0.01)) - all_ratios = pd.concat( - [result_final_n['1_ratio'], result_final_n['2_ratio']]) - fig, axs = plt.subplots(1, figsize=[8, 8]) - sns.histplot(data=all_ratios).set( - title='Fraction of reads with most prevalent adapter ' - + 'vs. second most prevalent\nID: ' - + str(results_folder) - + '\nNo. of records: ' + str(records) - + '\nRead layout min-match percentage: ' + str(min_match) - + '\nRead layout min frequency ratio: ' + str(min_freq), - xscale="log") - plt.savefig(str(RESULTS_HTS_DIR) + '/' + str(results_folder) - + '/4_Hist_1st_vs_2nd_pred_adapter_ratio.png', dpi=100) diff --git a/tests/cluster_tests/mined_test_data.tsv b/tests/cluster_tests/mined_test_data.tsv deleted file mode 100644 index 8e21e345..00000000 --- a/tests/cluster_tests/mined_test_data.tsv +++ /dev/null @@ -1,59 +0,0 @@ -sample sra org orient adapter layout length_min_1 length_max_1 length_2 pred_org pred_orient pred_adapter pred_length_min_1 pred_length_max_1 pred_length_min_2 pred_length_max_2 -SRR350746 SRX099799 tgondii SE 100 -SRR582892 SRX192993 sratti SE 36 -SRR922417 SRX316167 obarthii SE 50 -SRR1103279 SRX422668 tkitauei AGATCGGAAGAGCACACGTCTGAACTCCAGTCA / AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT SE 101 -SRR1289523 SRX544837 dnovemcinctus ISR PE 148 -SRR1289524 SRX544837 dnovemcinctus ISR PE 148 -SRR1735385 SRX220387 cmilii ISF PE 152 -SRR2131239 SRX1121877 lanatina ISR PE 196 -SRR5223634 SRX2532815 lsdomestica CTGTCTCTTATA PE 240 -SRR5351980 SRX2648341 dmelanogaster CTGTCTCTTATACACATCT SE 51 -SRR6987423 SRX3926910 athaliana TGGAATTCTCGGGTGCCAAGG SE 17 30 -SRR7905052 SRX4741405 dnovemcinctus SR SE 75 -SRR7905053 SRX4741405 dnovemcinctus SR SE 75 -SRR8506668 SRX5310522 mmusculus CTGTCTCTTATACACATCT PE 50 -SRR8506669 SRX5310522 mmusculus CTGTCTCTTATACACATCT PE 50 -SRR8506670 SRX5310522 mmusculus CTGTCTCTTATACACATCT PE 50 -SRR8951910 SRX5731760 aqueenslandica SE 55 -SRR8951911 SRX5731760 aqueenslandica SE 55 -SRR9042958 SRX5819712 tpseudonana PE 300 -SRR9042959 SRX5819712 tpseudonana PE 300 -SRR9042960 SRX5819712 tpseudonana PE 300 -SRR9042961 SRX5819712 tpseudonana PE 300 -SRR9259170 SRX6029612 ddiscoideum SE 51 -SRR9289060 SRX6058708 pchabaudi ISR PE 302 -SRR9289061 SRX6058708 pchabaudi ISR PE 302 -SRR9289073 SRX6058720 pchabaudi ISR PE 302 -SRR9289074 SRX6058720 pchabaudi ISR PE 302 -SRR9958771 SRX6706323 pmarinus SE 50 -SRR10409017 SRX7107235 avaga PE 300 -SRR10409020 SRX7107238 avaga SE 135 -SRR10446403 SRX7141114 tspiralis PE 148 -SRR10446726 SRX7141398 acarolinensis TGGAATTCTCGGGTGCCAAGG SE 33 -SRR12145895 SRX8666619 ddiscoideum PE 300 -SRR12442715 SRX8937362 dnovemcinctus SR SE 101 -SRR12735068 SRX9208006 ptricornotum ISR AGATCGGAAGAGCACACGTCTGAACTCCAGTCA / AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT PE 150 -SRR12867725 SRX9333864 ssclerotiorum SE 125 -SRR13754963 SRX10141734 kpastoris PE 300 -SRR13922067 SRX10301515 hsapiens AGATCGGAAGAGCACACGTCTGAACTCCAGTCA / AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT SE 50 -SRR14102454 SRX10474460 creinhardtii PE 151 76 -SRR14597451 SRX10944439 cintestinalis SE 101 -SRR14827012 SRX11156629 avaga PE 178 -SRR16596798 SRX12798176 xtropicalis PE 202 -SRR16892632 SRX13085341 celegans SF SE 75 -SRR16892633 SRX13085341 celegans SF SE 75 -SRR16892634 SRX13085341 celegans SF SE 75 -SRR16892635 SRX13085341 celegans SF SE 75 -SRR16936848 SRX13128815 smansoni SR SE 85 -SRR16948910 SRX13140796 ppatens ISR PE 151 -SRR16955213 SRX13146816 obimaculoides PE 151 -SRR17248189 SRX13427171 smansoni SR SE 75 -SRR17334639 SRX13510193 cintestinalis PE 151 -SRR19548813 SRX15600951 spurpuratus PE 250 -SRR19548823 SRX15600941 spurpuratus PE 152 -SRR19548828 SRX15600936 smaritima PE 150 -SRR19548848 SRX15600916 cmilii PE 250 -SRR19548868 SRX15600896 blanceolatum PE 250 -SRR21639732 SRX17639978 hsapiens ISF GATCGGAAGAGCACACGTCTGAACTCCAGTCAC SE 26 98 -SRR21639733 SRX17639978 hsapiens ISF GATCGGAAGAGCACACGTCTGAACTCCAGTCAC SE 26 98 diff --git a/tests/test_get_library_type.py b/tests/test_get_library_type.py index 6c1bc657..5d900df5 100644 --- a/tests/test_get_library_type.py +++ b/tests/test_get_library_type.py @@ -213,16 +213,30 @@ def test_update_relationship_type_not_mates(self, tmpdir): concordant = 0 read_counter = 20 - # Call the _update_relationship_type method test_instance._update_relationship_type(concordant, read_counter) assert ( test_instance.results.relationship == StatesTypeRelationship.not_mates ) + + # Simulate a scenario where ratio is above the cutoff + concordant = 20 + read_counter = 20 + + test_instance._update_relationship_type(concordant, read_counter) + assert ( - test_instance.mapping.library_type.relationship == - StatesTypeRelationship.not_available + test_instance.results.relationship == + StatesTypeRelationship.split_mates + ) + assert ( + test_instance.results.file_1 == + StatesType.first_mate_assumed + ) + assert ( + test_instance.results.file_2 == + StatesType.second_mate_assumed ) def test_evaluate_mate_relationship_not_determined(self, tmpdir): diff --git a/tests/test_htsinfer.py b/tests/test_htsinfer.py index ea6e65f7..3d2bf871 100644 --- a/tests/test_htsinfer.py +++ b/tests/test_htsinfer.py @@ -400,11 +400,6 @@ def test_print(self, capsys): } } }, - "library_type": { - "file_1": null, - "file_2": null, - "relationship": null - }, "library_source": { "file_1": { "short_name": null, @@ -415,6 +410,11 @@ def test_print(self, capsys): "taxon_id": null } }, + "library_type": { + "file_1": null, + "file_2": null, + "relationship": null + }, "read_orientation": { "file_1": null, "file_2": null,