Scripts used for the analysis of Fusarium genomes Note - all this work was performed in the directory: /home/groups/harrisonlab/project_files/fusarium
The following is a summary of the work presented in this Readme.
The following processes were applied to Fusarium genomes prior to analysis: Data qc Genome assembly Repeatmasking Gene prediction Functional annotation
Analyses performed on these genomes involved BLAST searching for:
ls contigs were identified using: Alignment of raw reads to assembled genomes Assembly of remaining reads
#Building of directory structure
RawDatDir=/home/groups/harrisonlab/raw_data/raw_seq/fusarium/HAPI_seq_3/
ProjectDir=/home/groups/harrisonlab/project_files/fusarium
mkdir -p $ProjectDir/raw_dna/paired/F.proliferatum/A8/F
mkdir -p $ProjectDir/raw_dna/paired/F.proliferatum/A8/R
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_narcissi/N139/F
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_narcissi/N139/R
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/PG3/F
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/PG3/R
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/PG18/F
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/PG18/R
ProjectDir=/home/groups/harrisonlab/project_files/fusarium
RawDatDir=/home/groups/harrisonlab/raw_data/raw_seq/raw_reads/160115_M01678_0041_000000000-AEMMK
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/FOP1/F
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/FOP1/R
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/FOP5/F
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/FOP5/R
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/L5/F
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/L5/R
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/HB6/F
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/HB6/R
ProjectDir=/home/groups/harrisonlab/project_files/fusarium
RawDatDir=/home/groups/harrisonlab/raw_data/raw_seq/raw_reads/160119_M016878_0042-AGDN7
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/A1-2/F
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/A1-2/R
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/A13/F
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/A13/R
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/CB3/F
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/CB3/R
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/HB6/F
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/HB6/R
mkdir -p $ProjectDir/raw_dna/paired/F.avenaceum/PG8/F
mkdir -p $ProjectDir/raw_dna/paired/F.avenaceum/PG8/R
ProjectDir=/home/groups/harrisonlab/project_files/fusarium
RawDatDir=/home/groups/harrisonlab/raw_data/raw_seq/raw_reads/160401_M004465_0007-AGKF2
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/FOP2/F
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/FOP2/R
ProjectDir=/home/groups/harrisonlab/project_files/fusarium
RawDatDir=/home/groups/harrisonlab/raw_data/raw_seq/raw_reads/160725_M04465_0020-AP1N0
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/AF064/F
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/AF064/R
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/CH5-2/F
mkdir -p $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/CH5-2/R
Sequence data was moved into the appropriate directories
RawDatDir=/home/groups/harrisonlab/raw_data/raw_seq/fusarium/HAPI_seq_3
ProjectDir=/home/groups/harrisonlab/project_files/fusarium
cp $RawDatDir/Fproliferatum_S1_L001_R1_001.fastq.gz $ProjectDir/raw_dna/paired/F.proliferatum/A8/F/.
cp $RawDatDir/Fproliferatum_S1_L001_R2_001.fastq.gz $ProjectDir/raw_dna/paired/F.proliferatum/A8/R/.
cp $RawDatDir/FoxysporumN139_S2_L001_R1_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_narcissi/N139/F/.
cp $RawDatDir/FoxysporumN139_S2_L001_R2_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_narcissi/N139/R/.
cp $RawDatDir/FoxysporumPG3_S3_L001_R1_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/PG3/F/.
cp $RawDatDir/FoxysporumPG3_S3_L001_R2_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/PG3/R/.
cp $RawDatDir/FoxysporumPG18_S4_L001_R1_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/PG18/F/.
cp $RawDatDir/FoxysporumPG18_S4_L001_R2_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/PG18/R/.
RawDatDir=/home/groups/harrisonlab/raw_data/raw_seq/raw_reads/160115_M01678_0041_000000000-AEMMK
ProjectDir=/home/groups/harrisonlab/project_files/fusarium
cp $RawDatDir/FOP1_S1_L001_R1_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/FOP1/F/.
cp $RawDatDir/FOP1_S1_L001_R2_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/FOP1/R/.
cp $RawDatDir/FOP5_S2_L001_R1_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/FOP5/F/.
cp $RawDatDir/FOP5_S2_L001_R2_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/FOP5/R/.
cp $RawDatDir/L5_S3_L001_R1_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/L5/F/.
cp $RawDatDir/L5_S3_L001_R2_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/L5/R/.
cp $RawDatDir/HB6_S4_L001_R1_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/HB6/F/.
cp $RawDatDir/HB6_S4_L001_R2_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/HB6/R/.
RawDatDir=/home/groups/harrisonlab/raw_data/raw_seq/raw_reads/160119_M016878_0042-AGDN7
ProjectDir=/home/groups/harrisonlab/project_files/fusarium
cp $RawDatDir/A12_S1_L001_R1_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/A1-2/F/.
cp $RawDatDir/A12_S1_L001_R2_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/A1-2/R/.
cp $RawDatDir/A13_S2_L001_R1_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/A13/F/.
cp $RawDatDir/A13_S2_L001_R2_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/A13/R/.
cp $RawDatDir/CB3_S3_L001_R1_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/CB3/F/.
cp $RawDatDir/CB3_S3_L001_R2_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/CB3/R/.
cp $RawDatDir/HB6_S5_L001_R1_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/HB6/F/.
cp $RawDatDir/HB6_S5_L001_R2_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/HB6/R/.
cp $RawDatDir/PG8_S4_L001_R1_001.fastq.gz $ProjectDir/raw_dna/paired/F.avenaceum/PG8/F/.
cp $RawDatDir/PG8_S4_L001_R2_001.fastq.gz $ProjectDir/raw_dna/paired/F.avenaceum/PG8/R/.
RawDatDir=/home/groups/harrisonlab/raw_data/raw_seq/raw_reads/160401_M004465_0007-AGKF2
ProjectDir=/home/groups/harrisonlab/project_files/fusarium
cp $RawDatDir/FOP2_S1_L001_R1_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/FOP2/F/.
cp $RawDatDir/FOP2_S1_L001_R2_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_pisi/FOP2/R/.
ProjectDir=/home/groups/harrisonlab/project_files/fusarium
RawDatDir=/home/groups/harrisonlab/raw_data/raw_seq/raw_reads/160725_M04465_0020-AP1N0
cp $RawDatDir/AF064_S3_L001_R1_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/AF064/F/.
cp $RawDatDir/AF064_S3_L001_R2_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/AF064/R/.
cp $RawDatDir/CH5-2_S2_L001_R1_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/CH5-2/F/.
cp $RawDatDir/CH5-2_S2_L001_R2_001.fastq.gz $ProjectDir/raw_dna/paired/F.oxysporum_fsp_cepae/CH5-2/R/.
This process was repeated for RNAseq data:
RawDatDir=/home/groups/harrisonlab/raw_data/raw_seq/fusarium/rna_seq
ProjectDir=/home/groups/harrisonlab/project_files/fusarium
mkdir -p $ProjectDir/raw_rna/paired/F.oxysporum_fsp_cepae/Fus2_PDB/F
mkdir -p $ProjectDir/raw_rna/paired/F.oxysporum_fsp_cepae/Fus2_PDB/R
mkdir -p $ProjectDir/raw_rna/paired/F.oxysporum_fsp_cepae/Fus2_CzapekDox/F
mkdir -p $ProjectDir/raw_rna/paired/F.oxysporum_fsp_cepae/Fus2_CzapekDox/R
mkdir -p $ProjectDir/raw_rna/paired/F.oxysporum_fsp_cepae/Fus2_GlucosePeptone/F
mkdir -p $ProjectDir/raw_rna/paired/F.oxysporum_fsp_cepae/Fus2_GlucosePeptone/R
mkdir -p $ProjectDir/raw_rna/paired/F.oxysporum_fsp_cepae/Fus2_PDA/F
mkdir -p $ProjectDir/raw_rna/paired/F.oxysporum_fsp_cepae/Fus2_PDA/R
cp $RawDatDir/4_S1_L001_R1_001.fastq.gz $ProjectDir/raw_rna/paired/F.oxysporum_fsp_cepae/Fus2_PDB/F/.
cp $RawDatDir/4_S1_L001_R2_001.fastq.gz $ProjectDir/raw_rna/paired/F.oxysporum_fsp_cepae/Fus2_PDB/R/.
cp $RawDatDir/6_S2_L001_R1_001.fastq.gz $ProjectDir/raw_rna/paired/F.oxysporum_fsp_cepae/Fus2_CzapekDox/F/.
cp $RawDatDir/6_S2_L001_R2_001.fastq.gz $ProjectDir/raw_rna/paired/F.oxysporum_fsp_cepae/Fus2_CzapekDox/R/.
cp $RawDatDir/7_S3_L001_R1_001.fastq.gz $ProjectDir/raw_rna/paired/F.oxysporum_fsp_cepae/Fus2_GlucosePeptone/F/.
cp $RawDatDir/7_S3_L001_R2_001.fastq.gz $ProjectDir/raw_rna/paired/F.oxysporum_fsp_cepae/Fus2_GlucosePeptone/R/.
cp $RawDatDir/9_S4_L001_R1_001.fastq.gz $ProjectDir/raw_rna/paired/F.oxysporum_fsp_cepae/Fus2_PDA/F/.
cp $RawDatDir/9_S4_L001_R2_001.fastq.gz $ProjectDir/raw_rna/paired/F.oxysporum_fsp_cepae/Fus2_PDA/R/.
#Data qc
programs: fastqc fastq-mcf kmc
Data quality was visualised using fastqc:
for RawData in $(ls raw_dna/paired/*/*/*/*.fastq.gz); do
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/dna_qc
echo $RawData;
qsub $ProgDir/run_fastqc.sh $RawData
done
Trimming was performed on data to trim adapters from sequences and remove poor quality data. This was done with fastq-mcf
Firstly, those strains with more than one run were identified:
for Strain in $(ls -d raw_dna/paired/*/*); do
NumReads=$(ls $Strain/F/*.gz | wc -l);
if [ $NumReads -gt 1 ]; then
echo "$Strain";
echo "$NumReads";
fi;
done
raw_dna/paired/F.oxysporum_fsp_cepae/Fus2
2
raw_dna/paired/F.oxysporum_fsp_cepae/HB6
2
Trimming was first performed on all strains that had a single run of data:
for StrainPath in $(ls -d raw_dna/paired/*/* | grep -v -e 'Fus2' -e 'HB6'); do
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/rna_qc
IlluminaAdapters=/home/armita/git_repos/emr_repos/tools/seq_tools/ncbi_adapters.fa
ReadsF=$(ls $StrainPath/F/*.fastq*)
ReadsR=$(ls $StrainPath/R/*.fastq*)
echo $ReadsF
echo $ReadsR
qsub $ProgDir/rna_qc_fastq-mcf.sh $ReadsF $ReadsR $IlluminaAdapters DNA
done
Trimming was then performed for strains with multiple runs of data
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/rna_qc
IlluminaAdapters=/home/armita/git_repos/emr_repos/tools/seq_tools/ncbi_adapters.fa
echo "Fus2"
StrainPath=raw_dna/paired/F.oxysporum_fsp_cepae/Fus2
ReadsF=$(ls $StrainPath/F/s_6_1_sequence.fastq.gz)
ReadsR=$(ls $StrainPath/R/s_6_2_sequence.fastq.gz)
qsub $ProgDir/rna_qc_fastq-mcf.sh $ReadsF $ReadsR $IlluminaAdapters DNA
StrainPath=raw_dna/paired/F.oxysporum_fsp_cepae/Fus2
ReadsF=$(ls $StrainPath/F/FUS2_S2_L001_R1_001.fastq.gz)
ReadsR=$(ls $StrainPath/R/FUS2_S2_L001_R2_001.fastq.gz)
qsub $ProgDir/rna_qc_fastq-mcf.sh $ReadsF $ReadsR $IlluminaAdapters DNA
echo "HB6"
StrainPath=raw_dna/paired/F.oxysporum_fsp_cepae/HB6
ReadsF=$(ls $StrainPath/F/HB6_S4_L001_R1_001.fastq.gz)
ReadsR=$(ls $StrainPath/R/HB6_S4_L001_R2_001.fastq.gz)
qsub $ProgDir/rna_qc_fastq-mcf.sh $ReadsF $ReadsR $IlluminaAdapters DNA
StrainPath=raw_dna/paired/F.oxysporum_fsp_cepae/HB6
ReadsF=$(ls $StrainPath/F/HB6_S5_L001_R1_001.fastq.gz)
ReadsR=$(ls $StrainPath/R/HB6_S5_L001_R2_001.fastq.gz)
qsub $ProgDir/rna_qc_fastq-mcf.sh $ReadsF $ReadsR $IlluminaAdapters DNA
Data quality was visualised once again following trimming:
for RawData in $(ls qc_dna/paired/*/*/*/*.fq.gz | grep 'FOP2'); do
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/dna_qc
echo $RawData;
qsub $ProgDir/run_fastqc.sh $RawData
done
kmer counting was performed using kmc This allowed estimation of sequencing depth and total genome size
This was performed for strains with single runs of data
for TrimPath in $(ls -d raw_dna/paired/*/* | grep -v -e 'Fus2' -e 'HB6' | grep 'FOP2'); do
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/dna_qc
TrimF=$(ls $TrimPath/F/*.fastq*)
TrimR=$(ls $TrimPath/R/*.fastq*)
echo $TrimF
echo $TrimR
qsub $ProgDir/kmc_kmer_counting.sh $TrimF $TrimR
done
and for strains with muiltiple runs of data:
for TrimPath in $(ls -d raw_dna/paired/*/Fus2); do
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/dna_qc
TrimF1=$(ls $TrimPath/F/s_6_1_sequence.fastq.gz)
TrimR1=$(ls $TrimPath/R/s_6_2_sequence.fastq.gz)
echo $TrimF1
echo $TrimR1
TrimF2=$(ls $TrimPath/F/FUS2_S2_L001_R1_001.fastq.gz)
TrimR2=$(ls $TrimPath/R/FUS2_S2_L001_R2_001.fastq.gz)
echo $TrimF2
echo $TrimR2
qsub $ProgDir/kmc_kmer_counting.sh $TrimF1 $TrimR1 $TrimF2 $TrimR2
done
for TrimPath in $(ls -d raw_dna/paired/*/HB6); do
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/dna_qc
TrimF1=$(ls $TrimPath/F/HB6_S4_L001_R1_001.fastq.gz)
TrimR1=$(ls $TrimPath/R/HB6_S4_L001_R2_001.fastq.gz)
echo $TrimF1
echo $TrimR1
TrimF2=$(ls $TrimPath/F/HB6_S5_L001_R1_001.fastq.gz)
TrimR2=$(ls $TrimPath/R/HB6_S5_L001_R2_001.fastq.gz)
echo $TrimF2
echo $TrimR2
qsub $ProgDir/kmc_kmer_counting.sh $TrimF1 $TrimR1 $TrimF2 $TrimR2
done
mode kmer abundance prior to error correction was reported using the following commands:
for File in $(ls qc_dna/kmc/*/*/*_true_kmer_summary.txt); do
basename $File;
tail -n3 $File | head -n1 ;
done
PG8_true_kmer_summary.txt
The mode kmer abundance is: 53 < fail
125_true_kmer_summary.txt
The mode kmer abundance is: 46 < pass
55_true_kmer_summary.txt
The mode kmer abundance is: 29 < pass
A1-2_true_kmer_summary.txt
The mode kmer abundance is: 16 < pass
A13_true_kmer_summary.txt
The mode kmer abundance is: 22 < pass
A23_true_kmer_summary.txt
The mode kmer abundance is: 33 < pass
A28_true_kmer_summary.txt
The mode kmer abundance is: 36 < pass
CB3_true_kmer_summary.txt
The mode kmer abundance is: 21 < pass
D2_true_kmer_summary.txt
The mode kmer abundance is: 11 < pass
Fus2_true_kmer_summary.txt
The mode kmer abundance is: 109 < 2lib
HB17_true_kmer_summary.txt
The mode kmer abundance is: 27 < pass
HB6_true_kmer_summary.txt
The mode kmer abundance is: 91 < 2 lib
PG_true_kmer_summary.txt
The mode kmer abundance is: 58 < pass
N139_true_kmer_summary.txt
The mode kmer abundance is: 26 < pass
FOP1_true_kmer_summary.txt
The mode kmer abundance is: 32 <- fail
FOP5_true_kmer_summary.txt
The mode kmer abundance is: 62 <- fail
L5_true_kmer_summary.txt
The mode kmer abundance is: 35 <- pass
PG18_true_kmer_summary.txt
The mode kmer abundance is: 24 <- pass
PG3_true_kmer_summary.txt
The mode kmer abundance is: 28 <- pass
A8_true_kmer_summary.txt
The mode kmer abundance is: 21 <- pass
For Miseq data:
for RawData in $(ls qc_dna/paired/*/*/*/*.fq.gz | grep -e 'N139'); do
echo $RawData;
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/dna_qc;
qsub $ProgDir/run_fastqc.sh $RawData;
GenomeSz=57
OutDir=$(dirname $RawData)
qsub $ProgDir/sub_count_nuc.sh $GenomeSz $RawData $OutDir
done
for StrainDir in $(ls -d qc_dna/paired/*/* | grep -w -e 'N139'); do
Strain=$(basename $StrainDir)
printf "$Strain\t"
for File in $(ls qc_dna/paired/*/"$Strain"/*/*.txt); do
echo $(basename $File);
cat $File | tail -n1 | rev | cut -f2 -d ' ' | rev;
done | grep -v '.txt' | awk '{ SUM += $1} END { print SUM }'
done
#Assembly
Assembly was performed with:
- Spades
for StrainPath in $(ls -d qc_dna/paired/*/* | grep -v -e 'Fus2' -e 'HB6' | grep 'FOP2'); do
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/assemblers/spades
Strain=$(echo $StrainPath | rev | cut -f1 -d '/' | rev)
Organism=$(echo $StrainPath | rev | cut -f2 -d '/' | rev)
F_Read=$(ls $StrainPath/F/*.fq.gz)
R_Read=$(ls $StrainPath/R/*.fq.gz)
OutDir=assembly/spades/$Organism/$Strain
Jobs=$(qstat | grep 'submit_SPA' | grep 'qw' | wc -l)
while [ $Jobs -gt 1 ]; do
sleep 5m
printf "."
Jobs=$(qstat | grep 'submit_SPA' | grep 'qw' | wc -l)
done
printf "\n"
echo $F_Read
echo $R_Read
qsub $ProgDir/submit_SPAdes.sh $F_Read $R_Read $OutDir correct 10
done
Assembly for PG8, FOP1 and FOP5 failed due to a lack of memory, as such the assembly was resubmitted with more RAM.
for StrainPath in $(ls -d qc_dna/paired/*/* | grep -e 'FOP5' -e 'PG8' -e 'FOP1'); do
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/assemblers/spades
Strain=$(echo $StrainPath | rev | cut -f1 -d '/' | rev)
Organism=$(echo $StrainPath | rev | cut -f2 -d '/' | rev)
F_Read=$(ls $StrainPath/F/*.fq.gz)
R_Read=$(ls $StrainPath/R/*.fq.gz)
OutDir=assembly/spades/$Organism/$Strain
echo $F_Read
echo $R_Read
qsub $ProgDir/submit_SPAdes_HiMem.sh $F_Read $R_Read $OutDir correct 10
done
Assemblies were submitted for genomes with data from multiple sequencing runs:
for StrainPath in $(ls -d qc_dna/paired/F.*/HB6); do
echo $StrainPath
ProgDir=/home/ransoe/git_repos/tools/seq_tools/assemblers/spades/multiple_libraries
Strain=$(echo $StrainPath | rev | cut -f1 -d '/' | rev)
Organism=$(echo $StrainPath | rev | cut -f2 -d '/' | rev)
echo $Strain
echo $Organism
TrimF1_Read=$(ls $StrainPath/F/HB6_S4_L001_R1_001_trim.fq.gz);
TrimR1_Read=$(ls $StrainPath/R/HB6_S4_L001_R2_001_trim.fq.gz);
TrimF2_Read=$(ls $StrainPath/F/HB6_S5_L001_R1_001_trim.fq.gz);
TrimR2_Read=$(ls $StrainPath/R/HB6_S5_L001_R2_001_trim.fq.gz);
echo $TrimF1_Read
echo $TrimR1_Read
echo $TrimF2_Read
echo $TrimR2_Read
OutDir=assembly/spades/$Organism/$Strain
qsub $ProgDir/subSpades_2lib.sh $TrimF1_Read $TrimR1_Read $TrimF2_Read $TrimR2_Read $OutDir correct 10
done
for StrainPath in $(ls -d qc_dna/paired/F.*/Fus2); do
echo $StrainPath
ProgDir=/home/ransoe/git_repos/tools/seq_tools/assemblers/spades/multiple_libraries
Strain=$(echo $StrainPath | rev | cut -f1 -d '/' | rev)
Organism=$(echo $StrainPath | rev | cut -f2 -d '/' | rev)
echo $Strain
echo $Organism
TrimF1_Read=$(ls $StrainPath/F/s_6_1_sequence_trim.fq.gz);
TrimR1_Read=$(ls $StrainPath/R/s_6_2_sequence_trim.fq.gz);
TrimF2_Read=$(ls $StrainPath/F/FUS2_S2_L001_R1_001_trim.fq.gz);
TrimR2_Read=$(ls $StrainPath/R/FUS2_S2_L001_R2_001_trim.fq.gz);
echo $TrimF1_Read
echo $TrimR1_Read
echo $TrimF2_Read
echo $TrimR2_Read
OutDir=assembly/spades/$Organism/$Strain
qsub $ProgDir/subSpades_2lib_HiMem.sh $TrimF1_Read $TrimR1_Read $TrimF2_Read $TrimR2_Read $OutDir correct 10
done
Quast
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/quast
for Assembly in $(ls assembly/spades/*/*/filtered_contigs/contigs_min_500bp.fasta | grep 'FOP2'); do
# for Assembly in $(ls assembly/spades/*/*/*/contigs_min_500bp_renamed.fasta | grep 'Fus2_edited_v2'); do
# for Assembly in $(ls assembly/external_group/F.oxysporum/fo47/broad/fusarium_oxysporum_fo47_1_supercontigs.fasta); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
OutDir=assembly/spades/$Organism/$Strain/filtered_contigs
qsub $ProgDir/sub_quast.sh $Assembly $OutDir
done
Contigs were renamed in accordance with ncbi recomendations.
ProgDir=~/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/remove_contaminants
touch tmp.csv
for Assembly in $(ls assembly/spades/*/*/filtered_contigs/contigs_min_500bp.fasta); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
OutDir=assembly/spades/$Organism/$Strain/filtered_contigs
$ProgDir/remove_contaminants.py --inp $Assembly --out $OutDir/contigs_min_500bp_renamed.fasta --coord_file tmp.csv
done
rm tmp.csv
A Bioproject and Biosample was made with NCBI genbank for submission of genomes. Following the creation of these submissions, the .fasta assembly was uploaded through the submission portal. A note was provided requesting that the assembly be run through the contamination screen to aid a more detailed resubmission in future. The returned FCSreport.txt was downloaded from the NCBI webportal and used to correct the assembly to NCBI standards.
NCBI reports (FCSreport.txt) were manually downloaded to the following loactions:
for Assembly in $(ls assembly/spades/*/*/*/contigs_min_500bp_renamed.fasta | grep -v 'ncbi_edits' | grep -w -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
NCBI_report_dir=genome_submission/$Organism/$Strain/initial_submission
mkdir -p $NCBI_report_dir
done
These downloaded files were used to correct assemblies:
for Assembly in $(ls assembly/spades/*/*/filtered_contigs/contigs_min_500bp_renamed.fasta | grep -w -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139' | grep 'N139'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
NCBI_report=$(ls genome_submission/$Organism/$Strain/initial_submission/FCSreport.txt)
OutDir=assembly/spades/$Organism/$Strain/ncbi_edits
mkdir -p $OutDir
ProgDir=~/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/remove_contaminants
$ProgDir/remove_contaminants.py --keep_mitochondria --inp $Assembly --out $OutDir/contigs_min_500bp_renamed.fasta --coord_file $NCBI_report > $OutDir/log.txt
done
A second submission of assemblies to ncbi gave a second round of edits to make. These reports (FCSreport.txt) were stored in the following location:
for Assembly in $(ls assembly/spades/*/*/*/contigs_min_500bp_renamed.fasta | grep -w 'ncbi_edits' | grep -w -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
NCBI_report_dir=genome_submission/$Organism/"$Strain"_ncbi/submission
mkdir -p $NCBI_report_dir
done
These downloaded files were used to correct assemblies:
for Assembly in $(ls assembly/spades/*/*/ncbi_edits/contigs_min_500bp_renamed.fasta| grep 'ncbi_edits' | grep -w -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139' | grep -e 'A8' -e 'PG' -e 'A23'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
NCBI_report=$(ls genome_submission/$Organism/"$Strain"_ncbi/submission/FCSreport.txt)
OutDir=assembly/spades/$Organism/$Strain/ncbi_edits2
mkdir -p $OutDir
touch $NCBI_report
ProgDir=~/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/remove_contaminants
$ProgDir/remove_contaminants.py --keep_mitochondria --inp $Assembly --out $OutDir/contigs_min_500bp_renamed.fasta --coord_file $NCBI_report > $OutDir/log.txt
done
Quast was used to collect details on these assemblies
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/quast
for Assembly in $(ls assembly/spades/*/*/ncbi_edits2/contigs_min_500bp_renamed.fasta | grep -w -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139' | grep -e 'A8' -e 'PG' -e 'A23'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
OutDir=$(dirname $Assembly)
qsub $ProgDir/sub_quast.sh $Assembly $OutDir
done
Repeat masking was performed and used the following programs: Repeatmasker Repeatmodeler
The best assemblies were used to perform repeatmasking
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/repeat_masking
for BestAss in $(ls assembly/spades/*/*/ncbi_edits2/contigs_min_500bp_renamed.fasta | grep -w -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139' | grep -e 'A8' -e 'PG' -e 'A23'); do
# for BestAss in $(ls assembly/external_group/F.oxysporum/fo47/broad/fusarium_oxysporum_fo47_1_supercontigs.fasta); do
Strain=$(echo $BestAss | rev | cut -f3 -d '/' | rev)
Organism=$(echo $BestAss | rev | cut -f4 -d '/' | rev)
OutDir=repeat_masked/$Organism/"$Strain"_ncbi/ncbi_submission
qsub $ProgDir/rep_modeling.sh $BestAss $OutDir
qsub $ProgDir/transposonPSI.sh $BestAss $OutDir
done
The published non-pathogen genome for isolate FO47 was also repeatmasked as this isolate was also used in experimental work.
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/repeat_masking
Fo47Ass=assembly/external_group/F.oxysporum/fo47/broad/fusarium_oxysporum_fo47_1_supercontigs.fasta
for BestAss in $(ls $Fo47Ass); do
qsub $ProgDir/rep_modeling.sh $BestAss
qsub $ProgDir/transposonPSI.sh $BestAss
done
Fo47 masked contigs were noted to still contain additional text following spaces. This disrupted downstream programs.
for File in $(ls repeat_masked/F.oxysporum/fo47/*/fo47_contigs_*masked.fa); do
sed -i 's/ of Fusarium oxysporum Fo47//g' $File
done
The TransposonPSI masked bases were used to mask additional bases from the repeatmasker / repeatmodeller softmasked and hardmasked files.
# Jobs=$(qstat | grep 'rep_modeli' | grep -w 'r' | wc -l)
# while [ $Jobs -gt 0 ]; do
# sleep 10
# printf "."
# Jobs=$(qstat | grep 'rep_modeli' | grep -w 'r' | wc -l)
# done
# printf "\n"
for File in $(ls repeat_masked/*/*/*/*_contigs_softmasked.fa | grep 'ncbi' | grep -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139' | grep -v 'old' | grep 'N139'); do
OutDir=$(dirname $File)
TPSI=$(ls $OutDir/*_contigs_unmasked.fa.TPSI.allHits.chains.gff3)
OutFile=$(echo $File | sed 's/_contigs_softmasked.fa/_contigs_softmasked_repeatmasker_TPSI_appended.fa/g')
echo "$OutFile"
bedtools maskfasta -soft -fi $File -bed $TPSI -fo $OutFile
echo "Number of masked bases:"
cat $OutFile | grep -v '>' | tr -d '\n' | awk '{print $0, gsub("[a-z]", ".")}' | cut -f2 -d ' '
done
# The number of N's in hardmasked sequence are not counted as some may be present within the assembly and were therefore not repeatmasked.
for File in $(ls repeat_masked/*/*/*/*_contigs_hardmasked.fa | grep 'ncbi' | grep -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139' | grep -v 'old' | grep 'N139'); do
OutDir=$(dirname $File)
TPSI=$(ls $OutDir/*_contigs_unmasked.fa.TPSI.allHits.chains.gff3)
OutFile=$(echo $File | sed 's/_contigs_hardmasked.fa/_contigs_hardmasked_repeatmasker_TPSI_appended.fa/g')
echo "$OutFile"
bedtools maskfasta -fi $File -bed $TPSI -fo $OutFile
# echo "Number of masked bases:"
# cat $OutFile | grep -v '>' | tr -d '\n' | awk '{print $0, gsub("[N]", ".")}' | cut -f2 -d ' '
done
repeat_masked/F.oxysporum_fsp_cepae/125_ncbi/ncbi_submission/125_contigs_softmasked_repeatmasker_TPSI_appended.fa
Number of masked bases:
3781475
repeat_masked/F.oxysporum_fsp_cepae/A13_ncbi/ncbi_submission/A13_contigs_softmasked_repeatmasker_TPSI_appended.fa
Number of masked bases:
4946858
repeat_masked/F.oxysporum_fsp_cepae/A23_ncbi/ncbi_submission/A23_contigs_softmasked_repeatmasker_TPSI_appended.fa
Number of masked bases:
3553622
repeat_masked/F.oxysporum_fsp_cepae/A28_ncbi/ncbi_submission/A28_contigs_softmasked_repeatmasker_TPSI_appended.fa
Number of masked bases:
4429343
repeat_masked/F.oxysporum_fsp_cepae/CB3_ncbi/ncbi_submission/CB3_contigs_softmasked_repeatmasker_TPSI_appended.fa
Number of masked bases:
2863123
repeat_masked/F.oxysporum_fsp_cepae/PG_ncbi/ncbi_submission/PG_contigs_softmasked_repeatmasker_TPSI_appended.fa
Number of masked bases:
3070545
repeat_masked/F.proliferatum/A8_ncbi/ncbi_submission/A8_contigs_softmasked_repeatmasker_TPSI_appended.fa
Number of masked bases:
1209471
repeat_masked/F.oxysporum_fsp_narcissi/N139_ncbi/ncbi_submission/N139_contigs_softmasked_repeatmasker_TPSI_appended.fa
Number of masked bases:
5689747
Gene prediction followed three steps: Pre-gene prediction - Quality of genome assemblies were assessed using Cegma to see how many core eukaryotic genes can be identified. Gene model training - Gene models were trained using assembled RNAseq data as part of the Braker1 pipeline Gene prediction - Gene models were used to predict genes in genomes as part of the the Braker1 pipeline. This used RNAseq data as hints for gene models.
Quality of genome assemblies was assessed by looking for the gene space in the assemblies.
ProgDir=/home/armita/git_repos/emr_repos/tools/gene_prediction/cegma
cd /home/groups/harrisonlab/project_files/fusarium
for Genome in $(ls repeat_masked/F.*/*/*/*_contigs_unmasked.fa | grep -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139' | grep 'ncbi' | grep -v 'old'); do
#for Genome in $(ls repeat_masked/F.*/*/*/*_contigs_unmasked.fa | grep -w 'fo47'); do
echo $Genome;
qsub $ProgDir/sub_cegma.sh $Genome dna;
done
Outputs were summarised using the commands:
for File in $(ls gene_pred/cegma/F*/*/*_dna_cegma.completeness_report | grep -w -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139' | grep 'ncbi'); do
Strain=$(echo $File | rev | cut -f2 -d '/' | rev);
Species=$(echo $File | rev | cut -f3 -d '/' | rev);
printf "$Species\t$Strain\n";
cat $File | head -n18 | tail -n+4;printf "\n";
done > gene_pred/cegma/cegma_results_dna_summary.txt
less gene_pred/cegma/cegma_results_dna_summary.txt
Busco has replaced CEGMA and was run to check gene space in assemblies
for Assembly in $(ls repeat_masked/*/*/*/*_contigs_unmasked.fa | grep -e 'Fus2' -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139' | grep -e 'ncbi' -e 'canu_new' | grep -v 'old'); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
ProgDir=/home/armita/git_repos/emr_repos/tools/gene_prediction/busco
# BuscoDB="Fungal"
BuscoDB=$(ls -d /home/groups/harrisonlab/dbBusco/sordariomyceta_odb9)
OutDir=gene_pred/busco/$Organism/$Strain/assembly
qsub $ProgDir/sub_busco2.sh $Assembly $BuscoDB $OutDir
done
for File in $(ls gene_pred/busco/F*/*/assembly/*/short_summary_*.txt); do
echo $File;
cat $File | grep -e '(C)' -e 'Total';
done
#Gene prediction
Gene prediction was performed for Fusarium genomes. Two gene prediction approaches were used:
Gene prediction using Braker1 Prediction of all putative ORFs in the genome using the ORF finder (atg.pl) approach.
Gene prediction was performed using Braker1.
First, RNAseq data was aligned to Fusarium genomes.
- qc of RNA seq data is detailed below:
for Folder in $(ls -d raw_rna/paired/F.oxysporum_fsp_cepae/*); do
FolderName=$(echo $Folder | rev | cut -f1 -d '/' | rev);
echo $FolderName;
ls -lh $Folder/F;
ls -lh $Folder/R;
done
This contained the following data:
55_72hrs_rep1
sample013_1.combined.fastq.gz
sample013_2.combined.fastq.gz
55_72hrs_rep2
sample014_1.combined.fastq.gz
sample014_2.combined.fastq.gz
55_72hrs_rep3
sample015_1.combined.fastq.gz
sample015_2.combined.fastq.gz
control_72hrs_rep1
sample002_1.combined.fastq.gz
sample002_2.combined.fastq.gz
control_72hrs_rep2
sample004_1.combined.fastq.gz
sample004_2.combined.fastq.gz
control_72hrs_rep3
sample005_1.combined.fastq.gz
sample005_2.combined.fastq.gz
FO47_72hrs_rep1
sample006_1.combined.fastq.gz
sample006_2.combined.fastq.gz
FO47_72hrs_rep2
sample007_1.combined.fastq.gz
sample007_2.combined.fastq.gz
FO47_72hrs_rep3
sample012_1.combined.fastq.gz
sample012_2.combined.fastq.gz
Fus2_0hrs_prelim
1_S1_L001_R1_001.fastq.gz
1_S1_L001_R2_001.fastq.gz
Fus2_16hrs_prelim
3_S2_L001_R1_001.fastq.gz
3_S2_L001_R2_001.fastq.gz
Fus2_24hrs_prelim_rep1
4_S3_L001_R1_001.fastq.gz
4_S3_L001_R2_001.fastq.gz
Fus2_24hrs_prelim_rep2
Fus2_36hrs_prelim
36hr-root_S10_L001_R1_001.fastq.gz
36hr-root_S10_L001_R2_001.fastq.gz
Fus2_48hrs_prelim
6_S4_L001_R1_001.fastq.gz
6_S4_L001_R2_001.fastq.gz
Fus2_4hrs_prelim
4hr-root_S7_L001_R1_001.fastq.gz
4hr-root_S7_L001_R2_001.fastq.gz
Fus2_72hrs_prelim
7_S5_L001_R1_001.fastq.gz
7_S5_L001_R2_001.fastq.gz
Fus2_72hrs_rep1
sample016_1.combined.fastq.gz
sample016_2.combined.fastq.gz
Fus2_72hrs_rep2
sample018_1.combined.fastq.gz
sample018_2.combined.fastq.gz
Fus2_72hrs_rep3
sample019_1.combined.fastq.gz
sample019_2.combined.fastq.gz
Fus2_8hrs_prelim
8hr-root_S8_L001_R1_001.fastq.gz
8hr-root_S8_L001_R2_001.fastq.gz
Fus2_96hrs_prelim
8_S6_L001_R1_001.fastq.gz
8_S6_L001_R2_001.fastq.gz
Fus2_CzapekDox
6_S2_L001_R1_001_fastqc 6_S2_L001_R1_001.fastq.gz
6_S2_L001_R2_001_fastqc 6_S2_L001_R2_001.fastq.gz
Fus2_GlucosePeptone
7_S3_L001_R1_001_fastqc 7_S3_L001_R1_001.fastq.gz
7_S3_L001_R2_001_fastqc 7_S3_L001_R2_001.fastq.gz
Fus2_PDA
9_S4_L001_R1_001_fastqc 9_S4_L001_R1_001.fastq.gz
9_S4_L001_R2_001_fastqc 9_S4_L001_R2_001.fastq.gz
Fus2_PDB
4_S1_L001_R1_001_fastqc 4_S1_L001_R1_001.fastq.gz
4_S1_L001_R2_001_fastqc 4_S1_L001_R2_001.fastq.gz
Perform qc of RNAseq timecourse data
for num in $(qstat | grep 'rna_qc' | cut -f1 -d ' '); do
Treatment=$(head -n1 rna_qc_fastq-mcf.sh.o$num | cut -f10 -d '/')
for FilePath in $(raw_rna/paired/F.oxysporum_fsp_cepae/55_72hrs_rep1); do
echo $FilePath
FileF=$(ls $FilePath/F/*.gz)
FileR=$(ls $FilePath/R/*.gz)
IlluminaAdapters=/home/armita/git_repos/emr_repos/tools/seq_tools/ncbi_adapters.fa
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/rna_qc
qsub $ProgDir/rna_qc_fastq-mcf.sh $FileF $FileR $IlluminaAdapters RNA
sleep 10
done
done
Data quality was visualised using fastqc:
for RawData in $(ls qc_rna/paired/F.oxysporum_fsp_cepae/*/*/*.fq.gz); do
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/dna_qc
echo $RawData;
qsub $ProgDir/run_fastqc.sh $RawData
done
Insert sizes of the RNA seq library were unknown until a draft alignment could be made. To do this tophat and cufflinks were run, aligning the reads against a single genome. The fragment length and stdev were printed to stdout while cufflinks was running.
for Assembly in $(ls repeat_masked/*/*/*/*_contigs_unmasked.fa | grep 'FOP2'); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
for RNADir in $(ls -d qc_rna/paired/F.oxysporum_fsp_cepae/*); do
Timepoint=$(echo $RNADir | rev | cut -f1 -d '/' | rev)
echo "$Timepoint"
FileF=$(ls $RNADir/F/*_trim.fq.gz)
FileR=$(ls $RNADir/R/*_trim.fq.gz)
OutDir=alignment/$Organism/$Strain/$Timepoint
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/RNAseq
qsub $ProgDir/tophat_alignment.sh $Assembly $FileF $FileR $OutDir
done
done
Alignments were concatenated prior to running cufflinks: Cufflinks was run to produce the fragment length and stdev statistics:
for Assembly in $(ls repeat_masked/*/Fo47/*/*_contigs_softmasked_repeatmasker_TPSI_appended.fa | grep 'FOP2'); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
AcceptedHits=alignment/$Organism/$Strain/concatenated/concatenated.bam
OutDir=gene_pred/cufflinks/$Organism/$Strain/concatenated_prelim
echo "$Organism - $Strain"
mkdir -p $OutDir
samtools merge -f $AcceptedHits \
alignment/$Organism/$Strain/55_72hrs_rep1/accepted_hits.bam \
alignment/$Organism/$Strain/55_72hrs_rep2/accepted_hits.bam \
alignment/$Organism/$Strain/55_72hrs_rep3/accepted_hits.bam \
alignment/$Organism/$Strain/FO47_72hrs_rep1/accepted_hits.bam \
alignment/$Organism/$Strain/FO47_72hrs_rep2/accepted_hits.bam \
alignment/$Organism/$Strain/FO47_72hrs_rep3/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_0hrs_prelim/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_16hrs_prelim/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_24hrs_prelim_rep1/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_36hrs_prelim/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_48hrs_prelim/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_4hrs_prelim/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_72hrs_prelim/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_72hrs_rep1/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_72hrs_rep2/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_72hrs_rep3/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_8hrs_prelim/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_96hrs_prelim/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_CzapekDox/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_GlucosePeptone/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_PDA/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_PDB/accepted_hits.bam
cufflinks -o $OutDir/cufflinks -p 8 --max-intron-length 4000 $AcceptedHits 2>&1 | tee $OutDir/cufflinks/cufflinks.log
done
Output from stdout included:
Processed 22484 loci. [*************************] 100%
Map Properties:
Normalized Map Mass: 50507412.55
Raw Map Mass: 50507412.55
Fragment Length Distribution: Empirical (learned)
Estimated Mean: 181.98
Estimated Std Dev: 78.39
[13:02:48] Assembling transcripts and estimating abundances.
Processed 22506 loci. [*************************] 100%
The Estimated Mean: 181.98 allowed calculation of of the mean insert gap to be -20bp 182-(97*2) where 97 was the mean read length. This was provided to tophat on a second run (as the -r option) along with the fragment length stdev to increase the accuracy of mapping.
Then Rnaseq data was aligned to each genome assembly:
for Assembly in $(ls repeat_masked/*/*/*/*_contigs_unmasked.fa | grep -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139' | grep 'ncbi' | grep -v 'old'); do
# for Assembly in $(ls assembly/spades/*/*/ncbi_edits2/contigs_min_500bp_renamed.fasta | grep -w -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139'); do
# for Assembly in $(ls assembly/merged_canu_spades/*/Fus2/filtered_contigs/Fus2_contigs_renamed.fasta); do
# for Assembly in $(ls assembly/external_group/F.oxysporum/fo47/broad/fusarium_oxysporum_fo47_1_supercontigs.fasta); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
for RNADir in $(ls -d qc_rna/paired/F.oxysporum_fsp_cepae/*); do
Timepoint=$(echo $RNADir | rev | cut -f1 -d '/' | rev)
echo "$Timepoint"
FileF=$(ls $RNADir/F/*_trim.fq.gz)
FileR=$(ls $RNADir/R/*_trim.fq.gz)
OutDir=alignment/$Organism/$Strain/$Timepoint
InsertGap='-20'
InsertStdDev='78'
Jobs=$(qstat | grep 'tophat' | grep 'qw' | wc -l)
while [ $Jobs -gt 1 ]; do
sleep 10
printf "."
Jobs=$(qstat | grep 'tophat' | grep 'qw' | wc -l)
done
printf "\n"
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/RNAseq
qsub $ProgDir/tophat_alignment.sh $Assembly $FileF $FileR $OutDir $InsertGap $InsertStdDev
done
done
Before braker predictiction was performed, I double checked that I had the genemark key in my user area and copied it over from the genemark install directory:
ls ~/.gm_key
cp /home/armita/prog/genemark/gm_key_64 ~/.gm_key
# for Assembly in $(ls repeat_masked/*/*/*/*_contigs_softmasked_repeatmasker_TPSI_appended.fa | grep -v 'HB17' | grep 'Fus2' | grep -e 'Fus2_canu_new' -e 'Fus2_merged' | grep 'cepae' | grep 'Fus2_merged'); do
# for Assembly in $(ls repeat_masked/*/*/*/*_contigs_softmasked_repeatmasker_TPSI_appended.fa | grep 'proliferatum'); do
for Assembly in $(ls repeat_masked/*/*/*/*_contigs_softmasked_repeatmasker_TPSI_appended.fa | grep -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139' | grep 'ncbi' | grep -v 'old'); do
Jobs=$(qstat | grep 'tophat' | grep -w 'r' | wc -l)
while [ $Jobs -gt 1 ]; do
sleep 10
printf "."
Jobs=$(qstat | grep 'tophat' | grep -w 'r' | wc -l)
done
printf "\n"
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
mkdir -p alignment/$Organism/$Strain/concatenated
samtools merge -f alignment/$Organism/$Strain/concatenated/concatenated.bam \
alignment/$Organism/$Strain/55_72hrs_rep1/accepted_hits.bam \
alignment/$Organism/$Strain/55_72hrs_rep2/accepted_hits.bam \
alignment/$Organism/$Strain/55_72hrs_rep3/accepted_hits.bam \
alignment/$Organism/$Strain/FO47_72hrs_rep1/accepted_hits.bam \
alignment/$Organism/$Strain/FO47_72hrs_rep2/accepted_hits.bam \
alignment/$Organism/$Strain/FO47_72hrs_rep3/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_0hrs_prelim/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_16hrs_prelim/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_24hrs_prelim_rep1/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_36hrs_prelim/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_48hrs_prelim/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_4hrs_prelim/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_72hrs_prelim/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_72hrs_rep1/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_72hrs_rep2/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_72hrs_rep3/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_8hrs_prelim/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_96hrs_prelim/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_CzapekDox/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_GlucosePeptone/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_PDA/accepted_hits.bam \
alignment/$Organism/$Strain/Fus2_PDB/accepted_hits.bam
OutDir=gene_pred/braker/$Organism/"$Strain"_braker
AcceptedHits=alignment/$Organism/$Strain/concatenated/concatenated.bam
GeneModelName="$Organism"_"$Strain"_braker_new
rm -r /home/armita/prog/augustus-3.1/config/species/"$Organism"_"$Strain"_braker_new
ProgDir=/home/armita/git_repos/emr_repos/tools/gene_prediction/braker1
qsub $ProgDir/sub_braker_fungi.sh $Assembly $OutDir $AcceptedHits $GeneModelName
done > log.txt
Fasta and gff files were extracted from Braker1 output.
for File in $(ls gene_pred/braker/F.*/*_braker/*/augustus.gff | grep -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139' | grep -e 'canu_new' -e 'ncbi' | grep -v 'canu_new'); do
getAnnoFasta.pl $File
OutDir=$(dirname $File)
echo "##gff-version 3" > $OutDir/augustus_extracted.gff
cat $File | grep -v '#' >> $OutDir/augustus_extracted.gff
done
The relationship between gene models and aligned reads was investigated. To do this aligned reads needed to be sorted and indexed:
Note - IGV was used to view aligned reads against the Fus2 genome on my local machine.
InBam=alignment/F.oxysporum_fsp_cepae/Fus2_edited_v2/concatenated/concatenated.bam
ViewBam=alignment/F.oxysporum_fsp_cepae/Fus2_edited_v2/concatenated/concatenated_view.bam
SortBam=alignment/F.oxysporum_fsp_cepae/Fus2_edited_v2/concatenated/concatenated_sorted
samtools view -b $InBam > $ViewBam
samtools sort $ViewBam $SortBam
samtools index $SortBam.bam
Additional genes were added to Braker gene predictions, using CodingQuary in pathogen mode to predict additional regions.
Fistly, aligned RNAseq data was assembled into transcripts using Cufflinks.
Note - cufflinks doesn't always predict direction of a transcript and therefore features can not be restricted by strand when they are intersected.
# Jobs=$(qstat | grep 'tophat' | grep -w 'r' | wc -l)
# while [ $Jobs -gt 1 ]; do
# sleep 10
# printf "."
# Jobs=$(qstat | grep 'tophat' | grep -w 'r' | wc -l)
# done
# sleep 2h
# Jobs=$(qstat | grep 'sub_br' | grep -w 'r' | wc -l)
# while [ $Jobs -gt 1 ]; do
# sleep 10
# printf "."
# Jobs=$(qstat | grep 'sub_br' | grep -w 'r' | wc -l)
# done
for Assembly in $(ls repeat_masked/*/*/*/*_contigs_softmasked_repeatmasker_TPSI_appended.fa | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi'| grep -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139' | grep -e 'ncbi' | grep -v 'old'); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
OutDir=gene_pred/cufflinks/$Organism/$Strain/concatenated
mkdir -p $OutDir
AcceptedHits=$(ls alignment/$Organism/$Strain/concatenated/concatenated.bam)
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/RNAseq
qsub $ProgDir/sub_cufflinks.sh $AcceptedHits $OutDir
done > log2.txt
Secondly, genes were predicted using CodingQuary:
# Jobs=$(qstat | grep 'armita' | wc -l)
# while [ $Jobs -ge 1 ]; do
# sleep 10m
# printf "."
# Jobs=$(qstat | grep 'armita' | wc -l)
# done
for Assembly in $(ls repeat_masked/*/*/*/*_contigs_softmasked_repeatmasker_TPSI_appended.fa | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139' | grep -v 'old'); do
Jobs=$(qstat | grep 'sub_cuffli' | wc -l)
while [ $Jobs -ge 1 ]; do
sleep 10
printf "."
Jobs=$(qstat | grep 'sub_cuffli' | wc -l)
done
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
OutDir=gene_pred/codingquary/$Organism/$Strain
CufflinksGTF=$(ls gene_pred/cufflinks/$Organism/$Strain/concatenated/transcripts.gtf)
ProgDir=/home/armita/git_repos/emr_repos/tools/gene_prediction/codingquary
qsub $ProgDir/sub_CodingQuary.sh $Assembly $CufflinksGTF $OutDir
done
Then, additional transcripts were added to Braker gene models, when CodingQuary genes were predicted in regions of the genome, not containing Braker gene models:
# for BrakerGff in $(ls gene_pred/braker/F.*/*_braker_new/*/augustus.gff3 | grep -w -e 'Fus2'); do
for BrakerGff in $(ls gene_pred/braker/F.*/*/*/augustus.gff3 | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi'| grep -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139' | grep -e 'ncbi' | grep 'N139'); do
Strain=$(echo $BrakerGff| rev | cut -d '/' -f3 | rev | sed 's/_braker_pacbio//g'| sed 's/_braker//g')
Organism=$(echo $BrakerGff | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
Assembly=$(ls repeat_masked/$Organism/$Strain/*/*_contigs_softmasked_repeatmasker_TPSI_appended.fa)
CodingQuaryGff=gene_pred/codingquary/$Organism/$Strain/out/PredictedPass.gff3
PGNGff=gene_pred/codingquary/$Organism/$Strain/out/PGN_predictedPass.gff3
AddDir=gene_pred/codingquary/$Organism/$Strain/additional
FinalDir=gene_pred/final_genes/$Organism/$Strain/final
AddGenesList=$AddDir/additional_genes.txt
AddGenesGff=$AddDir/additional_genes.gff
FinalGff=$AddDir/combined_genes.gff
mkdir -p $AddDir
mkdir -p $FinalDir
bedtools intersect -v -a $CodingQuaryGff -b $BrakerGff | grep 'gene'| cut -f2 -d'=' | cut -f1 -d';' > $AddGenesList
bedtools intersect -v -a $PGNGff -b $BrakerGff | grep 'gene'| cut -f2 -d'=' | cut -f1 -d';' >> $AddGenesList
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation
$ProgDir/gene_list_to_gff.pl $AddGenesList $CodingQuaryGff CodingQuarry_v2.0 ID CodingQuary > $AddGenesGff
$ProgDir/gene_list_to_gff.pl $AddGenesList $PGNGff PGNCodingQuarry_v2.0 ID CodingQuary >> $AddGenesGff
ProgDir=/home/armita/git_repos/emr_repos/tools/gene_prediction/codingquary
$ProgDir/add_CodingQuary_features.pl $AddGenesGff $Assembly > $FinalDir/final_genes_CodingQuary.gff3
$ProgDir/gff2fasta.pl $Assembly $FinalDir/final_genes_CodingQuary.gff3 $FinalDir/final_genes_CodingQuary
cp $BrakerGff $FinalDir/final_genes_Braker.gff3
$ProgDir/gff2fasta.pl $Assembly $FinalDir/final_genes_Braker.gff3 $FinalDir/final_genes_Braker
cat $FinalDir/final_genes_Braker.pep.fasta $FinalDir/final_genes_CodingQuary.pep.fasta | sed -r 's/\*/X/g' > $FinalDir/final_genes_combined.pep.fasta
cat $FinalDir/final_genes_Braker.cdna.fasta $FinalDir/final_genes_CodingQuary.cdna.fasta > $FinalDir/final_genes_combined.cdna.fasta
cat $FinalDir/final_genes_Braker.gene.fasta $FinalDir/final_genes_CodingQuary.gene.fasta > $FinalDir/final_genes_combined.gene.fasta
cat $FinalDir/final_genes_Braker.upstream3000.fasta $FinalDir/final_genes_CodingQuary.upstream3000.fasta > $FinalDir/final_genes_combined.upstream3000.fasta
GffBraker=$FinalDir/final_genes_CodingQuary.gff3
GffQuary=$FinalDir/final_genes_Braker.gff3
GffAppended=$FinalDir/final_genes_appended.gff3
cat $GffBraker $GffQuary > $GffAppended
done
The final number of genes per isolate was observed using:
for DirPath in $(ls -d gene_pred/final_genes/F.*/*/final | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi'| grep -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139' | grep -e 'ncbi'); do
echo $DirPath;
cat $DirPath/final_genes_Braker.pep.fasta | grep '>' | wc -l;
cat $DirPath/final_genes_CodingQuary.pep.fasta | grep '>' | wc -l;
cat $DirPath/final_genes_combined.pep.fasta | grep '>' | wc -l;
echo "";
done
gene_pred/final_genes/F.oxysporum_fsp_cepae/125_ncbi/final
17197
1546
18743
gene_pred/final_genes/F.oxysporum_fsp_cepae/A13_ncbi/final
17986
948
18934
gene_pred/final_genes/F.oxysporum_fsp_cepae/A23_ncbi/final
17006
1551
18557
gene_pred/final_genes/F.oxysporum_fsp_cepae/A28_ncbi/final
17426
1448
18874
gene_pred/final_genes/F.oxysporum_fsp_cepae/CB3_ncbi/final
16833
1345
18178
gene_pred/final_genes/F.oxysporum_fsp_cepae/PG_ncbi/final
16811
1290
18101
gene_pred/final_genes/F.oxysporum_fsp_narcissi/N139_ncbi/final
19059
1642
20701
gene_pred/final_genes/F.proliferatum/A8_ncbi/final
15421
37
15458
for Gff in $(ls gene_pred/final_genes/*/*/final/final_genes_appended.gff3 | grep -e 'canu_new' -e 'ncbi'); do
Strain=$(echo $Gff | rev | cut -d '/' -f3 | rev)
Organism=$(echo $Gff | rev | cut -d '/' -f4 | rev)
echo "$Strain - $Organism"
cat $Gff | grep -w 'gene' | wc -l
done
125_ncbi - F.oxysporum_fsp_cepae
18505
A13_ncbi - F.oxysporum_fsp_cepae
18790
A23_ncbi - F.oxysporum_fsp_cepae
18323
A28_ncbi - F.oxysporum_fsp_cepae
18629
CB3_ncbi - F.oxysporum_fsp_cepae
17943
Fus2_canu_new - F.oxysporum_fsp_cepae
18855
PG_ncbi - F.oxysporum_fsp_cepae
17830
N139_ncbi - F.oxysporum_fsp_narcissi
20493
A8_ncbi - F.proliferatum
15418
for AddGenes in $(ls gene_pred/codingquary/F.*/*/additional/additional_genes.gff | grep 'ncbi' | grep -v 'ncbi_braker' | grep 'A23'); do
Strain=$(echo $AddGenes| rev | cut -d '/' -f3 | rev)
Organism=$(echo $AddGenes | rev | cut -d '/' -f4 | rev)
OutDir=$(dirname $AddGenes)
echo "$Organism - $Strain" > $OutDir/duplicated_genes.txt
ProgDir=/home/armita/git_repos/emr_repos/tools/gene_prediction/codingquary
$ProgDir/remove_dup_features.py --inp_gff $AddGenes >> $OutDir/duplicated_genes.txt
cat $OutDir/duplicated_genes.txt
echo ""
done
Following annotation of gene models and orthology analysis documented below, some gene models were noted to require manual editing. This was performed and gene models were re-annotated. In some cases this required removal of genes annotated by Braker so that the gene models predicted by coding queary could be favoured. In other cases, gene models were manually edited based upon results from other Fusarium spp. genomes:
Gene models requiring editing
- FTF1 g16934 - delete gene, replace with CodingQuary model.
- SIX3 g16847, g16190 - A23 one copy
- SIX5 g16849 - A23 missing intron
- SIX7 CUFF_4344_1_33 - Gene model good
- SIX9 g13574, g16273 - premature stop in A23 (end of contig) & one copy in A23 & 125
- SIX10 g17153 - Gene model good
- SIX12 PGN_CUFF_4345_1_2 - checked start of Fus2, FoC sequence is too long but FoL reference matches this length - Bring A23 and 125 in line with Fus2 & FoL
- No gene model
Organism=F.oxysporum_fsp_cepae
Strain=Fus2_canu_new
OutDir=gene_pred/final_genes/$Organism/$Strain/edited
mkdir -p $OutDir
Assembly=$(ls repeat_masked/$Organism/$Strain/*/*_contigs_softmasked_repeatmasker_TPSI_appended.fa)
SourceDir=gene_pred/codingquary/$Organism/$Strain
BrakerGff=$(ls $SourceDir/final/final_genes_Braker.gff3)
CodingQuaryGff=$(ls $SourceDir/out/PredictedPass.gff3)
PGNGff=$(ls $SourceDir/out/PGN_predictedPass.gff3)
cat $BrakerGff \
| grep -v -w 'g16934' \
| grep -v -w 'g13345' \
> $OutDir/final_genes_Braker_ed.gff3
printf \
"contig_19_pilon\tManual_annotation\tgene\t445791\t445964\t0\t+\t.\tID=man_1;
contig_19_pilon\tManual_annotation\tmRNA\t445791\t445964\t.\t+\t.\tID=man_1.t1;Parent=man_1
contig_19_pilon\tManual_annotation\tstart_codon\t445791\t445793\t.\t+\t.\tParent=man_1.t1;
contig_19_pilon\tManual_annotation\tCDS\t445791\t445964\t.\t+\t0\tID=man_1.t1.CDS1;Parent=man_1.t1
contig_19_pilon\tManual_annotation\texon\t445791\t445964\t.\t+\t.\tID=man_1.t1.exon1;Parent=man_1.t1;
contig_19_pilon\tManual_annotation\tstop_codon\t445962\t445964\t.\t+\t.\tParent=man_1.t1;\n" \
>> $OutDir/final_genes_Braker_ed.gff3
cat $CodingQuaryGff \
| grep -v -w -e 'NS.00001' -e 'NS.00002' -e 'NS.00003' -e 'NS.04463' -e 'CUFF.11065.1.82' \
> $OutDir/PredictedPass_ed.gff3
cp $PGNGff $OutDir/PGN_predictedPass_ed.gff3
touch $OutDir/manual_annotations.gff3
#
# printf \
# "contig_10_pilon\tManual_annotation\tgene\t1569\t5055\t0\t+\t.\tID=g13345_ps;
# contig_10_pilon\tManual_annotation\tmRNA\t1569\t5055\t.\t+\t.\tID=g13345_ps.t1;Parent=g13345_ps
# contig_10_pilon\tManual_annotation\tstart_codon\t1569\t1571\t.\t+\t.\tParent=g13345_ps.t1;
# contig_10_pilon\tManual_annotation\tCDS\t1569\t2856\t.\t+\t0\tID=g13345_ps.t1.CDS1;Parent=g13345_ps.t1
# contig_10_pilon\tManual_annotation\texon\t1569\t2856\t.\t+\t.\tParent=g13345_ps.t1
# contig_10_pilon\tManual_annotation\tintron\t2857\t2905\t.\t+\t.\tParent=g13345_ps.t1
# contig_10_pilon\tManual_annotation\tCDS\t2906\t5055\t.\t+\t2\tID=g13345_ps.t1.CDS2;Parent=g13345_ps.t1
# contig_10_pilon\tManual_annotation\texon\t2906\t5055\t.\t+\t.\tID=g13345_ps.t1.exon1;Parent=g13345_ps.t1;
# contig_10_pilon\tManual_annotation\tstop_codon\t5053\t5055\t.\t+\t.\tParent=g13345_ps.t1;\n" \
# > $OutDir/pseudogenes.gff3
Note that at this stage all codingquary genes contain . characters rather than _ characters g16038 - SIX12 - use alternative transcript no gene - SIX14 - manual annotation g16100 - SIX5 - shorten intron and final CDS g15699 - SIX7 - remove intron and final CDS. no gene - SIX9 - manual annotation Duplicated genes: CUFF.7204.2.14 CUFF.7961.1.0 CUFF.11924.2.127 CUFF.6792.1.4
Organism=F.oxysporum_fsp_cepae
Strain=A23_ncbi
OutDir=gene_pred/final_genes/$Organism/$Strain/edited
mkdir -p $OutDir
Assembly=$(ls repeat_masked/$Organism/$Strain/*/*_contigs_softmasked_repeatmasker_TPSI_appended.fa)
BrakerGff=$(ls gene_pred/braker/$Organism/"$Strain"_braker/*/augustus.gff3)
CodingQuaryGff=$(ls gene_pred/codingquary/$Organism/$Strain/out/PredictedPass.gff3)
PGNGff=$(ls gene_pred/codingquary/$Organism/$Strain/out/PGN_predictedPass.gff3)
cat $BrakerGff \
| grep -w -v 'g7779' \
| grep -w -v 'g13351' \
| grep -v 'g16038' \
| sed \
's/contig_819\tAUGUSTUS\tgene\t655\t1273\t0.73\t+\t.\tID=g16100;/contig_819\tAUGUSTUS\tgene\t655\t1188\t0.73\t+\t.\tID=g16100;/g' \
| sed \
's/contig_819\tAUGUSTUS\tmRNA\t655\t1273\t0.73\t+\t.\tID=g16100.t1;Parent=g16100/contig_819\tAUGUSTUS\tmRNA\t655\t1188\t0.73\t+\t.\tID=g16100.t1;Parent=g16100/g' \
| sed \
's/contig_819\tAUGUSTUS\tintron\t1002\t1057\t1\t+\t.\tParent=g16100.t1;/contig_819\tAUGUSTUS\tintron\t1002\t1050\t1\t+\t.\tParent=g16100.t1;/g' \
| sed \
's/contig_819\tAUGUSTUS\tCDS\t1058\t1273\t0.74\t+\t0\tID=g16100.t1.CDS4;Parent=g16100.t1/contig_819\tAUGUSTUS\tCDS\t1051\t1188\t0.74\t+\t0\tID=g16100.t1.CDS4;Parent=g16100.t1/g' \
| sed \
's/contig_819\tAUGUSTUS\texon\t1058\t1273\t.\t+\t.\tID=g16100.t1.exon4;Parent=g16100.t1;/contig_819\tAUGUSTUS\texon\t1051\t1188\t.\t+\t.\tID=g16100.t1.exon4;Parent=g16100.t1;/g' \
| sed \
's/contig_819\tAUGUSTUS\tstop_codon\t1271\t1273\t.\t+\t0\tParent=g16100.t1;/contig_819\tAUGUSTUS\tstop_codon\t1186\t1188\t.\t+\t0\tParent=g16100.t1;/g' \
| sed \
's/contig_592\tAUGUSTUS\tgene\t1046\t1683\t0.68\t+\t.\tID=g15699;/contig_592\tAUGUSTUS\tgene\t1046\t1543\t0.68\t+\t.\tID=g15699;/g' \
| sed \
's/contig_592\tAUGUSTUS\tmRNA\t1046\t1683\t0.68\t+\t.\tID=g15699.t1;Parent=g15699/contig_592\tAUGUSTUS\tmRNA\t1046\t1543\t0.68\t+\t.\tID=g15699.t1;Parent=g15699/g' \
| sed \
's/contig_592\tAUGUSTUS\tCDS\t1046\t1539\t0.68\t+\t0\tID=g15699.t1.CDS1;Parent=g15699.t1/contig_592\tAUGUSTUS\tCDS\t1046\t1543\t0.68\t+\t0\tID=g15699.t1.CDS1;Parent=g15699.t1/g' \
| sed \
's/contig_592\tAUGUSTUS\texon\t1046\t1539\t.\t+\t.\tID=g15699.t1.exon1;Parent=g15699.t1;/contig_592\tAUGUSTUS\texon\t1046\t1543\t.\t+\t.\tID=g15699.t1.exon1;Parent=g15699.t1;/g' \
| grep -v 'contig_592.AUGUSTUS.intron.1540.1676.1.+...Parent=g15699.t1;' \
| grep -v 'g15699.t1.CDS2' | grep -v 'g15699.t1.exon2' \
| sed \
's/contig_592\tAUGUSTUS\tstop_codon\t1681\t1683\t.\t+\t0\tParent=g15699.t1;/contig_592\tAUGUSTUS\tstop_codon\t1541\t1543\t.\t+\t0\tParent=g15699.t1;/g' \
> $OutDir/final_genes_Braker_ed.gff3
printf \
"contig_1792\tManual_annotation\tgene\t1\t219\t1\t+\t.\tID=man_1;
contig_1792\tManual_annotation\tmRNA\t1\t219\t1\t+\t.\tID=man_1.t1;Parent=man_1
contig_1792\tManual_annotation\tCDS\t1\t219\t1.0\t+\t0\tID=man_1.t1.CDS1;Parent=man_1.t1
contig_1792\tManual_annotation\texon\t1\t219\t.\t+\t.\tID=man_1.t1.exon1;Parent=man_1.t1;
contig_1792\tManual_annotation\tstop_codon\t217\t219\t.\t+\t.\tParent=man_1.t1;\n" \
>> $OutDir/final_genes_Braker_ed.gff3
printf \
"contig_1016\tManual_annotation\tgene\t2103\t2276\t1\t+\t.\tID=man_2;
contig_1016\tManual_annotation\tmRNA\t2103\t2276\t1\t+\t.\tID=man_2.t1;Parent=man_2
contig_1016\tManual_annotation\tstart_codon\t2103\t2105\t.\t+\t.\tParent=man_2.t1;
contig_1016\tManual_annotation\tCDS\t2103\t2276\t1.0\t+\t0\tID=man_2.t1.CDS1;Parent=man_2.t1
contig_1016\tManual_annotation\texon\t2103\t2276\t.\t+\t.\tID=man_2.t1.exon1;Parent=man_2.t1;
contig_1016\tManual_annotation\tstop_codon\t2274\t2276\t.\t+\t.\tParent=man_2.t1;\n" \
>> $OutDir/final_genes_Braker_ed.gff3
cat $CodingQuaryGff | grep -w -v -e 'CUFF.7204.2.14' -e 'CUFF.7961.1.0' -e 'CUFF.11924.2.127' -e 'CUFF.6792.1.4' > $OutDir/PredictedPass_ed.gff3
cp $PGNGff $OutDir/PGN_predictedPass_ed.gff3
Note that at this stage all codingquary genes contain . characters rather than _ characters g16001 - SIX12 - remove gene and use PGN_predictedPass gene g16002 - SIX7 - remove intron and make stop codon earlier CUFF_14695 - SIX14 - Replace with manual annotation in correct orientation Duplicated genes: CUFF.14630.2.11 CUFF.7213.2.7
Organism=F.oxysporum_fsp_cepae
Strain=125_ncbi
OutDir=gene_pred/final_genes/$Organism/$Strain/edited
mkdir -p $OutDir
Assembly=$(ls repeat_masked/$Organism/$Strain/*/*_contigs_softmasked_repeatmasker_TPSI_appended.fa)
BrakerGff=$(ls gene_pred/braker/$Organism/"$Strain"_braker/*/augustus.gff3)
CodingQuaryGff=$(ls gene_pred/codingquary/$Organism/$Strain/out/PredictedPass.gff3)
PGNGff=$(ls gene_pred/codingquary/$Organism/$Strain/out/PGN_predictedPass.gff3)
cat $BrakerGff \
| grep -v -w 'g16001' \
| sed \
's/contig_714\tAUGUSTUS\tgene\t6151\t6788\t0.93\t+\t.\tID=g16002/contig_714\tAUGUSTUS\tgene\t6151\t6648\t0.93\t+\t.\tID=g16002/g' \
| sed \
's/contig_714\tAUGUSTUS\tmRNA\t6151\t6788\t0.93\t+\t.\tID=g16002.t1;Parent=g16002/contig_714\tAUGUSTUS\tmRNA\t6151\t6648\t0.93\t+\t.\tID=g16002.t1;Parent=g16002/g' \
| sed \
's/contig_714\tAUGUSTUS\tCDS\t6151\t6644\t0.93\t+\t0\tID=g16002.t1.CDS1;Parent=g16002.t1/contig_714\tAUGUSTUS\tCDS\t6151\t6648\t0.93\t+\t0\tID=g16002.t1.CDS1;Parent=g16002.t1/g' \
| sed \
's/contig_714\tAUGUSTUS\texon\t6151\t6644\t.\t+\t.\tID=g16002.t1.exon1;Parent=g16002.t1;/contig_714\tAUGUSTUS\texon\t6151\t6648\t.\t+\t.\tID=g16002.t1.exon1;Parent=g16002.t1;/g' \
| grep -v -e 'g16002.t1.CDS2' -e 'g16002.t1.exon2' \
| grep -v 'contig_714.AUGUSTUS.intron.6645.6781.1.+...Parent=g16002.t1;' \
| sed \
's/contig_714\tAUGUSTUS\tstop_codon\t6786\t6788\t.\t+\t0\tParent=g16002.t1;/contig_714\tAUGUSTUS\tstop_codon\t6646\t6648\t.\t+\t0\tParent=g16002.t1;/g' \
> $OutDir/final_genes_Braker_ed.gff3
printf \
"contig_2082\tManual_annotation\tgene\t261\t525\t1\t+\t.\tID=man_1;
contig_2082\tManual_annotation\tmRNA\t261\t525\t1\t+\t.\tID=man_1.t1;Parent=man_1
contig_2082\tManual_annotation\tstart_codon\t261\t263\t.\t+\t.\tParent=man_1.t1;
contig_2082\tManual_annotation\tCDS\t261\t525\t1.0\t+\t0\tID=man_1.t1.CDS1;Parent=man_1.t1
contig_2082\tManual_annotation\texon\t261\t525\t.\t+\t.\tID=man_1.t1.exon1;Parent=man_1.t1;\n" \
>> $OutDir/final_genes_Braker_ed.gff3
printf \
"contig_971\tManual_annotation\tgene\t333\t506\t1\t-\t.\tID=man_2;
contig_971\tManual_annotation\tmRNA\t333\t506\t1\t-\t.\tID=man_2.t1;Parent=man_2
contig_971\tManual_annotation\tstop_codon\t333\t335\t.\t-\t.\tParent=man_2.t1;
contig_971\tManual_annotation\tCDS\t333\t506\t1.0\t-\t0\tID=man_2.t1.CDS1;Parent=man_2.t1
contig_971\tManual_annotation\texon\t333\t506\t.\t-\t.\tID=man_2.t1.exon1;Parent=man_2.t1;
contig_971\tManual_annotation\tstart_codon\t504\t506\t.\t-\t.\tParent=man_2.t1;\n" \
>> $OutDir/final_genes_Braker_ed.gff3
cat $CodingQuaryGff | grep -v -w -e 'CUFF.14630.2.11' -e 'CUFF.7213.2.7' > $OutDir/PredictedPass_ed.gff3
cp $PGNGff $OutDir/PGN_predictedPass_ed.gff3
Note that at this stage all codingquary genes contain . characters rather than _ characters g16592 - SIX10 - remove gene and use PGN_predictedPass gene g16591 - SIX12 - shorten intron and final CDS
Duplicated genes: CUFF.639.2.10 CUFF.7442.1.80 (identified by ncbi) CUFF.2100.1.11 (identified by ncbi)
Organism=F.oxysporum_fsp_narcissi
Strain=N139_ncbi
OutDir=gene_pred/final_genes/$Organism/$Strain/edited
mkdir -p $OutDir
Assembly=$(ls repeat_masked/$Organism/$Strain/*/*_contigs_softmasked_repeatmasker_TPSI_appended.fa)
BrakerGff=$(ls gene_pred/braker/$Organism/"$Strain"_braker/*/augustus.gff3)
CodingQuaryGff=$(ls gene_pred/codingquary/$Organism/$Strain/out/PredictedPass.gff3)
PGNGff=$(ls gene_pred/codingquary/$Organism/$Strain/out/PGN_predictedPass.gff3)
cat $BrakerGff \
| grep -v -w 'g16592' \
| sed \
's/contig_728\tAUGUSTUS\tgene\t3227\t3658\t0.99\t+\t.\tID=g16591;/contig_728\tAUGUSTUS\tgene\t3317\t3658\t0.99\t+\t.\tID=g16591;/g' \
| sed \
's/contig_728\tAUGUSTUS\tmRNA\t3227\t3658\t0.99\t+\t.\tID=g16591.t1;Parent=g16591/contig_728\tAUGUSTUS\tmRNA\t3317\t3658\t0.99\t+\t.\tID=g16591.t1;Parent=g16591/g' \
| sed \
's/contig_728\tAUGUSTUS\tstart_codon\t3227\t3229\t.\t+\t0\tParent=g16591.t1;/contig_728\tAUGUSTUS\tstart_codon\t3317\t3319\t.\t+\t0\tParent=g16591.t1;/g' \
| sed \
's/contig_728\tAUGUSTUS\tCDS\t3227\t3248\t1\t+\t0\tID=g16591.t1.CDS1;Parent=g16591.t1/contig_728\tAUGUSTUS\tCDS\t3317\t3658\t1\t+\t0\tID=g16591.t1.CDS1;Parent=g16591.t1/g' \
| sed \
's/contig_728\tAUGUSTUS\texon\t3227\t3248\t.\t+\t.\tID=g16591.t1.exon1;Parent=g16591.t1;/contig_728\tAUGUSTUS\texon\t3317\t3658\t.\t+\t.\tID=g16591.t1.exon1;Parent=g16591.t1;/g' \
| grep -v -e 'intron.3249.3299.1.+...Parent=g16591.t1;' \
| grep -v -e 'g16591.t1.CDS2' -e 'g16591.t1.exon2' \
> $OutDir/final_genes_Braker_ed.gff3
cat $CodingQuaryGff | grep -v -w -e 'CUFF.636.2.10' -e 'CUFF.7442.1.80' -e 'CUFF.2100.2.11' > $OutDir/PredictedPass_ed.gff3
cp $PGNGff $OutDir/PGN_predictedPass_ed.gff3
Note that at this stage all codingquary genes contain . characters rather than _ characters
for Assembly in $(ls repeat_masked/*/*/*/*_contigs_softmasked_repeatmasker_TPSI_appended.fa | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi'| grep -e 'ncbi' -e 'Fus2_canu_new' | grep -v -e 'Fus2_canu_new' -e 'A23' -e '125' -e 'N139' | grep -v 'old' | grep 'A8'); do
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
OutDir=gene_pred/final_genes/$Organism/$Strain/edited
mkdir -p $OutDir
BrakerGff=$(ls gene_pred/braker/$Organism/"$Strain"_braker/*/augustus.gff3)
CodingQuaryGff=$(ls gene_pred/codingquary/$Organism/$Strain/out/PredictedPass.gff3)
PGNGff=$(ls gene_pred/codingquary/$Organism/$Strain/out/PGN_predictedPass.gff3)
if [[ $Strain == 'A28_ncbi' ]]; then
cp $BrakerGff $OutDir/final_genes_Braker_ed.gff3
cat $CodingQuaryGff | grep -v -w -e 'CUFF.7109.1.50' > $OutDir/PredictedPass_ed.gff3
cp $PGNGff $OutDir/PGN_predictedPass_ed.gff3
elif [[ $Strain == 'CB3_ncbi' ]]; then
cp $BrakerGff $OutDir/final_genes_Braker_ed.gff3
cat $CodingQuaryGff | grep -v -w -e 'CUFF.10490.1.97' > $OutDir/PredictedPass_ed.gff3
cp $PGNGff $OutDir/PGN_predictedPass_ed.gff3
elif [[ $Strain == 'PG_ncbi' ]]; then
cp $BrakerGff $OutDir/final_genes_Braker_ed.gff3
cat $CodingQuaryGff | grep -v -w -e 'CUFF.7109.1.38' -e 'CUFF.11875.1.103' > $OutDir/PredictedPass_ed.gff3
cp $PGNGff $OutDir/PGN_predictedPass_ed.gff3
else
cp $BrakerGff $OutDir/final_genes_Braker_ed.gff3
cp $CodingQuaryGff $OutDir/PredictedPass_ed.gff3
cp $PGNGff $OutDir/PGN_predictedPass_ed.gff3
fi
done
for EditDir in $(ls -d gene_pred/final_genes/*/*/edited | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi'| grep -e 'Fus2_canu_new' -e 'ncbi' | grep -v 'Fus2_canu_new' | grep -e '125' -e 'A23' -e 'N139' | grep 'N139'); do
Strain=$(echo $EditDir | rev | cut -d '/' -f2 | rev)
Organism=$(echo $EditDir | rev | cut -d '/' -f3 | rev)
echo "$Organism - $Strain"
Assembly=$(ls repeat_masked/$Organism/$Strain/*/*_contigs_softmasked_repeatmasker_TPSI_appended.fa)
BrakerGff=$EditDir/final_genes_Braker_ed.gff3
CodingQuaryGff=$EditDir/PredictedPass_ed.gff3
PGNGff=$EditDir/PGN_predictedPass_ed.gff3
# ManGff=$EditDir/manual_annotations.gff3
AddDir=$EditDir/additional
FinalDir=gene_pred/final_genes/$Organism/$Strain/final
AddGenesList=$AddDir/additional_genes.txt
AddGenesGff=$AddDir/additional_genes.gff
# FinalGff=$AddDir/combined_genes.gff
mkdir -p $AddDir
mkdir -p $FinalDir
bedtools intersect -v -a $CodingQuaryGff -b $BrakerGff | grep 'gene'| cut -f2 -d'=' | cut -f1 -d';' > $AddGenesList
bedtools intersect -v -a $PGNGff -b $BrakerGff | grep 'gene'| cut -f2 -d'=' | cut -f1 -d';' >> $AddGenesList
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation
$ProgDir/gene_list_to_gff.pl $AddGenesList $CodingQuaryGff CodingQuarry_v2.0 ID CodingQuary > $AddGenesGff
$ProgDir/gene_list_to_gff.pl $AddGenesList $PGNGff PGNCodingQuarry_v2.0 ID CodingQuary >> $AddGenesGff
ProgDir=/home/armita/git_repos/emr_repos/tools/gene_prediction/codingquary
$ProgDir/add_CodingQuary_features.pl $AddGenesGff $Assembly > $FinalDir/final_genes_CodingQuary.gff3
$ProgDir/gff2fasta.pl $Assembly $FinalDir/final_genes_CodingQuary.gff3 $FinalDir/final_genes_CodingQuary
cp $BrakerGff $FinalDir/final_genes_Braker.gff3
$ProgDir/gff2fasta.pl $Assembly $FinalDir/final_genes_Braker.gff3 $FinalDir/final_genes_Braker
# cp $ManGff $FinalDir/final_genes_manual.gff3
# $ProgDir/gff2fasta.pl $Assembly $FinalDir/final_genes_manual.gff3 $FinalDir/final_genes_manual
cat $FinalDir/final_genes_Braker.pep.fasta $FinalDir/final_genes_CodingQuary.pep.fasta | sed -r 's/\*/X/g' > $FinalDir/final_genes_combined.pep.fasta
cat $FinalDir/final_genes_Braker.cdna.fasta $FinalDir/final_genes_CodingQuary.cdna.fasta > $FinalDir/final_genes_combined.cdna.fasta
cat $FinalDir/final_genes_Braker.gene.fasta $FinalDir/final_genes_CodingQuary.gene.fasta > $FinalDir/final_genes_combined.gene.fasta
cat $FinalDir/final_genes_Braker.upstream3000.fasta $FinalDir/final_genes_CodingQuary.upstream3000.fasta > $FinalDir/final_genes_combined.upstream3000.fasta
GffBraker=$FinalDir/final_genes_CodingQuary.gff3
GffQuary=$FinalDir/final_genes_Braker.gff3
# GffManual=$FinalDir/final_genes_manual.gff3
GffAppended=$FinalDir/final_genes_appended.gff3
cat $GffBraker $GffQuary > $GffAppended
# cat $GffBraker $GffQuary $GffManual > $GffAppended
done
for Assembly in $(ls gene_pred/final_genes/*/*/final/final_genes_combined.gene.fasta | grep -e 'Fus2' -e '125' -e 'A23' -e 'A13' -e 'A28' -e 'CB3' -e 'PG' -e 'A8' -e 'N139' | grep -e 'ncbi' -e 'canu_new' | grep -v 'old'); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
ProgDir=/home/armita/git_repos/emr_repos/tools/gene_prediction/busco
# BuscoDB="Fungal"
BuscoDB=$(ls -d /home/groups/harrisonlab/dbBusco/sordariomyceta_odb9)
OutDir=gene_pred/busco/$Organism/$Strain/genes
qsub $ProgDir/sub_busco2.sh $Assembly $BuscoDB $OutDir
done
for File in $(ls gene_pred/busco/*/*/genes/*/short_summary_*.txt); do
echo $File;
cat $File | grep -e '(C)' -e 'Total';
done
The genome was searched in six reading frames for any start codon and following translated identification of a start codon translating sequence until a stop codon was found. This is based upon the atg.pl script used in paper describing the P. infestans genome. Additional functionality was added to this script by also printing ORFs in .gff format.
ProgDir=/home/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
for Genome in $(ls repeat_masked/F.*/*/*/*_contigs_unmasked.fa | grep -w -e 'Fus2'); do
qsub $ProgDir/run_ORF_finder.sh $Genome
done
The Gff files from the the ORF finder are not in true Gff3 format. These were corrected using the following commands:
ProgDir=~/git_repos/emr_repos/tools/seq_tools/feature_annotation
for ORF_Gff in $(ls gene_pred/ORF_finder/*/*/*_ORF.gff | grep -v '_F_atg_' | grep -v '_R_atg_'); do
ORF_Gff_mod=$(echo $ORF_Gff | sed 's/_ORF.gff/_ORF_corrected.gff3/g')
echo ""
echo "Correcting the following file:"
echo $ORF_Gff
echo "Redirecting to:"
echo $ORF_Gff_mod
$ProgDir/gff_corrector.pl $ORF_Gff > $ORF_Gff_mod
done
#Functional annotation
Interproscan was used to give gene models functional annotations. Annotation was run using the commands below:
Note: This is a long-running script. As such, these commands were run using 'screen' to allow jobs to be submitted and monitored in the background. This allows the session to be disconnected and reconnected over time.
Screen ouput detailing the progress of submission of interporscan jobs was redirected to a temporary output file named interproscan_submission.log .
screen -a
cd /home/groups/harrisonlab/project_files/fusarium
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan
for Genes in $(ls gene_pred/final_genes/F.*/*/*/final_genes_combined.pep.fasta | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi'| grep -e 'Fus2_canu_new' -e 'ncbi' | grep 'ncbi' | grep -e '125' -e 'A23' -e 'N139' ); do
echo $Genes
$ProgDir/sub_interproscan.sh $Genes
done 2>&1 | tee -a interproscan_submisison.log
Following interproscan annotation split files were combined using the following commands:
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan
for Proteins in $(ls gene_pred/final_genes/F.*/*/*/final_genes_combined.pep.fasta | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep -e 'Fus2_canu_new' -e 'ncbi' | grep 'ncbi' | grep -e '125' -e 'A23' -e 'N139'); do
Strain=$(echo $Proteins | rev | cut -d '/' -f3 | rev)
Organism=$(echo $Proteins | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
echo $Strain
InterProRaw=gene_pred/interproscan/$Organism/$Strain/raw
$ProgDir/append_interpro.sh $Proteins $InterProRaw
done
for Proteome in $(ls gene_pred/final_genes/F.*/*/*/final_genes_combined.pep.fasta | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi'| grep -e 'Fus2_canu_new' -e 'ncbi' | grep 'ncbi' | grep -e '125' -e 'A23' -e 'N139'); do
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
OutDir=gene_pred/swissprot/$Organism/$Strain
SwissDbDir=../../uniprot/swissprot
SwissDbName=uniprot_sprot
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/swissprot
qsub $ProgDir/sub_swissprot.sh $Proteome $OutDir $SwissDbDir $SwissDbName
done
for SwissTable in $(ls gene_pred/swissprot/*/*/swissprot_vJul2016_10_hits.tbl | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep -e 'Fus2_canu_new' -e 'ncbi' | grep 'ncbi' | grep -e '125' -e 'A23' -e 'N139'); do
# SwissTable=gene_pred/swissprot/Fus2/swissprot_v2015_10_hits.tbl
Strain=$(echo $SwissTable | rev | cut -f2 -d '/' | rev)
Organism=$(echo $SwissTable | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
OutTable=gene_pred/swissprot/$Organism/$Strain/swissprot_vJul2016_tophit_parsed.tbl
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/swissprot
$ProgDir/swissprot_parser.py --blast_tbl $SwissTable --blast_db_fasta ../../uniprot/swissprot/uniprot_sprot.fasta > $OutTable
done
Carbohydrte active enzymes were idnetified using CAZYfollowing recomendations at http://csbl.bmb.uga.edu/dbCAN/download/readme.txt :
for Proteome in $(ls gene_pred/final_genes/F.*/*/*/final_genes_combined.pep.fasta | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep -e 'Fus2_canu_new' -e 'ncbi' | grep -v 'ncbi'); do
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
OutDir=gene_pred/CAZY/$Organism/$Strain
mkdir -p $OutDir
Prefix="$Strain"_CAZY
CazyHmm=../../dbCAN/dbCAN-fam-HMMs.txt
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/HMMER
qsub $ProgDir/sub_hmmscan.sh $CazyHmm $Proteome $Prefix $OutDir
done
The Hmm parser was used to filter hits by an E-value of E1x10-5 or E1x10-e3 if they had a hit over a length of X %.
Those proteins with a signal peptide were extracted from the list and gff files representing these proteins made.
for File in $(ls gene_pred/CAZY/*/*/*CAZY.out.dm | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep -e 'Fus2_canu_new' -e 'ncbi'); do
Strain=$(echo $File | rev | cut -f2 -d '/' | rev)
Organism=$(echo $File | rev | cut -f3 -d '/' | rev)
OutDir=$(dirname $File)
echo "$Organism - $Strain"
ProgDir=/home/groups/harrisonlab/dbCAN
$ProgDir/hmmscan-parser.sh $OutDir/"$Strain"_CAZY.out.dm > $OutDir/"$Strain"_CAZY.out.dm.ps
CazyHeaders=$(echo $File | sed 's/.out.dm/_headers.txt/g')
cat $OutDir/"$Strain"_CAZY.out.dm.ps | cut -f3 | sort | uniq > $CazyHeaders
echo "number of CAZY proteins identified:"
cat $CazyHeaders | wc -l
# Gff=$(ls gene_pred/codingquary/$Organism/$Strain/final/final_genes_appended.gff3)
Gff=$(ls gene_pred/final_genes/$Organism/$Strain/final/final_genes_appended.gff3)
CazyGff=$OutDir/"$Strain"_CAZY.gff
ProgDir=/home/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_gff_for_sigP_hits.pl $CazyHeaders $Gff CAZyme ID > $CazyGff
echo "number of CAZY genes identified:"
cat $CazyGff | grep -w 'gene' | wc -l
SecretedProts=$(ls gene_pred/final_genes_signalp-4.1/$Organism/$Strain/"$Strain"_final_sp_no_trans_mem.aa)
SecretedHeaders=$(echo $SecretedProts | sed 's/.aa/_headers.txt/g')
cat $SecretedProts | grep '>' | tr -d '>' > $SecretedHeaders
CazyGffSecreted=$OutDir/"$Strain"_CAZY_secreted.gff
$ProgDir/extract_gff_for_sigP_hits.pl $SecretedHeaders $CazyGff Secreted_CAZyme ID > $CazyGffSecreted
echo "number of Secreted CAZY proteins identified:"
cat $CazyGffSecreted | grep -w 'mRNA' | cut -f9 | tr -d 'ID=' | cut -f1 -d ';' > $OutDir/"$Strain"_CAZY_secreted_headers.txt
cat $OutDir/"$Strain"_CAZY_secreted_headers.txt | wc -l
echo "number of Secreted CAZY genes identified:"
cat $CazyGffSecreted | grep -w 'gene' | wc -l
# cat $OutDir/"$Strain"_CAZY_secreted_headers.txt | cut -f1 -d '.' | sort | uniq | wc -l
done > tmp.txt
F.oxysporum_fsp_cepae - 125_ncbi
number of CAZY proteins identified:
925
number of CAZY genes identified:
925
number of Secreted CAZY proteins identified:
386
number of Secreted CAZY genes identified:
386
F.oxysporum_fsp_cepae - A13_ncbi
number of CAZY proteins identified:
965
number of CAZY genes identified:
965
number of Secreted CAZY proteins identified:
397
number of Secreted CAZY genes identified:
397
F.oxysporum_fsp_cepae - A23_ncbi
number of CAZY proteins identified:
930
number of CAZY genes identified:
930
number of Secreted CAZY proteins identified:
387
number of Secreted CAZY genes identified:
387
F.oxysporum_fsp_cepae - A28_ncbi
number of CAZY proteins identified:
919
number of CAZY genes identified:
919
number of Secreted CAZY proteins identified:
376
number of Secreted CAZY genes identified:
376
F.oxysporum_fsp_cepae - CB3_ncbi
number of CAZY proteins identified:
909
number of CAZY genes identified:
909
number of Secreted CAZY proteins identified:
383
number of Secreted CAZY genes identified:
383
F.oxysporum_fsp_cepae - Fus2_canu_new
number of CAZY proteins identified:
940
number of CAZY genes identified:
940
number of Secreted CAZY proteins identified:
386
number of Secreted CAZY genes identified:
386
F.oxysporum_fsp_cepae - PG_ncbi
number of CAZY proteins identified:
914
number of CAZY genes identified:
914
number of Secreted CAZY proteins identified:
381
number of Secreted CAZY genes identified:
381
F.oxysporum_fsp_narcissi - N139_ncbi
number of CAZY proteins identified:
961
number of CAZY genes identified:
961
number of Secreted CAZY proteins identified:
399
number of Secreted CAZY genes identified:
399
F.proliferatum - A8_ncbi
number of CAZY proteins identified:
819
number of CAZY genes identified:
819
number of Secreted CAZY proteins identified:
341
number of Secreted CAZY genes identified:
341
Note - the CAZY genes identified may need further filtering based on e value and cuttoff length - see below:
Cols in yourfile.out.dm.ps:
- Family HMM
- HMM length
- Query ID
- Query length
- E-value (how similar to the family HMM)
- HMM start
- HMM end
- Query start
- Query end
- Coverage
-
For fungi, use E-value < 1e-17 and coverage > 0.45
-
The best threshold varies for different CAZyme classes (please see http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4132414/ for details). Basically to annotate GH proteins, one should use a very relax coverage cutoff or the sensitivity will be low (Supplementary Tables S4 and S9); (ii) to annotate CE families a very stringent E-value cutoff and coverage cutoff should be used; otherwise the precision will be very low due to a very high false positive rate (Supplementary Tables S5 and S10)
for CAZY in $(ls gene_pred/CAZY/*/*/*_CAZY.out.dm.ps | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep -e 'Fus2_canu_new' -e 'ncbi' | grep -e 'Fus2' -e 'N139'); do
Strain=$(echo $CAZY | rev | cut -f2 -d '/' | rev)
Organism=$(echo $CAZY | rev | cut -f3 -d '/' | rev)
OutDir=$(dirname $CAZY)
echo "$Organism - $Strain"
Secreted=$(ls gene_pred/final_genes_signalp-4.1/$Organism/$Strain/*_final_sp_no_trans_mem_headers.txt)
Gff=gene_pred/final_genes/$Organism/$Strain/final/final_genes_appended.gff3
ProgDir=/home/armita/git_repos/emr_repos/tools/pathogen/CAZY
$ProgDir/summarise_CAZY.py --cazy $CAZY --inp_secreted $Secreted --inp_gff $Gff --summarise_family --trim_gene_id 2 --kubicek_2014
done | less -S
F.oxysporum_fsp_cepae - 125_ncbi
AA - 70
GT - 3
CE - 64
CBM - 46
PL - 25
GH - 177
F.oxysporum_fsp_cepae - A13_ncbi
AA - 67
GT - 2
CE - 71
CBM - 51
PL - 24
GH - 182
F.oxysporum_fsp_cepae - A23_ncbi
AA - 69
GT - 3
CE - 67
CBM - 48
PL - 25
GH - 175
F.oxysporum_fsp_cepae - A28_ncbi
AA - 63
GT - 2
CE - 64
CBM - 49
PL - 22
GH - 176
F.oxysporum_fsp_cepae - CB3_ncbi
AA - 64
GT - 3
CE - 69
CBM - 42
PL - 25
GH - 179
F.oxysporum_fsp_cepae - Fus2_canu_new
AA - 69
GT - 3
CE - 66
CBM - 47
PL - 26
GH - 174
F.oxysporum_fsp_cepae - PG_ncbi
AA - 66
GT - 2
CE - 65
CBM - 48
PL - 27
GH - 173
F.oxysporum_fsp_narcissi - N139_ncbi
AA - 68
GT - 1
CE - 66
CBM - 41
PL - 27
GH - 195
F.proliferatum - A8_ncbi
AA - 64
GT - 3
CE - 64
CBM - 28
PL - 26
GH - 156
F.oxysporum_fsp_cepae - 125_ncbi
B-Galactosidases - 3
A-Galactosidases - 4
Polygalacturonase - 13
A-Arabinosidases - 26
Xylanases - 11
Polygalacturonate lyases - 26
B-Glucuronidases - 4
B-Glycosidases - 13
Cellulases - 20
Xyloglucanases - 1
F.oxysporum_fsp_cepae - A13_ncbi
B-Galactosidases - 3
A-Galactosidases - 3
Polygalacturonase - 14
A-Arabinosidases - 28
Xylanases - 11
Polygalacturonate lyases - 26
B-Glucuronidases - 3
B-Glycosidases - 13
Cellulases - 22
Xyloglucanases - 1
F.oxysporum_fsp_cepae - A23_ncbi
B-Galactosidases - 3
B-Glucuronidases - 4
Polygalacturonase - 13
A-Arabinosidases - 26
Xylanases - 11
Polygalacturonate lyases - 26
A-Galactosidases - 4
B-Glycosidases - 13
Cellulases - 21
Xyloglucanases - 1
F.oxysporum_fsp_cepae - A28_ncbi
B-Galactosidases - 2
A-Galactosidases - 4
Polygalacturonase - 13
A-Arabinosidases - 29
Xylanases - 11
Polygalacturonate lyases - 24
B-Glucuronidases - 4
B-Glycosidases - 13
Cellulases - 21
Xyloglucanases - 1
F.oxysporum_fsp_cepae - CB3_ncbi
B-Galactosidases - 2
B-Glucuronidases - 4
Polygalacturonase - 12
A-Arabinosidases - 28
Xylanases - 10
Polygalacturonate lyases - 26
A-Galactosidases - 4
B-Glycosidases - 14
Cellulases - 21
Xyloglucanases - 1
F.oxysporum_fsp_cepae - Fus2_canu_new
B-Galactosidases - 2
A-Galactosidases - 4
Polygalacturonase - 13
A-Arabinosidases - 26
Xylanases - 11
Polygalacturonate lyases - 27
B-Glucuronidases - 4
B-Glycosidases - 13
Cellulases - 20
Xyloglucanases - 1
F.oxysporum_fsp_cepae - PG_ncbi
B-Galactosidases - 2
A-Galactosidases - 4
Polygalacturonase - 12
A-Arabinosidases - 28
Xylanases - 10
Polygalacturonate lyases - 26
B-Glucuronidases - 4
B-Glycosidases - 12
Cellulases - 20
Xyloglucanases - 1
F.oxysporum_fsp_narcissi - N139_ncbi
B-Galactosidases - 2
B-Glucuronidases - 4
Polygalacturonase - 12
A-Arabinosidases - 28
Xylanases - 11
Polygalacturonate lyases - 27
A-Galactosidases - 4
B-Glycosidases - 17
Cellulases - 20
Xyloglucanases - 1
F.proliferatum - A8_ncbi
B-Galactosidases - 2
A-Galactosidases - 3
Polygalacturonase - 11
A-Arabinosidases - 20
Xylanases - 9
Polygalacturonate lyases - 26
B-Glucuronidases - 2
B-Glycosidases - 10
Cellulases - 21
Xyloglucanases - 1
Antismash was run to identify clusters of secondary metabolite genes within the genome. Antismash was run using the weserver at: http://antismash.secondarymetabolites.org
Results of web-annotation of gene clusters within the assembly were downloaded to the following directories:
for Assembly in $(ls repeat_masked/*/*/*/*_contigs_softmasked_repeatmasker_TPSI_appended.fa | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep -e 'canu_new' -e 'ncbi'); do
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
OutDir=analysis/antismash/$Organism/$Strain
mkdir -p $OutDir
done
for Zip in $(ls analysis/antismash/*/*/*.zip | grep -e 'ncbi' | grep 'A23'); do
OutDir=$(dirname $Zip)
unzip -d $OutDir $Zip
done
# AntiSmash=analysis/antismash/79c1471f-4a2b-41f7-ba36-18ba94675f59/contig_1_pilon.final.gbk
for AntiSmash in $(ls analysis/antismash/*/*/*/*.final.gbk | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep -e 'Fus2_canu_new' -e 'ncbi' | grep 'A23'); do
Organism=$(echo $AntiSmash | rev | cut -f4 -d '/' | rev)
Strain=$(echo $AntiSmash | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
OutDir=analysis/antismash/$Organism/$Strain
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/secondary_metabolites
$ProgDir/antismash2gff.py --inp_antismash $AntiSmash > $OutDir/"$Strain"_secondary_metabolite_regions.gff
printf "Number of clusters detected:\t"
cat $OutDir/"$Strain"_secondary_metabolite_regions.gff | grep 'antismash_cluster' | wc -l
# GeneGff=gene_pred/final_genes/F.oxysporum_fsp_cepae/Fus2_canu_new/final/final_genes_appended.gff3
GeneGff=gene_pred/final_genes/$Organism/$Strain/final/final_genes_appended.gff3
bedtools intersect -u -a $GeneGff -b $OutDir/"$Strain"_secondary_metabolite_regions.gff > $OutDir/metabolite_cluster_genes.gff
cat $OutDir/metabolite_cluster_genes.gff | grep -w 'mRNA' | cut -f9 | cut -f2 -d '=' | cut -f1 -d ';' > $OutDir/metabolite_cluster_gene_headers.txt
printf "Number of predicted proteins in clusters:\t"
cat $OutDir/metabolite_cluster_gene_headers.txt | wc -l
printf "Number of predicted genes in clusters:\t"
cat $OutDir/metabolite_cluster_genes.gff | grep -w 'gene' | wc -l
done
These clusters represented the following genes. Note that these numbers just show the number of intersected genes with gff clusters and are not confirmed by function
F.oxysporum_fsp_cepae - 125_ncbi
Number of clusters detected: 46
Number of predicted proteins in clusters: 568
Number of predicted genes in clusters: 559
F.oxysporum_fsp_cepae - A13_ncbi
Number of clusters detected: 49
Number of predicted proteins in clusters: 626
Number of predicted genes in clusters: 620
F.oxysporum_fsp_cepae - A23_ncbi
Number of clusters detected: 35
Number of predicted proteins in clusters: 436
Number of predicted genes in clusters: 432
F.oxysporum_fsp_cepae - A28_ncbi
Number of clusters detected: 46
Number of predicted proteins in clusters: 668
Number of predicted genes in clusters: 656
F.oxysporum_fsp_cepae - CB3_ncbi
Number of clusters detected: 44
Number of predicted proteins in clusters: 613
Number of predicted genes in clusters: 604
F.oxysporum_fsp_cepae - Fus2_canu_new
Number of clusters detected: 50
Number of predicted proteins in clusters: 720
Number of predicted genes in clusters: 703
F.oxysporum_fsp_cepae - PG_ncbi
Number of clusters detected: 45
Number of predicted proteins in clusters: 646
Number of predicted genes in clusters: 638
F.oxysporum_fsp_narcissi - N139_ncbi
Number of clusters detected: 46
Number of predicted proteins in clusters: 606
Number of predicted genes in clusters: 601
F.proliferatum - A8_ncbi
Number of clusters detected: 58
Number of predicted proteins in clusters: 762
Number of predicted genes in clusters: 762
for Antismash in $(ls analysis/antismash/F.*/*/*_secondary_metabolite_regions.gff | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep -e 'Fus2_canu_new' -e 'ncbi' | grep 'Fus2'); do
Organism=$(echo $Antismash | rev | cut -f3 -d '/' | rev)
Strain=$(echo $Antismash | rev | cut -f2 -d '/' | rev)
echo "$Organism - $Strain"
cat $Antismash | cut -f3 | sort | uniq -c
done
F.oxysporum_fsp_cepae - 125_ncbi
3 indole
11 nrps
1 nrps-indole
4 nrps-t1pks
10 other
6 t1pks
1 t3pks
10 terpene
F.oxysporum_fsp_cepae - A13_ncbi
3 indole
12 nrps
3 nrps-t1pks
10 other
9 t1pks
1 t1pks-nrps
1 t3pks
10 terpene
F.oxysporum_fsp_cepae - A23_ncbi
3 indole
11 nrps
1 nrps-indole
4 nrps-t1pks
10 other
6 t1pks
1 t3pks
10 terpene
F.oxysporum_fsp_cepae - A28_ncbi
3 indole
1 indole-nrps
11 nrps
2 nrps-t1pks
10 other
8 t1pks
1 t1pks-nrps
1 t3pks
9 terpene
F.oxysporum_fsp_cepae - CB3_ncbi
1 bacteriocin
3 indole
1 indole-nrps
11 nrps
3 nrps-t1pks
9 other
6 t1pks
1 t1pks-indole
1 t3pks
8 terpene
F.oxysporum_fsp_cepae - Fus2_canu_new
3 indole
1 indole-nrps
11 nrps
3 nrps-t1pks
11 other
7 t1pks
1 t1pks-nrps
1 t3pks
12 terpene
F.oxysporum_fsp_cepae - PG_ncbi
3 indole
1 indole-nrps
11 nrps
3 nrps-t1pks
10 other
5 t1pks
1 t1pks-indole
1 t3pks
9 terpene
1 terpene-t1pks
F.oxysporum_fsp_narcissi - N139_ncbi
3 indole
1 indole-nrps
13 nrps
3 nrps-t1pks
10 other
6 t1pks
1 t1pks-terpene
1 t3pks
8 terpene
F.proliferatum - A8_ncbi
1 hserlactone
3 indole
13 nrps
1 nrps-indole
6 nrps-t1pks
9 other
14 t1pks
1 t1pks-terpene
1 t3pks
9 terpene
Small secreted cysteine rich proteins were identified within secretomes. These proteins may be identified by EffectorP, but this approach allows direct control over what constitutes a SSCP.
for Secretome in $(ls gene_pred/final_genes_signalp-4.1/*/*/*_final_sp_no_trans_mem.aa | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi'| grep -e 'Fus2_canu_new' -e 'ncbi'); do
Strain=$(echo $Secretome| rev | cut -f2 -d '/' | rev)
Organism=$(echo $Secretome | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
OutDir=analysis/sscp/$Organism/$Strain
mkdir -p $OutDir
ProgDir=/home/armita/git_repos/emr_repos/tools/pathogen/sscp
$ProgDir/sscp_filter.py --inp_fasta $Secretome --max_length 300 --threshold 3 --out_fasta $OutDir/"$Strain"_sscp_all_results.fa
cat $OutDir/"$Strain"_sscp_all_results.fa | grep 'Yes' > $OutDir/"$Strain"_sscp.fa
printf "number of SSC-rich genes:\t"
cat $OutDir/"$Strain"_sscp.fa | grep '>' | tr -d '>' | cut -f1 -d '.' | sort | uniq | wc -l
printf "Number of effectors predicted by EffectorP:\t"
EffectorP=$(ls analysis/effectorP/$Organism/$Strain/*_EffectorP_secreted_headers.txt)
cat $EffectorP | wc -l
printf "Number of SSCPs predicted by both effectorP and this approach: \t"
cat $OutDir/"$Strain"_sscp.fa | grep '>' | tr -d '>' > $OutDir/"$Strain"_sscp_headers.txt
cat $OutDir/"$Strain"_sscp_headers.txt $EffectorP | cut -f1 | sort | uniq -d | wc -l
echo ""
done > tmp.txt
F.oxysporum_fsp_cepae - 125_ncbi
% cysteine content threshold set to: 3
maximum length set to: 300
No. short-cysteine rich proteins in input fasta: 289
number of SSC-rich genes: 287
Number of effectors predicted by EffectorP: 357
Number of SSCPs predicted by both effectorP and this approach: 221
F.oxysporum_fsp_cepae - A13_ncbi
% cysteine content threshold set to: 3
maximum length set to: 300
No. short-cysteine rich proteins in input fasta: 318
number of SSC-rich genes: 316
Number of effectors predicted by EffectorP: 364
Number of SSCPs predicted by both effectorP and this approach: 237
F.oxysporum_fsp_cepae - A23_ncbi
% cysteine content threshold set to: 3
maximum length set to: 300
No. short-cysteine rich proteins in input fasta: 286
number of SSC-rich genes: 284
Number of effectors predicted by EffectorP: 355
Number of SSCPs predicted by both effectorP and this approach: 218
F.oxysporum_fsp_cepae - A28_ncbi
% cysteine content threshold set to: 3
maximum length set to: 300
No. short-cysteine rich proteins in input fasta: 283
number of SSC-rich genes: 281
Number of effectors predicted by EffectorP: 346
Number of SSCPs predicted by both effectorP and this approach: 214
F.oxysporum_fsp_cepae - CB3_ncbi
% cysteine content threshold set to: 3
maximum length set to: 300
No. short-cysteine rich proteins in input fasta: 279
number of SSC-rich genes: 277
Number of effectors predicted by EffectorP: 357
Number of SSCPs predicted by both effectorP and this approach: 213
F.oxysporum_fsp_cepae - Fus2_canu_new
% cysteine content threshold set to: 3
maximum length set to: 300
No. short-cysteine rich proteins in input fasta: 288
number of SSC-rich genes: 286
Number of effectors predicted by EffectorP: 355
Number of SSCPs predicted by both effectorP and this approach: 218
F.oxysporum_fsp_cepae - PG_ncbi
% cysteine content threshold set to: 3
maximum length set to: 300
No. short-cysteine rich proteins in input fasta: 287
number of SSC-rich genes: 285
Number of effectors predicted by EffectorP: 337
Number of SSCPs predicted by both effectorP and this approach: 214
F.oxysporum_fsp_narcissi - N139_ncbi
% cysteine content threshold set to: 3
maximum length set to: 300
No. short-cysteine rich proteins in input fasta: 344
number of SSC-rich genes: 342
Number of effectors predicted by EffectorP: 407
Number of SSCPs predicted by both effectorP and this approach: 263
F.proliferatum - A8_ncbi
% cysteine content threshold set to: 3
maximum length set to: 300
No. short-cysteine rich proteins in input fasta: 197
number of SSC-rich genes: 197
Number of effectors predicted by EffectorP: 258
Number of SSCPs predicted by both effectorP and this approach: 147
#Genomic analysis
BLast searches were used to identify which genes had homologs on which chromosomes of the Fusarium lycopersici genome.
FoLGenomeFa=assembly/external_group/F.oxysporum_fsp_lycopersici/4287_chromosomal/ensembl/Fusarium_oxysporum_chromosome_and_additional_contigs.fa
for Proteome in $(ls gene_pred/final_genes/F.*/*/*/final_genes_combined.pep.fasta | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep 'ncbi'); do
# for Proteome in $(ls assembly/external_group/F.oxysporum/fo47/broad/fusarium_oxysporum_fo47_1_proteins.fasta); do
# for Proteome in $(ls gene_pred/external_group/F.oxysporum_fsp_lycopersici/4287/Fusox1/Fusox1_GeneCatalog_proteins_20110522_parsed.fa); do
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
OutDir=analysis/blast_homology/$Organism/$Strain
ProgDir=/home/armita/git_repos/emr_repos/tools/pathogen/blast
qsub $ProgDir/run_blast2csv.sh $Proteome protein $FoLGenomeFa $OutDir
done
Convert top blast hits into gff annotations
for BlastHitsCsv in $(ls analysis/blast_homology/F.*/*/4287_chromosomal_final_genes_combined.pep.fasta_hits.csv | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep 'ncbi'); do
Organism=$(echo $BlastHitsCsv | rev | cut -f3 -d '/' | rev)
Strain=$(echo $BlastHitsCsv | rev | cut -f2 -d '/' | rev)
echo "$Organism - $Strain"
HitsGff=$(echo $BlastHitsCsv | sed 's/.csv/.gff/g')
Column2="$Strain"_gene_homolog
NumHits=1
ProgDir=/home/armita/git_repos/emr_repos/tools/pathogen/blast
$ProgDir/blast2gff.pl $Column2 $NumHits $BlastHitsCsv > $HitsGff
done
for HitsGff in $(ls analysis/blast_homology/F.*/*/4287_chromosomal_final_genes_combined.pep.fasta_hits.gff | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep 'ncbi'); do
Organism=$(echo $HitsGff | rev | cut -f3 -d '/' | rev)
Strain=$(echo $HitsGff| rev | cut -f2 -d '/' | rev)
echo "$Organism - $Strain"
HitsDir=$(dirname $HitsGff)
FoLGenes=assembly/external_group/F.oxysporum_fsp_lycopersici/4287_chromosomal/ensembl/Fusarium_oxysporum.FO2.31.chr.gff3
FoLIntersect=$HitsDir/4287_chromosomal_final_genes_combined_intersect.bed
bedtools intersect -wo -a $HitsGff -b $FoLGenes > $FoLIntersect
done
Putative pathogenicity and effector related genes were identified within Braker gene models using a number of approaches:
- A) From Augustus gene models - Identifying secreted proteins
- B) From Augustus gene models - Effector identification using EffectorP
- D) From ORF fragments - Signal peptide & RxLR motif
- E) From ORF fragments - Hmm evidence of WY domains
- F) From ORF fragments - Hmm evidence of RxLR effectors
Required programs:
- SignalP-4.1
- TMHMM
Proteins that were predicted to contain signal peptides were identified using the following commands:
SplitfileDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/signal_peptides
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/signal_peptides
CurPath=$PWD
for Proteome in $(ls gene_pred/final_genes/F.*/*/*/final_genes_combined.pep.fasta | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep -e 'Fus2_canu_new' -e 'ncbi' | grep -v 'ncbi'); do
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
SplitDir=gene_pred/final_genes_split/$Organism/$Strain
mkdir -p $SplitDir
BaseName="$Organism""_$Strain"_final_preds
$SplitfileDir/splitfile_500.py --inp_fasta $Proteome --out_dir $SplitDir --out_base $BaseName
for File in $(ls $SplitDir/*_final_preds_*); do
Jobs=$(qstat | grep 'pred_sigP' | wc -l)
while [ $Jobs -gt 20 ]; do
sleep 10
printf "."
Jobs=$(qstat | grep 'pred_sigP' | wc -l)
done
printf "\n"
echo $File
qsub $ProgDir/pred_sigP.sh $File signalp-4.1
done
done
The batch files of predicted secreted proteins needed to be combined into a single file for each strain. This was done with the following commands:
for SplitDir in $(ls -d gene_pred/final_genes_split/*/* | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep -e 'Fus2_canu_new' -e 'ncbi' | grep -v 'ncbi'); do
Strain=$(echo $SplitDir | rev |cut -d '/' -f1 | rev)
Organism=$(echo $SplitDir | rev |cut -d '/' -f2 | rev)
InStringAA=''
InStringNeg=''
InStringTab=''
InStringTxt=''
SigpDir=final_genes_signalp-4.1
for GRP in $(ls -l $SplitDir/*_final_preds_*.fa | rev | cut -d '_' -f1 | rev | sort -n); do
InStringAA="$InStringAA gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_preds_$GRP""_sp.aa";
InStringNeg="$InStringNeg gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_preds_$GRP""_sp_neg.aa";
InStringTab="$InStringTab gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_preds_$GRP""_sp.tab";
InStringTxt="$InStringTxt gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_preds_$GRP""_sp.txt";
done
cat $InStringAA > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_final_sp.aa
cat $InStringNeg > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_final_neg_sp.aa
tail -n +2 -q $InStringTab > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_final_sp.tab
cat $InStringTxt > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_final_sp.txt
done
Some proteins that are incorporated into the cell membrane require secretion. Therefore proteins with a transmembrane domain are not likely to represent cytoplasmic or apoplastic effectors.
Proteins containing a transmembrane domain were identified:
for Proteome in $(ls gene_pred/final_genes/F.*/*/*/final_genes_combined.pep.fasta | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep -e 'Fus2_canu_new' -e 'ncbi' | grep -v 'ncbi'); do
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/transmembrane_helices
qsub $ProgDir/submit_TMHMM.sh $Proteome
done
Those proteins with transmembrane domains were removed from lists of Signal peptide containing proteins
for File in $(ls gene_pred/trans_mem/*/*/*_TM_genes_neg.txt | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep -e 'Fus2_canu_new' -e 'ncbi' | grep -v 'ncbi' ); do
Strain=$(echo $File | rev | cut -f2 -d '/' | rev)
Organism=$(echo $File | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
TmHeaders=$(echo "$File" | sed 's/neg.txt/neg_headers.txt/g')
cat $File | cut -f1 > $TmHeaders
SigP=$(ls gene_pred/final_genes_signalp-4.1/$Organism/$Strain/*_final_sp.aa)
OutDir=$(dirname $SigP)
ProgDir=/home/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_from_fasta.py --fasta $SigP --headers $TmHeaders > $OutDir/"$Strain"_final_sp_no_trans_mem.aa
echo "Number of SigP proteins:"
cat $SigP | grep '>' | wc -l
echo "Number without transmembrane domains:"
cat $OutDir/"$Strain"_final_sp_no_trans_mem.aa | grep '>' | wc -l
echo "Number of gene models:"
cat $OutDir/"$Strain"_final_sp_no_trans_mem.aa | grep '>' | cut -f1 -d't' | sort | uniq |wc -l
done
F.oxysporum_fsp_cepae - Fus2_canu_new
Number of SigP proteins:
1793
Number without transmembrane domains:
1475
Number of gene models:
1449
F.oxysporum_fsp_cepae - A13_ncbi
Number of SigP proteins:
1826
Number without transmembrane domains:
1508
F.oxysporum_fsp_cepae - A23_ncbi
Number of SigP proteins:
1752
Number without transmembrane domains:
1439
F.oxysporum_fsp_cepae - A28_ncbi
Number of SigP proteins:
1772
Number without transmembrane domains:
1450
F.oxysporum_fsp_cepae - CB3_ncbi
Number of SigP proteins:
1763
Number without transmembrane domains:
1439
F.oxysporum_fsp_cepae - PG_ncbi
Number of SigP proteins:
1760
Number without transmembrane domains:
1440
F.oxysporum_fsp_narcissi - N139_ncbi
Number of SigP proteins:
1901
Number without transmembrane domains:
1566
F.proliferatum - A8_ncbi
Number of SigP proteins:
1553
Number without transmembrane domains:
1254
Required programs:
- EffectorP.py
for Proteome in $(ls gene_pred/final_genes/F.*/*/*/final_genes_combined.pep.fasta | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep -e 'Fus2_canu_new' -e 'ncbi' | grep 'Fus2_canu_new'); do
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
BaseName="$Organism"_"$Strain"_EffectorP
OutDir=analysis/effectorP/$Organism/$Strain
ProgDir=~/git_repos/emr_repos/tools/seq_tools/feature_annotation/fungal_effectors
qsub $ProgDir/pred_effectorP.sh $Proteome $BaseName $OutDir
done
Those genes that were predicted as secreted and tested positive by effectorP were identified:
for File in $(ls analysis/effectorP/*/*/*_EffectorP.txt | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep -e 'Fus2_canu_new' -e 'ncbi' | grep -v 'ncbi'); do
Strain=$(echo $File | rev | cut -f2 -d '/' | rev)
Organism=$(echo $File | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
Headers=$(echo "$File" | sed 's/_EffectorP.txt/_EffectorP_headers.txt/g')
cat $File | grep 'Effector' | cut -f1 > $Headers
Secretome=$(ls gene_pred/final_genes_signalp-4.1/$Organism/$Strain/*_final_sp_no_trans_mem.aa)
OutFile=$(echo "$File" | sed 's/_EffectorP.txt/_EffectorP_secreted.aa/g')
ProgDir=/home/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_from_fasta.py --fasta $Secretome --headers $Headers > $OutFile
OutFileHeaders=$(echo "$File" | sed 's/_EffectorP.txt/_EffectorP_secreted_headers.txt/g')
cat $OutFile | grep '>' | tr -d '>' > $OutFileHeaders
cat $OutFileHeaders | wc -l
Gff=$(ls gene_pred/final_genes/$Organism/$Strain/*/final_genes_appended.gff3)
EffectorP_Gff=$(echo "$File" | sed 's/_EffectorP.txt/_EffectorP_secreted.gff/g')
ProgDir=/home/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_gff_for_sigP_hits.pl $OutFileHeaders $Gff effectorP ID > $EffectorP_Gff
cat $EffectorP_Gff | grep -w 'gene' | wc -l
done > tmp.txt
F.oxysporum_fsp_cepae - Fus2_canu_new
355
355
F.oxysporum_fsp_cepae - A13_ncbi
364
364
F.oxysporum_fsp_cepae - A23_ncbi
355
355
F.oxysporum_fsp_cepae - A28_ncbi
346
346
F.oxysporum_fsp_cepae - CB3_ncbi
357
357
F.oxysporum_fsp_cepae - PG_ncbi
337
337
F.oxysporum_fsp_narcissi - N139_ncbi
407
407
F.proliferatum - A8_ncbi
258
258
for Genome in $(ls repeat_masked/F.*/*/*/*_contigs_unmasked.fa | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi'| grep -e 'Fus2_canu_new' -e 'ncbi' | grep -v 'old' | grep -v 'ncbi' ); do
Organism=$(echo "$Genome" | rev | cut -d '/' -f4 | rev)
Strain=$(echo "$Genome" | rev | cut -d '/' -f3 | rev)
BrakerGff=$(ls gene_pred/final_genes/$Organism/"$Strain"/final/final_genes_CodingQuary.gff3)
QuaryGff=$(ls gene_pred/final_genes/$Organism/"$Strain"/final/final_genes_Braker.gff3)
OutDir=analysis/mimps/$Organism/$Strain
mkdir -p "$OutDir"
echo "$Organism - $Strain"
ProgDir="/home/armita/git_repos/emr_repos/tools/pathogen/mimp_finder"
$ProgDir/mimp_finder.pl $Genome $OutDir/"$Strain"_mimps.fa $OutDir/"$Strain"_mimps.gff > $OutDir/"$Strain"_mimps.log
$ProgDir/gffexpander.pl +- 2000 $OutDir/"$Strain"_mimps.gff > $OutDir/"$Strain"_mimps_exp.gff
echo "The number of mimps identified:"
cat $OutDir/"$Strain"_mimps.fa | grep '>' | wc -l
bedtools intersect -u -a $BrakerGff -b $OutDir/"$Strain"_mimps_exp.gff > $OutDir/"$Strain"_genes_in_2kb_mimp.gff
bedtools intersect -u -a $QuaryGff -b $OutDir/"$Strain"_mimps_exp.gff >> $OutDir/"$Strain"_genes_in_2kb_mimp.gff
echo "The following transcripts intersect mimps:"
MimpProtsTxt=$OutDir/"$Strain"_prots_in_2kb_mimp.txt
MimpGenesTxt=$OutDir/"$Strain"_genes_in_2kb_mimp.txt
cat $OutDir/"$Strain"_genes_in_2kb_mimp.gff | grep -w 'mRNA' | cut -f9 | cut -f1 -d';' | cut -f2 -d'=' | sort | uniq > $MimpProtsTxt
cat $OutDir/"$Strain"_genes_in_2kb_mimp.gff | grep -w 'mRNA' | cut -f9 | cut -f1 -d';' | cut -f2 -d'=' | cut -f1 -d '.'| sort | uniq > $MimpGenesTxt
cat $MimpProtsTxt | wc -l
cat $MimpGenesTxt | wc -l
echo ""
done > tmp.txt
F.oxysporum_fsp_cepae - Fus2_canu_new
The number of mimps identified:
153
The following transcripts intersect mimps:
157
155
F.oxysporum_fsp_cepae - 125_ncbi
The number of mimps identified:
140
The following transcripts intersect mimps:
95
95
F.oxysporum_fsp_cepae - A13_ncbi
The number of mimps identified:
55
The following transcripts intersect mimps:
49
49
F.oxysporum_fsp_cepae - A23_ncbi
The number of mimps identified:
136
The following transcripts intersect mimps:
88
88
F.oxysporum_fsp_cepae - A28_ncbi
The number of mimps identified:
35
The following transcripts intersect mimps:
36
36
F.oxysporum_fsp_cepae - CB3_ncbi
The number of mimps identified:
30
The following transcripts intersect mimps:
32
32
F.oxysporum_fsp_cepae - PG_ncbi
The number of mimps identified:
51
The following transcripts intersect mimps:
43
43
F.oxysporum_fsp_narcissi - N139_ncbi
The number of mimps identified:
207
The following transcripts intersect mimps:
120
119
F.proliferatum - A8_ncbi
The number of mimps identified:
6
The following transcripts intersect mimps:
11
11
Those genes that were predicted as secreted and within 2Kb of a MIMP were identified:
for File in $(ls analysis/mimps/*/*/*_genes_in_2kb_mimp.txt | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi'| grep -e 'Fus2_canu_new' -e 'ncbi' | grep -v 'old' | grep 'ncbi' ); do
Strain=$(echo $File | rev | cut -f2 -d '/' | rev | sed 's/_chromosomal//g')
Organism=$(echo $File | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
ProtsFile=$(echo $File | sed 's/genes/prots/g')
Secretome=$(ls gene_pred/final_genes_signalp-4.1/$Organism/$Strain/*_final_sp_no_trans_mem.aa)
OutFile=$(echo "$File" | sed 's/.gff/_secreted.gff/g')
SecretedHeaders=$(echo "$Secretome" | sed 's/.aa/_headers.txt/g')
cat $Secretome | grep '>' | tr -d '>' | sed 's/-p.//g' > $SecretedHeaders
cat $ProtsFile $SecretedHeaders | cut -f1 | sort | uniq -d | wc -l
cat $SecretedHeaders | cut -f1 | cut -f1 -d '.' | sort | uniq | grep -f $File | wc -l
# ProgDir=/home/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
# $ProgDir/extract_gff_for_sigP_hits.pl $SecretedHeaders $File secreted_mimp ID > $OutFile
# cat $OutFile | grep -w 'mRNA' | wc -l
done
F.oxysporum_fsp_cepae - Fus2_canu_new
31
31
F.oxysporum_fsp_cepae - 125_ncbi
24
26
F.oxysporum_fsp_cepae - A13_ncbi
9
9
F.oxysporum_fsp_cepae - A23_ncbi
20
20
F.oxysporum_fsp_cepae - A28_ncbi
5
5
F.oxysporum_fsp_cepae - CB3_ncbi
1
1
F.oxysporum_fsp_cepae - PG_ncbi
3
3
F.oxysporum_fsp_narcissi - N139_ncbi
31
31
F.proliferatum - A8_ncbi
1
1
Secreted proteins with a N-terminal [SG]-P-C-[KR]-P motif
for File in $(ls gene_pred/final_genes_signalp-4.1/F.*/*/*_final_sp_no_trans_mem.aa | grep -e 'Fus2_canu_new' -e 'ncbi'); do
echo $File;
cat $File | grep -B1 "[SG]PC[KR]P" | grep '>' | cut -f1 | tr -d '>' | wc -l;
done
cat gene_pred/final_genes_signalp-4.1/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_canu_new_final_sp_no_trans_mem.aa | grep -B1 "[SG]PC[KR]P" | grep '>' | cut -f1 | tr -d '>' > tmp.txt
cat analysis/expression/Fus2_expressed_genes.tsv | grep -w -f 'tmp.txt' > tmp.tsv
gene_pred/final_genes_signalp-4.1/F.oxysporum_fsp_cepae/125_ncbi/125_ncbi_final_sp_no_trans_mem.aa
43
gene_pred/final_genes_signalp-4.1/F.oxysporum_fsp_cepae/A13_ncbi/A13_ncbi_final_sp_no_trans_mem.aa
50
gene_pred/final_genes_signalp-4.1/F.oxysporum_fsp_cepae/A23_ncbi/A23_ncbi_final_sp_no_trans_mem.aa
42
gene_pred/final_genes_signalp-4.1/F.oxysporum_fsp_cepae/A28_ncbi/A28_ncbi_final_sp_no_trans_mem.aa
45
gene_pred/final_genes_signalp-4.1/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_canu_new_final_sp_no_trans_mem.aa
42
gene_pred/final_genes_signalp-4.1/F.oxysporum_fsp_cepae/CB3_ncbi/CB3_ncbi_final_sp_no_trans_mem.aa
46
gene_pred/final_genes_signalp-4.1/F.oxysporum_fsp_cepae/PG_ncbi/PG_ncbi_final_sp_no_trans_mem.aa
42
gene_pred/final_genes_signalp-4.1/F.oxysporum_fsp_narcissi/N139_ncbi/N139_ncbi_final_sp_no_trans_mem.aa
42
gene_pred/final_genes_signalp-4.1/F.proliferatum/A8_ncbi/A8_ncbi_final_sp_no_trans_mem.aa
40
See pathogen/identifying_LS_chromosomes/characterising_LS_contigs.md
Orthomcl was used to identify orthologous groups between Fusarium spp. genomes
Genomes were grouped by subspecies and orthology was determined within each subspecies group. Orthology was also determined between subspecies groups.
Pathogenic | non-pathogenic | Intermediate |
---|---|---|
125 | A28 | 55 |
A23 | D2 | |
Fus2 | PG |
The Commands used to run this analysis are shown in pathogen/orthology/F.oxysporum_fsp.cepae_pathogen_vs_non-pathogen_orthology.md
Protein sequence of previously characterised SIX genes used to BLAST against assemblies.
ProgDir=/home/armita/git_repos/emr_repos/tools/pathogen/blast
for Assembly in $(ls repeat_masked/*/*/*/*_contigs_unmasked.fa | grep -w -v 'Fus2_canu_new' | grep 'ncbi' | grep -v 'old'); do
echo $Assembly
Query=analysis/blast_homology/Fo_path_genes/Fo_path_genes_CRX.fa
qsub $ProgDir/blast_pipe.sh $Query dna $Assembly
done
Once blast searches had completed, the BLAST hits were converted to GFF annotations:
for BlastHits in $(ls analysis/blast_homology/*/*/*Fo_path_genes_CRX.fa_homologs.csv | grep -v -w 'Fus2_canu_new' | grep 'ncbi'); do
Strain=$(echo $BlastHits | rev | cut -f2 -d '/' | rev)
Organism=$(echo $BlastHits | rev | cut -f3 -d '/' | rev)
ProgDir=/home/armita/git_repos/emr_repos/tools/pathogen/blast
HitsGff=$(echo $BlastHits | sed 's/.csv/.gff/g')
Column2=SIX_homolog
NumHits=12
$ProgDir/blast2gff.pl $Column2 $NumHits $BlastHits > $HitsGff
done
The blast hits were summarised in a single table for all the genomes. The top
identity of the top blast hit in relation to the enquire query sequence was
presented for each blast hit.
OutFile=analysis/blast_homology/Fo_path_genes_CRX_summary.tab
echo "Organism" > tmp2.tab
cat analysis/blast_homology/F.proliferatum/A8/A8_Fo_path_genes_CRX.fa_homologs.csv | cut -f1 >> tmp2.tab
for BlastHits in $(ls analysis/blast_homology/*/*/*Fo_path_genes_CRX.fa_homologs.csv | grep -v -w -e 'Fus2' -e 'Fus2_edited_v1' -e 'Fus2_edited_v2' -e 'Fus2_canu' -e 'Fus2_3' -e '4287' | grep -e 'Fus2_canu_new' -e 'ncbi'); do
Strain=$(echo $BlastHits | rev | cut -f2 -d '/' | rev)
Organism=$(echo $BlastHits | rev | cut -f3 -d '/' | rev)
echo "$Organism" > tmp.tab
echo "$Strain" >> tmp.tab
cat $BlastHits | cut -f10,19 | tail -n +2 | sed -r "s/\t/,/g" >> tmp.tab
paste tmp2.tab tmp.tab > $OutFile
cp $OutFile tmp2.tab
done
rm tmp.tab
rm tmp2.tab
for HitsGff in $(ls analysis/blast_homology/*/*/*Fo_path_genes_CRX.fa_homologs.gff | grep -v 'trinity' | grep -e 'Fus2_canu_new' -e 'ncbi'); do
Strain=$(echo $HitsGff | rev | cut -f2 -d '/' | rev)
Organism=$(echo $HitsGff | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
GffAppended=$(ls gene_pred/final_genes/$Organism/"$Strain"*/final/final_genes_appended.gff3)
OutDir=$(dirname $HitsGff)
SixIntersect=$OutDir/"$Strain"_Fo_path_genes_CRX.fa_hit_genes.bed
bedtools intersect -wao -a $HitsGff -b $GffAppended > $SixIntersect
bedtools intersect -wao -a $HitsGff -b $GffAppended | cut -f9,18 | grep -v 'Parent'
echo ""
done | tr -d ';' | tr -d '"' | sed 's/ID=//g'> analysis/blast_homology/Fo_path_genes/Fo_path_genes_CRX_hit_genes_summary.tab
F.oxysporum_fsp_cepae - Fus2_canu_new
C5_Seq45_BlastHit_1 g13425
CRX1_Seq49_BlastHit_1 g15739
CRX1_Seq49_BlastHit_2 g11056
CRX1_Seq49_BlastHit_3 g59
CRX1_Seq49_BlastHit_4 g16971
CRX2_Seq50_BlastHit_1 g11056
CRX2_Seq50_BlastHit_2 g15739
CRX2_Seq50_BlastHit_3 g59
MIMP_39_ex.19GJ26_B10-1.seq_BlastHit_1 g15724
MIMP_39_ex.19GJ26_B10-1.seq_BlastHit_2 g17045
MIMP_39_ex.19GJ26_B10-1.seq_BlastHit_3 CUFF_4289_1_31
MIMP_39_ex.19GJ26_B10-1.seq_BlastHit_4 CUFF_327_1_65
MIMP_39_ex.19GJ26_B10-1.seq_BlastHit_5 g12255
MIMP_g29_ex.19GJ26_H12-1.seq_BlastHit_1 g16905
MIMP_g16_ex.19GJ26_B07-1.seq_BlastHit_1 g16179
MIMP_g41_ex.45DG31-1.seq_BlastHit_1 g17144
MIMP_g41_ex.45DG31-1.seq_BlastHit_2 NS_02766
Fusarium_oxysporum_f._sp._lycopersici_isolate_FOL-MM10_secreted_in_xylem_3_(SIX3)_gene,_complete_cds_BlastHit_1 g16847
Fusarium_oxysporum_f._sp._lycopersici_isolate_FOL-MM10_secreted_in_xylem_3_(SIX3)_gene,_complete_cds_BlastHit_2 g16190
Fusarium_oxysporum_f._sp._lycopersici_isolate_BFOL-51_secreted_in_xylem_5_(SIX5)_gene,_partial_cds_BlastHit_1 g16849
Fusarium_oxysporum_f._sp._lycopersici_secreted_in_xylem_Six7_(SIX7)_mRNA,_complete_cds_BlastHit_1 CUFF_4344_1_33
Fusarium_oxysporum_f._sp._lycopersici_strain_Fol007_Six9_(SIX9)_mRNA,_complete_cds_BlastHit_1 g16273
Fusarium_oxysporum_f._sp._lycopersici_strain_Fol007_Six9_(SIX9)_mRNA,_complete_cds_BlastHit_2 g13574
Fusarium_oxysporum_f._sp._lycopersici_strain_Fol007_Six10_(SIX10)_mRNA,_complete_cds_BlastHit_1 g17153
Fusarium_oxysporum_f._sp._lycopersici_strain_Fol007_Six12_(SIX12)_mRNA,_complete_cds_BlastHit_1 PGN_CUFF_4345_1_2
Fusarium_oxysporum_f._sp._lycopersici_strain_Fol007_Six14_(SIX14)_mRNA,_complete_cds_BlastHit_1 man_1
Previously published FTF genes from Sanchez et al 2016 were blasted against Fusarium genomes.
for Assembly in $(ls repeat_masked/*/*/*/*_contigs_unmasked.fa | grep -v -e 'HB17' | grep -v -w 'Fus2_canu_new' | grep 'A8_ncbi'); do
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
echo $Assembly
Query=analysis/blast_homology/Fo_path_genes/FTF_cds_Sanchez_et_al_2016.fasta
OutDir=analysis/FTF/$Organism/$Strain
mkdir -p $OutDir
ProgDir=/home/armita/git_repos/emr_repos/tools/pathogen/blast
qsub $ProgDir/run_blast2csv.sh $Query dna $Assembly $OutDir
done
BLAST hits were converted to Gff annotations and intersected with gene models:
for BlastHits in $(ls analysis/FTF/*/*/*_FTF_cds_Sanchez_et_al_2016.fasta_hits.csv | grep -v -w 'Fus2_canu_new' | grep 'A8_ncbi'); do
Strain=$(echo $BlastHits | rev | cut -f2 -d '/' | rev)
Organism=$(echo $BlastHits | rev | cut -f3 -d '/' | rev)
OutDir=analysis/FTF/$Organism/$Strain
HitsGff=$(echo $BlastHits | sed 's/.csv/.gff/g')
Column2=FTF_homolog
NumHits=1
ProgDir=/home/armita/git_repos/emr_repos/tools/pathogen/blast
$ProgDir/blast2gff.pl $Column2 $NumHits $BlastHits > $HitsGff
GffAppended=$(ls gene_pred/final_genes/$Organism/$Strain/final/final_genes_appended.gff3)
bedtools intersect -wao -a $HitsGff -b $GffAppended > $OutDir/"$Strain"_FTF_hits_intersected.bed
done
Fus2 genes g16859 and g10474 were identified as FTF1 and FTF2 homologs.
cat $OutDir/"$Strain"_FTF_hits_intersected.bed | grep -w 'gene' | cut -f18 | cut -f1 -d ';' | sort | uniq
The orthogorups that these genes belonged to was identified:
cat analysis/orthology/orthomcl/FoC_vs_Fo_vs_FoL_publication_ncbi/FoC_vs_Fo_vs_FoL_publication_ncbi_orthogroups.txt | grep -e 'Fus2|g16859.t' -e 'Fus2|g10474.t'
orthogroup730: 4287|FOXG_17458-t26_1-p1 4287|FOXG_14422-t26_1-p1 4287|FOXG_15059-t26_1-p1 A13|g9338.t1 4287|FOXG_09390-t26_1-p1 A28|g1088.t1 fo47|FOZG_12687T0 PG|g9482.t1 CB3|g6501.t1 Fus2|g10474.t1 A23|g3017.t1 fo47|FOZG_18011T0 PG|g14993.t1 125|g2599.t1 125|g16201.t1 Fus2|g16859.t1 Fus2|NS_02884.t1 A23|g16099.t1 4287|FOXG_17123-t26_1-p1 4287|FOXG_17084-t26_1-p1 4287|FOXG_14000-t26_1-p1 4287|FOXG_12589-t26_1-p1 4287|FOXG_12539-t26_1-p1 4287|FOXG_16414-t26_1-p1 4287|FOXG_14257-t26_1-p1 A13|g15237.t1
This identified orthogroup730 as containing FTF1 & FTF2.
9 Transcription factor genes (TF) have been identified in FoL that potentially regulate SIX gene expression. The genes in FoL can be used to identify orthologous FoC genes:
TF_list="FOXG_14257 FOXG_17260 FOXG_17266 FOXG_14201 FOXG_14230 FOXG_14211 FOXG_14275 FOXG_14277 FOXG_14274"
num=0
OrthoTxt=$(ls analysis/orthology/orthomcl/FoC_vs_Fo_vs_FoL_publication_ncbi/FoC_vs_Fo_vs_FoL_publication_ncbi_orthogroups.txt)
AnnotTab=$(ls gene_pred/annotations/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_canu_new_gene_annotations.tab)
for TF in $TF_list; do
num=$(($num +1))
echo "TF"$num
Orthogroup=$(cat $OrthoTxt | grep -w "$TF" | cut -f1 -d ':')
cat $AnnotTab | grep -w "$Orthogroup"
done > analysis/transcription_factors/Fus2_canu_new_TF1-9.tsv
TF_list="FOXG_14257 FOXG_17260 FOXG_17266 FOXG_14201 FOXG_14230 FOXG_14211 FOXG_14275 FOXG_14277 FOXG_14274"
num=0
OrthoTxt=$(ls analysis/orthology/orthomcl/Fp_Fv_FoC_FoL_Fo/Fp_Fv_FoC_FoL_Fo_orthogroups.txt)
AnnotTab=$(ls gene_pred/annotations/F.proliferatum/A8_ncbi/A8_ncbi_gene_annotations.tab)
for TF in $TF_list; do
num=$(($num +1))
echo "TF"$num
Orthogroup=$(cat $OrthoTxt | grep -w "$TF" | cut -f1 -d ':')
cat $AnnotTab | grep -w "$Orthogroup"
done > analysis/transcription_factors/Fp_TF1-9.tsv
Previously published Fusarium spp. Mitcochondial genes from Al-Reedy 2012 were blasted against assembled genomes.
for Assembly in $(ls repeat_masked/*/*/*/*_contigs_unmasked.fa | grep -v -e 'HB17' | grep -w 'Fus2_canu_new'); do
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
echo $Assembly
Query=analysis/blast_homology/Mt_genes/F.spp._mt_prots_Al-Reedy_et_al._2012.fasta
OutDir=analysis/Mt_genes/$Organism/$Strain
mkdir -p $OutDir
ProgDir=/home/armita/git_repos/emr_repos/tools/pathogen/blast
qsub $ProgDir/run_blast2csv.sh $Query protein $Assembly $OutDir
done
BLAST hits were converted to Gff annotations and intersected with gene models:
for BlastHits in $(ls analysis/Mt_genes/*/*/*_F.spp._mt_prots_Al-Reedy_et_al._2012.fasta_hits.csv | grep -v -e 'HB17' | grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep 'A8'); do
Strain=$(echo $BlastHits | rev | cut -f2 -d '/' | rev)
Organism=$(echo $BlastHits | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
OutDir=analysis/Mt_genes/$Organism/$Strain
HitsGff=$(echo $BlastHits | sed 's/.csv/.gff/g')
Column2=Mt_gene_homolog
NumHits=1
ProgDir=/home/armita/git_repos/emr_repos/tools/pathogen/blast
$ProgDir/blast2gff.pl $Column2 $NumHits $BlastHits > $HitsGff
GffAppended=$(ls gene_pred/final_genes/$Organism/$Strain/final/final_genes_appended.gff3)
bedtools intersect -wao -a $HitsGff -b $GffAppended > $OutDir/"$Strain"_Mt_hits_intersected.bed
done
The PHIbase database was searched against the assembled genomes using tBLASTx.
for Assembly in $(ls repeat_masked/F.*/*/*/*_contigs_unmasked.fa| grep -v 'HB17' | grep -e 'cepae' -e 'proliferatum' -e 'narcissi' | grep -e 'Fus2_canu_new' -e 'ncbi' | grep -v 'old'); do
ProgDir=/home/armita/git_repos/emr_repos/tools/pathogen/blast
qsub $ProgDir/blast_pipe.sh analysis/blast_homology/PHIbase/PHI_36_accessions.fa protein $Assembly
done
following blasting PHIbase to the genome, the hits were filtered by effect on virulence.
First the a tab seperated file was made in the clusters core directory containing PHIbase. These commands were run as part of previous projects but have been included here for completeness.
As a preliminary analysis of the RNAseq data, highly expressed genes at 72hrs post infection were identified in Fus2.
This process is described in RNAseq/Fus2_expressed_path_genes.md and RNAseq/fpkm_analysis.md
Fus2 contigs 10, 14, 16, 19, 20, 21 and 22 were identified as lineage specific.
As such interproscan annotation were extracted for these contigs and Functional enrichments were compared to the entire genome.
GffGenes=gene_pred/final_genes/F.oxysporum_fsp_cepae/Fus2_canu_new/final/final_genes_appended.gff3
cat $GffGenes | grep 'mRNA' | cut -f1 | sort | uniq -c | sort -n -k2 -t '_'
GffEffP=analysis/effectorP/F.oxysporum_fsp_cepae/Fus2_canu_new/F.oxysporum_fsp_cepae_Fus2_canu_new_EffectorP_secreted.gff
cat $GffEffP | grep 'mRNA' | cut -f1 | sort | uniq -c | sort -n -k2 -t '_'
GffCAZY=gene_pred/CAZY/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_canu_new_CAZY_secreted.gff
cat $GffCAZY | grep 'mRNA' | cut -f1 | sort | uniq -c | sort -n -k2 -t '_'
GffSecMet=analysis/antismash/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_canu_new_secondary_metabolite_regions.gff
cat $GffSecMet | cut -f1 | sort | uniq -c | sort -n -k2 -t '_'
GffMimp=analysis/mimps/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_canu_new_genes_in_2kb_mimp.gff
cat $GffMimp | grep 'mRNA' | cut -f1 | sort | uniq -c | sort -n -k2 -t '_'
GffMimpSec=analysis/mimps/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_canu_new_genes_in_2kb_mimp_secreted.gff
cat $GffMimpSec | grep 'mRNA' | cut -f1 | sort | uniq -c | sort -n -k2 -t '_'
From this it was clear that the genes within 2kb of a mimp were primarily associated with the pathogen specific contigs. INterproscan information on these genes was extracted
MimpGeneHeaders=analysis/mimps/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_canu_new_prots_in_2kb_mimp.txt
MimpInterpro=analysis/mimps/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_canu_new_prots_in_2kb_mimp_interpro.txt
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/filter_by_gene.py --inp_interpro $InterPro --inp_txt $MimpGeneHeaders > $MimpInterpro
more information of number of mimps in LS and PS regions was extracted:
TotalMimps=analysis/mimps/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_canu_new_mimps.gff
# Total in Genome
cat $TotalMimps |wc -l
# In all LS regions
cat $TotalMimps | grep -e 'contig_10' -e 'contig_16' -e 'contig_19' -e 'contig_21' -e 'contig_14' -e 'contig_20' -e 'contig_22' | wc -l
# In PS regions
cat $TotalMimps | grep -e 'contig_10' -e 'contig_16' -e 'contig_19' -e 'contig_21' | wc -l
GenesIn2Kb=analysis/mimps/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_canu_new_genes_in_2kb_mimp.gff
# Total in Genome
cat $GenesIn2Kb | grep 'gene'| wc -l
# In all LS regions
cat $GenesIn2Kb | grep 'gene'| grep -e 'contig_10' -e 'contig_16' -e 'contig_19' -e 'contig_21' -e 'contig_14' -e 'contig_20' -e 'contig_22' | wc -l
# In PS regions
cat $GenesIn2Kb | grep 'gene'| grep -e 'contig_10' -e 'contig_16' -e 'contig_19' -e 'contig_21' | wc -l
SecretedIn2Kb=analysis/mimps/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_canu_new_genes_in_2kb_mimp_secreted.gff
# Total in Genome
cat $SecretedIn2Kb | grep 'gene'| wc -l
# In all LS regions
cat $SecretedIn2Kb | grep 'gene'| grep -e 'contig_10' -e 'contig_16' -e 'contig_19' -e 'contig_21' -e 'contig_14' -e 'contig_20' -e 'contig_22' | wc -l
# In PS regions
cat $SecretedIn2Kb | grep 'gene'| grep -e 'contig_10' -e 'contig_16' -e 'contig_19' -e 'contig_21' | wc -l
Effector candidates within 2Kb of a MIMP were identified:
GenesIn2Kb=analysis/mimps/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_canu_new_genes_in_2kb_mimp.gff
OutDir=analysis/mimps/F.oxysporum_fsp_cepae/Fus2_canu_new
cat $GenesIn2Kb | grep 'gene'| grep -e 'contig_10' -e 'contig_16' -e 'contig_19' -e 'contig_21' | cut -f2 -d '=' | tr -d ';' > $OutDir/Fus2_canu_new_genes_in_2kb_mimp_PS_headers.txt
cat $GenesIn2Kb | grep 'gene'| grep -e 'contig_14' -e 'contig_20' -e 'contig_22' | cut -f2 -d '=' | tr -d ';' > $OutDir/Fus2_canu_new_genes_in_2kb_mimp_non-PS_LS_headers.txt
cat $GenesIn2Kb | grep 'gene'| grep -v -e 'contig_10' -e 'contig_16' -e 'contig_19' -e 'contig_21' -e 'contig_14' -e 'contig_20' -e 'contig_22' | cut -f2 -d '=' | tr -d ';' > $OutDir/Fus2_canu_new_genes_in_2kb_mimp_core_headers.txt
EffectorPHeaders=analysis/effectorP/F.oxysporum_fsp_cepae/Fus2_canu_new/F.oxysporum_fsp_cepae_Fus2_canu_new_EffectorP_secreted_headers.txt
cat $EffectorPHeaders | cut -f1 | cut -f1 -d '.' | sort | uniq | grep -f $OutDir/Fus2_canu_new_genes_in_2kb_mimp_PS_headers.txt | wc -l
cat $EffectorPHeaders | cut -f1 | cut -f1 -d '.' | sort | uniq | grep -f $OutDir/Fus2_canu_new_genes_in_2kb_mimp_non-PS_LS_headers.txt | wc -l
cat $EffectorPHeaders | cut -f1 | cut -f1 -d '.' | sort | uniq | grep -f $OutDir/Fus2_canu_new_genes_in_2kb_mimp_core_headers.txt | wc -l
CazyHeaders=gene_pred/CAZY/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_canu_new_CAZY_secreted_headers.txt
cat $CazyHeaders | cut -f1 | cut -f1 -d '.' | sort | uniq | grep -f $OutDir/Fus2_canu_new_genes_in_2kb_mimp_PS_headers.txt | wc -l
cat $CazyHeaders | cut -f1 | cut -f1 -d '.' | sort | uniq | grep -f $OutDir/Fus2_canu_new_genes_in_2kb_mimp_non-PS_LS_headers.txt | wc -l
cat $CazyHeaders | cut -f1 | cut -f1 -d '.' | sort | uniq | grep -f $OutDir/Fus2_canu_new_genes_in_2kb_mimp_core_headers.txt | wc -l
AntismashGff=analysis/antismash/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_secondary_metabolite_regions.gff
MimpPlus2KbGff=analysis/mimps/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_canu_new_mimps_exp.gff
bedtools intersect -u -a $AntismashGff -b $MimpPlus2KbGff | less -S
Enrichment was performed for each contig:
for num in $(seq 1 34); do
Contig="contig_"$num"_pilon"
echo $Contig
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
Gff=gene_pred/final_genes/F.oxysporum_fsp_cepae/Fus2_canu_new/final/final_genes_appended.gff3
GeneList=$OutDir/"$Contig"_interproscan.txt
cat $Gff | grep "$Contig" | grep 'mRNA' | cut -f9 | cut -f1 -d ';' | cut -f2 -d '=' > $GeneList
InterPro=gene_pred/interproscan/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_canu_new_interproscan.tsv
InterPro_contig=$OutDir/"$Contig"_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/filter_by_gene.py --inp_interpro $InterPro --inp_txt $GeneList > $InterPro_contig
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $InterPro_contig > $OutDir/"$Contig"_GO_WEGO.txt
done
The WEGO wesite was used to generate summaries of genes on core and LS regions.
A similar process was then repeated for duplicated genes:
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes
mkdir -p $OutDir
DupGenesTab=/home/sobczm/popgen/codon/blast/dagchainer/testing/no_transposon/non-transposon_duplications_summaryf_ann
cat $DupGenesTab | tail -n+2 | cut -f1,3,5 | sed 's/\t/\n/g' | sed 's/,/\n/g' | grep -v "^$" | grep 'Fus2' | sed 's/Fus2_//g' | sort | uniq > $OutDir/duplicated_genes.txt
InterPro=gene_pred/interproscan/F.oxysporum_fsp_cepae/Fus2_canu_new/Fus2_canu_new_interproscan.tsv
InterPro_dups=$OutDir/duplicated_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/filter_by_gene.py --inp_interpro $InterPro --inp_txt $OutDir/duplicated_genes.txt > $InterPro_dups
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $InterPro_dups > $OutDir/duplicated_genes_GO_WEGO.txt
Annotations by contig were grouped by chromosome:
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=Chr1
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 1; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=Chr2
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 17 5; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=Chr4
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 2; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=Chr5
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 3; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=Chr7
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 4; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=Chr8
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 6; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=Chr9
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 7; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=Chr10
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 8; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=Chr11
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 9; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=Chr12
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 13 15 18; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=Chr13
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 12 11; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=LS14
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 14; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=LS10
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 10; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=LS16
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 16; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=LS19
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 19; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=LS20
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 20; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=LS21
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 21; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome
Chr=LS22
OutDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_chromosome/$Chr
mkdir -p $OutDir
printf "" > $OutDir/"$Chr"_interproscan.tsv
echo "!interpro2wego_output" > $OutDir/"$Chr"_GO_WEGO.txt
for num in 22; do
Contig="contig_"$num"_pilon"
echo $Contig
ContigDir=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/by_contig/$Contig
mkdir -p $OutDir
cat $ContigDir/"$Contig"_interproscan.tsv >> $OutDir/"$Chr"_interproscan.tsv
cat $ContigDir/"$Contig"_GO_WEGO.txt | tail -n+2 >> $OutDir/"$Chr"_GO_WEGO.txt
done
DupGenes=analysis/enrichment/LS_regions/F.oxysporum_fsp_cepae/Fus2_canu_new/duplicated_genes/duplicated_genes.txt
cat $OutDir/"$Chr"_interproscan.tsv | grep -f $DupGenes > $OutDir/"$Chr"_dup_genes_interproscan.tsv
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan/feature_extraction
$ProgDir/interpro2wego.py --inp_interpro $OutDir/"$Chr"_dup_genes_interproscan.tsv > $OutDir/"$Chr"_dup_genes_GO_WEGO.txt