Skip to content

Latest commit

 

History

History
838 lines (715 loc) · 45.9 KB

2b_Untangling_Sex.md

File metadata and controls

838 lines (715 loc) · 45.9 KB

Untangling

Sex chromosome

Use HG002's chrY as an alternative reference, as GRCh38's chrY is incomplete. Include also HG002's chrX.

cd /lizardfs/guarracino/chromosome_communities/assemblies

# Get HG002 chromosome X and Y

PATH_CHM13_FA_GZ=/lizardfs/erikg/human/chm13v2.0.fa.gz
PATH_GRCH38_FA_GZ=/lizardfs/guarracino/chromosome_communities/assemblies/GCA_000001405.15_GRCh38_no_alt_analysis_set_maskedGRC_exclusions_v2_maskedcentromeres_auto_XY.fa.gz

PATH_HPRCY1_FA_GZ=/lizardfs/guarracino/chromosome_communities/assemblies/HPRCy1v2genbank.fa.gz

cat /lizardfs/guarracino/chromosome_communities/assemblies/partitioning/*.partitions.tsv | awk -v OFS='\t' '($2 == "chrX" || $2 == "chrY") {print $1}' | sort > chrXY_contigs.txt

# HiFi-based assemblies + hg01978 verkko + hg002-bakeoff + hg002 verkko
cat \
  <( samtools faidx $PATH_CHM13_FA_GZ $(grep 'chrX\|chrY' $PATH_CHM13_FA_GZ.fai | cut -f 1) | sed 's/^>/>chm13#/g' ) \
  <( samtools faidx $PATH_GRCH38_FA_GZ $(grep 'chrX\|chrY' $PATH_GRCH38_FA_GZ.fai | grep '_' -v | cut -f 1) ) \
  <(samtools faidx $PATH_HPRCY1_FA_GZ $(comm -12 <(cut -f 1 "$PATH_HPRCY1_FA_GZ".fai | sort) <(cat chrXY_contigs.txt))) \
  <(samtools faidx /lizardfs/guarracino/chromosome_communities/assemblies/hg01978.mat.fa $(comm -12 <(cut -f 1 /lizardfs/guarracino/chromosome_communities/assemblies/hg01978.mat.fa.fai | sort) <(cat chrXY_contigs.txt))) \
  <(samtools faidx /lizardfs/guarracino/chromosome_communities/assemblies/hg01978.pat.fa $(comm -12 <(cut -f 1 /lizardfs/guarracino/chromosome_communities/assemblies/hg01978.pat.fa.fai | sort) <(cat chrXY_contigs.txt))) \
  <(samtools faidx /lizardfs/guarracino/chromosome_communities/assemblies/hg002-bakeoff.mat.fa $(comm -12 <(cut -f 1 /lizardfs/guarracino/chromosome_communities/assemblies/hg002-bakeoff.mat.fa.fai | sort) <(cat chrXY_contigs.txt))) \
  <(samtools faidx /lizardfs/guarracino/chromosome_communities/assemblies/hg002-bakeoff.pat.fa $(comm -12 <(cut -f 1 /lizardfs/guarracino/chromosome_communities/assemblies/hg002-bakeoff.pat.fa.fai | sort) <(cat chrXY_contigs.txt))) | \
  bgzip -@ 48 -c > chrSEX+refs.fa.gz
samtools faidx chrSEX+refs.fa.gz

Pangenome building

Apply pggb on the chromosome-partitioned HPRC dataset:

mkdir -p /lizardfs/guarracino/chromosome_communities/graphs

RUN_PGGB=/home/guarracino/tools/pggb/pggb-a4a6668d9ece42c80ce69dc354f0cb59a849286f

num_of_haplotypes=$(cut -f 1,2 -d '#' /lizardfs/guarracino/chromosome_communities/assemblies/chrSEX+refs.fa.gz.fai | sort | uniq | wc -l)

sbatch -p highmem -c 48 --job-name sexpggb --wrap "hostname; cd /scratch && $RUN_PGGB -i /lizardfs/guarracino/chromosome_communities/assemblies/chrSEX+refs.fa.gz -o chrSEX+refs.s50k.l250k.p98.n${num_of_haplotypes} -t 48 -s 50k -l 250k -p 98 -n ${num_of_haplotypes} -k 311 -G 13117,13219 -O 0.03 -T 48 -v -V chm13:#,grch38:#; mv /scratch/chrSEX+refs.s50k.l250k.p98.n${num_of_haplotypes} /lizardfs/guarracino/chromosome_communities/graphs";

sbatch -p highmem -c 48 --job-name sexpggb --wrap "hostname; cd /scratch && $RUN_PGGB -i /lizardfs/guarracino/chromosome_communities/assemblies/chrSEX+refs.fa.gz -o chrSEX+refs.s50k.l250k.p90.n${num_of_haplotypes} -t 48 -s 50k -l 250k -p 90 -n ${num_of_haplotypes} -k 311 -G 13117,13219 -O 0.03 -T 48 -v -V chm13:#,grch38:#; mv /scratch/chrSEX+refs.s50k.l250k.p90.n${num_of_haplotypes} /lizardfs/guarracino/chromosome_communities/graphs";

Untangling

mkdir -p /lizardfs/guarracino/chromosome_communities/untangle_sex/

path_targets_txt=/lizardfs/guarracino/chromosome_communities/untangle_sex/chm13.SEX.target_paths.txt
path_fasta_fai=/lizardfs/guarracino/chromosome_communities/assemblies/chrSEX+refs.fa.gz.fai
grep chm13 $path_fasta_fai | cut -f 1 > $path_targets_txt


path_input_og=/lizardfs/guarracino/chromosome_communities/graphs/chrSEX+refs.s50k.l250k.p98.n102/chrSEX+refs.fa.gz.2ed2c67.04f1c29.22fc5c8.smooth.final.og
path_input_og=/lizardfs/guarracino/chromosome_communities/graphs/chrSEX+refs.s50k.l250k.p90.n102/chrSEX+refs.fa.gz.d1ae18e.04f1c29.d09fe3b.smooth.final.og
prefix=$(basename $path_input_og .og)

RUN_ODGI=/home/guarracino/tools/odgi/bin/odgi-454197fa29b772050c3135d5de47c816ce38e62c

Untangle with respect to all sex chromosomes and emit the cut points:

for e in 50000; do
  for m in 1000; do
    path_bed_gz=/lizardfs/guarracino/chromosome_communities/untangle_sex/$prefix.untangle.chm13#SEX.e$e.m$m.j0.n100.bed.gz
    path_cut_points_txt=/lizardfs/guarracino/chromosome_communities/untangle_sex/$prefix.untangle.chm13#SEX.e$e.m$m.j0.n100.cut_points.txt
    
    if [[ ! -s ${path_cut_points_txt} ]]; then
      echo "-e $e -m $m"
      sbatch -p workers -c 48 --job-name sexuntangle --wrap "\time -v $RUN_ODGI untangle -t 48 -P -i $path_input_og -R $path_targets_txt -e $e -m $m --cut-points-output $path_cut_points_txt -j 0 -n 100 | pigz -c > $path_bed_gz"
    fi;
  done
done

Fix best hits (if there are multiple best hits, put as first the target-chromosome of origin of the contig):

cat \
            <(cat assemblies/partitioning/*.partitions.tsv | sed 's/chr/chm13#chr/') \
            <(echo -e HG002-bakeoff#PAT#SY_unloc_13"\t"chm13#chrY) \
            <(echo -e HG002-bakeoff#PAT#SY_unloc_15"\t"chm13#chrY) \
            <(echo -e HG002-bakeoff#PAT#SY_unloc_19"\t"chm13#chrY) \
            <(echo -e HG002-bakeoff#PAT#SY_unloc_21"\t"chm13#chrY) \
            <(echo -e HG002-bakeoff#PAT#SY_unloc_22"\t"chm13#chrY) \
            <(echo -e HG002-bakeoff#PAT#SY_unloc_22"\t"chm13#chrY) \
            <(echo -e HG002#1#JAHKSE010000383.1"\t"chm13#chrY) \
            <(echo -e HG002#1#JAHKSE010000141.1"\t"chm13#chrY) \
            <(echo -e HG002#1#JAHKSE010000378.1"\t"chm13#chrY) \
            <(echo -e HG002#1#JAHKSE010000376.1"\t"chm13#chrY) \
            <(echo -e HG002#1#JAHKSE010000341.1"\t"chm13#chrY) \
            <(echo -e HG002#1#JAHKSE010000447.1"\t"chm13#chrY) \
            <(echo -e HG005#1#JAHEPO010000097.1"\t"chm13#chrY) \
            <(echo -e HG005#1#JAHEPO010000270.1"\t"chm13#chrY) \
            <(echo -e HG00621#1#JAHBCD010000210.1"\t"chm13#chrY) \
            <(echo -e HG005#1#JAHEPO010000116.1"\t"chm13#chrY) \
            <(echo -e HG005#1#JAHEPO010000199.1"\t"chm13#chrX) \
            <(echo -e HG00673#2#JAHBBY010000245.1"\t"chm13#chrX) \
            <(echo -e HG005#2#JAHEPN010000186.1"\t"chm13#chrX) \
            <(echo -e HG00733#1#JAHEPQ010000312.1"\t"chm13#chrX) \
            <(echo -e HG01071#2#JAHBCE010000198.1"\t"chm13#chrX) \
            <(echo -e HG01071#1#JAHBCF010000145.1"\t"chm13#chrX) \
            <(echo -e HG01109#1#JAHEPA010000188.1"\t"chm13#chrY) \
            <(echo -e HG01109#1#JAHEPA010000268.1"\t"chm13#chrY) \
            <(echo -e HG01109#1#JAHEPA010000227.1"\t"chm13#chrY) \
            <(echo -e HG01109#1#JAHEPA010000307.1"\t"chm13#chrY) \
            <(echo -e HG01109#1#JAHEPA010000343.1"\t"chm13#chrY) \
            <(echo -e HG01109#1#JAHEPA010000333.1"\t"chm13#chrY) \
            <(echo -e HG01109#1#JAHEPA010000405.1"\t"chm13#chrY) \
            <(echo -e HG01109#1#JAHEPA010000337.1"\t"chm13#chrY) \
            <(echo -e HG01109#1#JAHEPA010000344.1"\t"chm13#chrY) \
            <(echo -e HG01109#1#JAHEPA010000442.1"\t"chm13#chrY) \
            <(echo -e HG01109#1#JAHEPA010000458.1"\t"chm13#chrY) \
            <(echo -e HG01109#1#JAHEPA010000325.1"\t"chm13#chrY) \
            <(echo -e HG01123#2#JAGYYY010000193.1"\t"chm13#chrX) \
            <(echo -e HG01175#1#JAHAMA010000276.1"\t"chm13#chrX) \
            <(echo -e HG01175#2#JAHALZ010000239.1"\t"chm13#chrX) \
            <(echo -e HG01243#1#JAHEOY010000277.1"\t"chm13#chrY) \
            <(echo -e HG01243#1#JAHEOY010000307.1"\t"chm13#chrY) \
            <(echo -e HG01243#1#JAHEOY010000291.1"\t"chm13#chrY) \
            <(echo -e HG01258#1#JAGYYV010000281.1"\t"chm13#chrY) \
            <(echo -e HG01358#2#JAGYZA010000069.1"\t"chm13#chrX) \
            <(echo -e HG01891#2#JAGYVN010000149.1"\t"chm13#chrX) \
            <(echo -e HG01891#1#JAGYVO010000131.1"\t"chm13#chrX) \
            <(echo -e HG01928#1#JAGYVQ010000246.1"\t"chm13#chrY) \
            <(echo -e HG01952#1#JAHAME010000217.1"\t"chm13#chrY) \
            <(echo -e HG02055#1#JAHEPK010000284.1"\t"chm13#chrY) \
            <(echo -e HG02145#1#JAHKSG010000213.1"\t"chm13#chrY) \
            <(echo -e HG02145#1#JAHKSG010000422.1"\t"chm13#chrY) \
            <(echo -e HG02145#1#JAHKSG010000527.1"\t"chm13#chrY) \
            <(echo -e HG02145#1#JAHKSG010000498.1"\t"chm13#chrY) \
            <(echo -e HG02145#1#JAHKSG010000544.1"\t"chm13#chrY) \
            <(echo -e HG02145#1#JAHKSG010000728.1"\t"chm13#chrY) \
            <(echo -e HG02145#1#JAHKSG010000766.1"\t"chm13#chrY) \
            <(echo -e HG02145#1#JAHKSG010000668.1"\t"chm13#chrY) \
            <(echo -e HG02148#1#JAHAMG010000114.1"\t"chm13#chrX) \
            <(echo -e HG02486#1#JAGYVM010000047.1"\t"chm13#chrY) \
            <(echo -e HG02148#2#JAHAMF010000125.1"\t"chm13#chrX) \
            <(echo -e HG02486#1#JAGYVM010000189.1"\t"chm13#chrY) \
            <(echo -e HG02572#1#JAHAOW010000422.1"\t"chm13#chrY) \
            <(echo -e HG02572#1#JAHAOW010000309.1"\t"chm13#chrY) \
            <(echo -e HG02572#1#JAHAOW010000471.1"\t"chm13#chrY) \
            <(echo -e HG02572#1#JAHAOW010000523.1"\t"chm13#chrY) \
            <(echo -e HG02572#1#JAHAOW010000458.1"\t"chm13#chrY) \
            <(echo -e HG02572#1#JAHAOW010000535.1"\t"chm13#chrY) \
            <(echo -e HG02572#2#JAHAOV010000449.1"\t"chm13#chrY) \
            <(echo -e HG02622#2#JAHAON010000142.1"\t"chm13#chrX) \
            <(echo -e HG02717#1#JAHAOS010000199.1"\t"chm13#chrY) \
            <(echo -e HG02886#2#JAHAOT010000422.1"\t"chm13#chrX) \
            <(echo -e HG03098#1#JAHEPM010000129.1"\t"chm13#chrY) \
            <(echo -e HG03098#1#JAHEPM010000235.1"\t"chm13#chrY) \
            <(echo -e HG03098#1#JAHEPM010000398.1"\t"chm13#chrY) \
            <(echo -e HG03098#1#JAHEPM010000264.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000175.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000301.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000359.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000353.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000402.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000380.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000404.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000442.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000397.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000407.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000414.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000466.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000510.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000522.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000516.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000418.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000548.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000476.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000591.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000542.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000554.1"\t"chm13#chrY) \
            <(echo -e HG03516#2#JAGYYS010000236.1"\t"chm13#chrX) \
            <(echo -e HG03579#1#JAGYVU010000286.1"\t"chm13#chrY) \
            <(echo -e HG03579#1#JAGYVU010000293.1"\t"chm13#chrY) \
            <(echo -e HG03579#1#JAGYVU010000236.1"\t"chm13#chrY) \
            <(echo -e HG03579#1#JAGYVU010000411.1"\t"chm13#chrY) \
            <(echo -e HG03579#1#JAGYVU010000348.1"\t"chm13#chrY) \
            <(echo -e HG03579#1#JAGYVU010000340.1"\t"chm13#chrY) \
            <(echo -e HG03579#1#JAGYVU010000420.1"\t"chm13#chrY) \
            <(echo -e HG03540#2#JAGYVX010000220.1"\t"chm13#chrX) \
            <(echo -e NA18906#2#JAHEON010000121.1"\t"chm13#chrX) \
            <(echo -e NA19240#1#JAHEOM010000255.1"\t"chm13#chrX) \
            <(echo -e NA21309#1#JAHEPC010000444.1"\t"chm13#chrX) \
            <(echo -e NA20129#2#JAHEPD010000373.1"\t"chm13#chrX) \
            <(echo -e NA21309#1#JAHEPC010000484.1"\t"chm13#chrX) \
            <(echo -e NA21309#1#JAHEPC010000136.1"\t"chm13#chrX) \
            <(echo -e HG005#1#JAHEPO010000137.1"\t"chm13#chrY) \
            <(echo -e HG005#1#JAHEPO010000240.1"\t"chm13#chrY) \
            <(echo -e HG005#1#JAHEPO010000251.1"\t"chm13#chrY) \
            <(echo -e HG005#1#JAHEPO010000358.1"\t"chm13#chrY) \
            <(echo -e HG005#1#JAHEPO010000287.1"\t"chm13#chrX) \
            <(echo -e HG005#2#JAHEPN010000184.1"\t"chm13#chrX) \
            <(echo -e HG005#2#JAHEPN010000184.1"\t"chm13#chrX) \
            <(echo -e HG005#1#JAHEPO010000150.1"\t"chm13#chrY) \
            <(echo -e HG002#1#JAHKSE010000138.1"\t"chm13#chrY) \
            <(echo -e HG005#1#JAHEPO010000129.1"\t"chm13#chrY) \
            <(echo -e HG005#1#JAHEPO010000146.1"\t"chm13#chrY) \
            <(echo -e HG00673#1#JAHBBZ010000301.1"\t"chm13#chrY) \
            <(echo -e HG00673#1#JAHBBZ010000306.1"\t"chm13#chrY) \
            <(echo -e HG00673#2#JAHBBY010000140.1"\t"chm13#chrX) \
            <(echo -e HG00673#2#JAHBBY010000140.1"\t"chm13#chrX) \
            <(echo -e HHG00733#1#JAHEPQ010000190.1"\t"chm13#chrX) \
            <(echo -e HG00673#2#JAHBBY010000210.1"\t"chm13#chrX) \
            <(echo -e HG00733#1#JAHEPQ010000190.1"\t"chm13#chrX) \
            <(echo -e HG00733#2#JAHEPP010000392.1"\t"chm13#chrX) \
            <(echo -e HG005#1#JAHEPO010000178.1"\t"chm13#chrY) \
            <(echo -e HG00673#1#JAHBBZ010000257.1"\t"chm13#chrY) \
            <(echo -e HG01109#1#JAHEPA010000218.1"\t"chm13#chrY) \
            <(echo -e HG01109#1#JAHEPA010000263.1"\t"chm13#chrY) \
            <(echo -e HG01109#1#JAHEPA010000278.1"\t"chm13#chrY) \
            <(echo -e HG01109#1#JAHEPA010000428.1"\t"chm13#chrY) \
            <(echo -e G01123#1#JAGYYZ010000259.1"\t"chm13#chrX) \
            <(echo -e HG01175#2#JAHALZ010000133.1"\t"chm13#chrX) \
            <(echo -e HG01243#1#JAHEOY010000179.1"\t"chm13#chrY) \
            <(echo -e HG01258#1#JAGYYV010000194.1"\t"chm13#chrY) \
            <(echo -e HG01952#1#JAHAME010000216.1"\t"chm13#chrY) \
            <(echo -e HG01952#2#JAHAMD010000259.1"\t"chm13#chrX) \
            <(echo -e HG01978#2#JAGYVR010000154.1"\t"chm13#chrX) \
            <(echo -e HG02055#1#JAHEPK010000168.1"\t"chm13#chrY) \
            <(echo -e HG02055#1#JAHEPK010000257.1"\t"chm13#chrY) \
            <(echo -e HG02055#1#JAHEPK010000314.1"\t"chm13#chrY) \
            <(echo -e HG02145#1#JAHKSG010000262.1"\t"chm13#chrY) \
            <(echo -e HG02145#1#JAHKSG010000342.1"\t"chm13#chrY) \
            <(echo -e HG02145#1#JAHKSG010000399.1"\t"chm13#chrY) \
            <(echo -e HG02145#1#JAHKSG010000547.1"\t"chm13#chrY) \
            <(echo -e HG02145#1#JAHKSG010000739.1"\t"chm13#chrY) \
            <(echo -e HG02145#1#JAHKSG010000785.1"\t"chm13#chrY) \
            <(echo -e HG02257#1#JAGYVI010000079.1"\t"chm13#chrX) \
            <(echo -e HG02486#1#JAGYVM010000128.1"\t"chm13#chrY) \
            <(echo -e HG02486#1#JAGYVM010000157.1"\t"chm13#chrY) \
            <(echo -e HG02486#1#JAGYVM010000169.1"\t"chm13#chrY) \
            <(echo -e HG02486#1#JAGYVM010000291.1"\t"chm13#chrY) \
            <(echo -e HG02572#1#JAHAOW010000392.1"\t"chm13#chrY) \
            <(echo -e HG02572#1#JAHAOW010000399.1"\t"chm13#chrY) \
            <(echo -e HG02572#1#JAHAOW010000420.1"\t"chm13#chrY) \
            <(echo -e HG02572#1#JAHAOW010000474.1"\t"chm13#chrY) \
            <(echo -e HG02572#1#JAHAOW010000567.1"\t"chm13#chrY) \
            <(echo -e HG02717#1#JAHAOS010000264.1"\t"chm13#chrY) \
            <(echo -e HG02717#1#JAHAOS010000281.1"\t"chm13#chrY) \
            <(echo -e HG02717#1#JAHAOS010000282.1"\t"chm13#chrY) \
            <(echo -e HG02723#1#JAHEOU010000219.1"\t"chm13#chrX) \
            <(echo -e HG02886#2#JAHAOT010000399.1"\t"chm13#chrX) \
            <(echo -e HG03098#1#JAHEPM010000164.1"\t"chm13#chrY) \
            <(echo -e HG03098#1#JAHEPM010000254.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000041.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000272.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000452.1"\t"chm13#chrY) \
            <(echo -e HG03492#1#JAHEPI010000487.1"\t"chm13#chrY) \
            <(echo -e HG03579#1#JAGYVU010000262.1"\t"chm13#chrY) \
            <(echo -e HG03579#1#JAGYVU010000323.1"\t"chm13#chrY) \
            <(echo -e HG03579#1#JAGYVU010000368.1"\t"chm13#chrY) \
            <(echo -e HG03579#1#JAGYVU010000373.1"\t"chm13#chrY) \
            <(echo -e HG03579#1#JAGYVU010000417.1"\t"chm13#chrY) \
            <(echo -e HG01123#1#JAGYYZ010000259.1"\t"chm13#chrX) \
            <(echo -e HG002-bakeoff#PAT#scaffold_235"\t"chm13#chrY) \
            <(echo -e HG002-bakeoff#PAT#SY_unloc_11"\t"chm13#chrY) \
            <(echo -e HG002-bakeoff#PAT#SY_unloc_2"\t"chm13#chrY) \
            <(echo -e chm13#chrX"\t"chm13#chrX) \
            <(echo -e chm13#chrY"\t"chm13#chrY) \
            <(echo -e grch38#chrX"\t"chm13#chrX) \
            <(echo -e grch38#chrY"\t"chm13#chrY) > /lizardfs/guarracino/chromosome_communities/untangle_sex/partitioning.contig2chr.tsv


for e in 50000; do
  for m in 1000 ; do
    path_ref_fixed_bed_gz=/lizardfs/guarracino/chromosome_communities/untangle_sex/$prefix.untangle.chm13#SEX.e$e.m$m.j0.n100.fixed.bed.gz
    if [[ ! -s ${path_ref_fixed_bed_gz} ]]; then
      echo "-e $e -m $m"
      
      path_bed_gz=/lizardfs/guarracino/chromosome_communities/untangle_sex/$prefix.untangle.chm13#SEX.e$e.m$m.j0.n100.bed.gz
      python3 /lizardfs/guarracino/chromosome_communities/scripts/fix_best_hit.py \
        $path_bed_gz \
        /lizardfs/guarracino/chromosome_communities/untangle_sex/partitioning.contig2chr.tsv | tr ' ' '\t' | pigz -c -9 > $path_ref_fixed_bed_gz
    fi;
  done
done

# -p 98: Fixed 13900 hits covering 21791399 bps on the queries.
# -p 90: Fixed 12380 hits covering 20549819 bps on the queries.

Untangle with respect to a single sex chromosome, using always the same cut points:

for e in 50000; do
  for m in 1000; do
    echo "-e $e -m $m"
      
    cat $path_targets_txt | while read ref; do
      echo $ref
      
      path_ref_bed_gz=/lizardfs/guarracino/chromosome_communities/untangle_sex/$prefix.untangle.$ref.e$e.m$m.j0.n100.bed.gz
      if [[ ! -s ${path_ref_bed_gz} ]]; then
        path_cut_points_txt=/lizardfs/guarracino/chromosome_communities/untangle_sex/$prefix.untangle.chm13#SEX.e$e.m$m.j0.n100.cut_points.txt
        
        sbatch -p workers -c 48 --job-name sexuntangle --wrap "\time -v $RUN_ODGI untangle -t 48 -P -i $path_input_og -r $ref -e $e -m $m --cut-points-input $path_cut_points_txt -j 0 -n 100 | pigz -c > $path_ref_bed_gz"
      fi
    done
  done
done

Grounding (applying filters) and annotation:

mkdir -p /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded

for e in 50000; do
  for m in 1000; do
    cat $path_targets_txt | while read ref; do            
      path_grounded_tsv_gz=/lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$prefix.untangle.$ref.e$e.m$m.grounded.tsv.gz
            
      if [[ ! -s ${path_grounded_tsv_gz} ]]; then
        echo "-e $e -m $m $ref grounding"

        # Grounding
        ( echo query query.begin query.end target target.begin target.end jaccard strand self.coverage nth.best ref ref.begin ref.end ref.jaccard ref.nth.best | tr ' ' '\t'
          join \
            <(zcat /lizardfs/guarracino/chromosome_communities/untangle_sex/$prefix.untangle.chm13#SEX.e$e.m$m.j0.n100.bed.gz | awk -v j=0 -v n=5 '{ if($7 >= j && $10 <= n) {print $1"_"$2, $0} }' | tr ' ' '\t' | sort -k 1,1) \
            <(zcat /lizardfs/guarracino/chromosome_communities/untangle_sex/$prefix.untangle.$ref.e$e.m$m.j0.n100.bed.gz | awk -v j=0 -v n=10 '{ if($7 >= j && $10 <= n) {print $1"_"$2, $0} }' | tr ' ' '\t' | sort -k 1,1) | \
          tr ' ' '\t' | grep -v '^#' | cut -f 2- | cut -f -10,14-17,20 | sort -k 1,3 -k 7,7nr -k 10,10n -k 14,14nr -k 15,15n ) | tr ' ' '\t' | pigz -c > x.tsv.gz


        echo "-e $e -m $m $ref filtering&annotation"
        ref_chr=$(echo $ref | sed 's/chm13#//')
        
        ###############################################################################
        # Contigs with respect to the partitioning
        zcat x.tsv.gz | sed '1d' | cut -f 1 | grep -v chr | grep -f <(grep $ref /lizardfs/guarracino/chromosome_communities/untangle_sex/partitioning.contig2chr.tsv | cut -f 1) | sort | uniq > $ref.tmp.txt

        # WRONG (it takes too many contigs): contigs overlapping (or close at least 100kbps to) a PARs/XTRs region
        #bedtools intersect \
        #  -a <(zcat x.tsv.gz | awk -v OFS="\t" '{print $11,$12,$13,$1, "", "+"}' | sed '1d' | bedtools sort) \
        #  -b <(grep $ref_chr /lizardfs/guarracino/chromosome_communities/data/chm13_hg002.PARs.bed |\
        #    bedtools sort |\
        #    bedtools slop -b 100000 -g /lizardfs/guarracino/chromosome_communities/data/chm13_hg002.PARs.sizes |\
        #    awk -v OFS='\t' -v ref=$ref '{print(ref,$2,$3)}') | \
        #  #awk '$3-$2+1>=100000' | \
        #  cut -f 4 | \
        #  #Remove references to avoid grepping everything later (with zgrep -f)
        #  grep -v chr |\
        #  sort | uniq > $ref.tmp.txt
        
        # WRONG (it takes too many contigs): contigs touching regions that are no PARs/XTRs
        #bedtools intersect \
        #  -a <(zcat x.tsv.gz | awk -v OFS="\t" '{print $11,$12,$13,$1, "", "+"}' | sed '1d' | grep "^$ref" | bedtools sort) \
        #  -b <(bedtools complement \
        #        -i <(grep $ref_chr data/chm13_hg002.PARs.bed | sed "s/$ref_chr/chm13/" | sed -e "s/PAR1/$ref_chr/" -e "s/PAR2/$ref_chr/" -e "s/XTR[12]/$ref_chr/" -e "s/XTR/$ref_chr/" | bedtools sort) -g <(cut -f 1,2 /lizardfs/guarracino/chromosome_communities/assemblies/chrSEX+refs.fa.gz.fai | grep $ref)            
        #  ) | \
        #  cut -f 4 | \
        #  #Remove references to avoid grepping everything later (with zgrep -f)
        #  grep -v chr |\
        #  sort | uniq > $ref.tmp.txt

        # WRONG: it mixes chrX and chrY contigs
        #zcat x.tsv.gz | sed '1d' | cut -f 1 | grep -v chr | sort | uniq > $ref.tmp.txt      
        ###############################################################################

        # Add grounded.target column, re-add the references, and add annotation
        cat \
          <(zcat x.tsv.gz | head -n 1 | awk -v OFS='\t' '{print $0, "grounded.target"}') \
          <(zgrep x.tsv.gz -f $ref.tmp.txt | awk -v OFS='\t' -v ref=$ref '{print $0, ref}') \
          <(zcat x.tsv.gz | awk -v OFS='\t' -v ref=$ref '$1 ~ /chr/ { print $0, ref}' ) \
          <(grep $ref_chr /lizardfs/guarracino/chromosome_communities/data/chm13_hg002.PARs.bed | \
            awk -v OFS='\t' -v ref=$ref '{print "chm13#"$1,".",".",ref,".",".","1","+","1","1",ref,$2,$3,"1","1",ref}') |\
          pigz -c -9 > $path_grounded_tsv_gz
        rm x.tsv.gz
      fi;
    done
  done
done

Remove unreliable regions:

rm x.tsv # Cleaning

for e in 50000; do
  for m in 1000; do
    cat $path_targets_txt | while read ref; do
      path_grounded_tsv_gz=/lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$prefix.untangle.$ref.e$e.m$m.grounded.tsv.gz
      path_grounded_reliable_tsv_gz=/lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$prefix.untangle.$ref.e$e.m$m.grounded.reliable.tsv.gz

      if [[ ! -s ${path_grounded_reliable_tsv_gz} ]]; then
        echo "-e $e -m $m $ref"

        # Skip verkko's and bakeoff's contigs because we don't have the unreliable regions for them
        zcat $path_grounded_tsv_gz | sed '1d' | grep 'HG002#MAT\|HG002#PAT\|HG01978#MAT\|HG01978#PAT\|bakeoff' -v | cut -f 1 | sort | uniq | while read CONTIG; do
          SAMPLE=$( echo $CONTIG | cut -f 1 -d '#')

          path_unreliable_bed=/lizardfs/guarracino/HPRC/annotations/unreliable/$SAMPLE.hifi.flagger_final.simplified.unreliable_only.bed
          if [[ -s $path_unreliable_bed ]]; then
            #echo $CONTIG "--->" $SAMPLE
            
            zgrep "^$CONTIG" $path_grounded_tsv_gz | sort -k 2n | awk -v OFS='\t' '{print($1,$2,$3,$4"_"$5"_"$6"_"$7"_"$8"_"$9"_"$10"_"$11"_"$12"_"$13"_"$14"_"$15"_"$16)}' > x.bed
            grep $CONTIG $path_unreliable_bed > y.bed
            # -A: remove entire feature if any overlap
            bedtools subtract -a x.bed -b y.bed -A |\
              awk -v OFS='\t' '{split($4, a, "_"); print($1,$2,$3,a[1],a[2],a[3],a[4],a[5],a[6],a[7],a[8],a[9],a[10],a[11],a[12],a[13])}' >> x.tsv
            rm x.bed y.bed
          else
            zgrep "^$CONTIG" $path_grounded_tsv_gz | sort -k 2n >> x.tsv
          fi
        done
        
        # Re-take verkko's and bakeoff's untangled regions
        zcat $path_grounded_tsv_gz | sed '1d' | grep 'HG002#MAT\|HG002#PAT\|HG01978#MAT\|HG01978#PAT\|bakeoff' >> x.tsv
        
        cat \
          <( zcat $path_grounded_tsv_gz | head -n 1 ) \
          <( sort -k 1,3 -k 7,7nr -k 10,10n -k 14,14nr -k 15,15n x.tsv) | pigz -c -9 > $path_grounded_reliable_tsv_gz
        rm x.tsv
      fi;
    done
  done
done

Plot ([start-500kbps,end+500kbps] centered in the PARs/XTRs regions):

# PAR1/2/3
# https://link.springer.com/article/10.1007/s10142-013-0323-6/figures/1

n=2

for e in 50000; do
  for m in 1000; do
    for refn in 1; do
      (echo X; echo Y) | while read i; do     
        ref=chm13#chr$i
        
        echo "-e $e -m $m -refn $refn $ref"
    
        path_grounded_reliable_tsv_gz=/lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$prefix.untangle.$ref.e$e.m$m.grounded.reliable.tsv.gz
        PREFIX=$(basename $path_grounded_reliable_tsv_gz .tsv.gz);
        
        zgrep '^HG01978#MAT\|^HG01978#PAT\|bakeoff' $path_grounded_reliable_tsv_gz -v | sed '1d' | cut -f 1 | sort | uniq \
          > /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.query_to_consider.txt
        
        if [[ $i == "X" ]]; then
            # chrX#PAR1:0-2394410
            Rscript /lizardfs/guarracino/chromosome_communities/scripts/plot_untangle_without_annotation.R \
              $path_grounded_reliable_tsv_gz \
              0 2894410 \
              90 0.4 \
              0 \
              $n $refn \
              $i \
              0.9 \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.query_to_consider.txt \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.n$n.nref${refn}.PAR1.pdf
              
            # chrX#PAR2:153925834-154259566
            Rscript /lizardfs/guarracino/chromosome_communities/scripts/plot_untangle_without_annotation.R \
              $path_grounded_reliable_tsv_gz \
              153425834 154259566 \
              90 0.4 \
              0 \
              $n $refn \
              $i \
              0.9 \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.query_to_consider.txt \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.n$n.nref${refn}.PAR2.pdf
              
            # chrX#XTR:87642550-91570785
            Rscript /lizardfs/guarracino/chromosome_communities/scripts/plot_untangle_without_annotation.R \
              $path_grounded_reliable_tsv_gz \
              87142550 92070785 \
              90 0.4 \
              0 \
              $n $refn \
              $i \
              0.9 \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.query_to_consider.txt \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.n$n.nref${refn}.XTR.pdf

            # chrX#EV_STRATA_5_4:2394410-8500000
            Rscript /lizardfs/guarracino/chromosome_communities/scripts/plot_untangle_without_annotation.R \
              $path_grounded_reliable_tsv_gz \
              0 8500000 \
              90 0.4 \
              0 \
              $n $refn \
              $i \
              0.9 \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.query_to_consider.txt \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.n$n.nref${refn}.EV_STRATA_5_4.pdf  
            Rscript /lizardfs/guarracino/chromosome_communities/scripts/plot_untangle_without_annotation.R \
              $path_grounded_reliable_tsv_gz \
              0 8500000 \
              90 0.4 \
              0 \
              $n $refn \
              $i \
              0.7 \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.query_to_consider.txt \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.n$n.nref${refn}.EV_STRATA_5_4.eid07.pdf 

            # Full chromosome X
            Rscript /lizardfs/guarracino/chromosome_communities/scripts/plot_untangle_without_annotation.R \
              $path_grounded_reliable_tsv_gz \
              0 154259566 \
              200 0.4 \
              0 \
              $n $refn \
              $i \
              0.9 \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.query_to_consider.txt \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.n$n.nref${refn}.pdf
        else
            # chrY#PAR1:0-2458320
            Rscript /lizardfs/guarracino/chromosome_communities/scripts/plot_untangle_without_annotation.R \
              $path_grounded_reliable_tsv_gz \
              0 2958320 \
              90 0.4 \
              0 \
              $n $refn \
              $i \
              0.9 \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.query_to_consider.txt \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.n$n.nref${refn}.PAR1.pdf
              
            # chrY#PAR2:62122809-62460029
            Rscript /lizardfs/guarracino/chromosome_communities/scripts/plot_untangle_without_annotation.R \
              $path_grounded_reliable_tsv_gz \
              61622809 62460029 \
              90 0.4 \
              0 \
              $n $refn \
              $i \
              0.9 \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.query_to_consider.txt \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.n$n.nref${refn}.PAR2.pdf
              
            # chrY#XTR1:2727072-5914561
            Rscript /lizardfs/guarracino/chromosome_communities/scripts/plot_untangle_without_annotation.R \
              $path_grounded_reliable_tsv_gz \
              2227072 6414561 \
              90 0.4 \
              0 \
              $n $refn \
              $i \
              0.9 \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.query_to_consider.txt \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.n$n.nref${refn}.XTR1.pdf
              
            # chrY#XTR2:6200973-6400875
            Rscript /lizardfs/guarracino/chromosome_communities/scripts/plot_untangle_without_annotation.R \
              $path_grounded_reliable_tsv_gz \
              5700973 6900875 \
              90 0.4 \
              0 \
              $n $refn \
              $i \
              0.9 \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.query_to_consider.txt \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.n$n.nref${refn}.XTR2.pdf
            
            # Full chromosome Y
            Rscript /lizardfs/guarracino/chromosome_communities/scripts/plot_untangle_without_annotation.R \
              $path_grounded_reliable_tsv_gz \
              0 62460029 \
              80 0.4 \
              0 \
              $n $refn \
              $i \
              0.9 \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.query_to_consider.txt \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.n$n.nref${refn}.pdf
        fi
      done
      
      # Merge chromosomes's PDF files
      #rm /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$prefix.untangle.chm13#chrSEX.e$e.m$m.grounded.reliable.n$n.nref${refn}.merged.pdf
      #/gnu/store/d0njxcgymxvf8s7di32m9q4v9vibd11z-poppler-0.86.1/bin/pdfunite \
      #  /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$prefix.untangle.chm13#chr*.e$e.m$m.grounded.reliable.n$n.nref${refn}.*pdf \
      #  /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$prefix.untangle.chm13#chrSEX.e$e.m$m.grounded.reliable.n$n.nref${refn}.merged.pdf
      #rm /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$prefix.untangle.chm13#chr*.e$e.m$m.grounded.reliable.n$n.nref${refn}.pdf
      #rm /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$prefix.untangle.chm13#chr*.e$e.m$m.grounded.reliable.n$n.nref${refn}.*R*.pdf
    done
  done
done


# Only full chromosome X
for e in 50000; do
  for m in 1000; do
    for refn in 1; do
      (echo X; echo Y) | while read i; do     
        ref=chm13#chr$i
        
        echo "-e $e -m $m -refn $refn $ref"
    
        path_grounded_reliable_tsv_gz=/lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$prefix.untangle.$ref.e$e.m$m.grounded.reliable.tsv.gz
        PREFIX=$(basename $path_grounded_reliable_tsv_gz .tsv.gz);
        
        zgrep '^HG01978#MAT\|^HG01978#PAT\|bakeoff' $path_grounded_reliable_tsv_gz -v | sed '1d' | cut -f 1 | sort | uniq \
          > /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.query_to_consider.txt
        
        if [[ $i == "X" ]]; then        
            Rscript /lizardfs/guarracino/chromosome_communities/scripts/plot_untangle_without_annotation.R \
              $path_grounded_reliable_tsv_gz \
              0 154259566 \
              200 0.4 \
              0 \
              2 $refn \
              $i \
              0.7 \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.query_to_consider.txt \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.n2.nref${refn}.eid07.pdf
        else           
            Rscript /lizardfs/guarracino/chromosome_communities/scripts/plot_untangle_without_annotation.R \
              $path_grounded_reliable_tsv_gz \
              0 62460029 \
              80 0.4 \
              0 \
              2 $refn \
              $i \
              0.7 \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.query_to_consider.txt \
              /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$PREFIX.n2.nref${refn}.eid07.pdf
        fi
      done
    done
  done
done

CONTINUE (IF NEEDED) - Plot 2 hits: XXX

CONTINUE (IF NEEDED) - Compute support: XXX

# Merge files for all sex chromosomes (used for computing the support and the histogram length)
for e in 50000; do
  for m in 1000 ; do
    path_grounded_reliable_ALL_tsv_gz=/lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$prefix.untangle.ALL.e$e.m$m.grounded.reliable.tsv.gz
    if [[ ! -s ${path_grounded_reliable_ALL_tsv_gz} ]]; then
      cat \
        <(zcat /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$prefix.untangle.chm13#chr*.e$e.m$m.grounded.reliable.tsv.gz | grep 'self.coverage' -m 1) \
        <(zcat /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$prefix.untangle.chm13#chr*.e$e.m$m.grounded.reliable.tsv.gz | grep 'self.coverage' -v ) |\
        pigz -c -9 > $path_grounded_reliable_ALL_tsv_gz
    fi;
  done
done

#Take sex chromosome lengths
grep '^chm13' /lizardfs/guarracino/chromosome_communities/assemblies/chm13v2+grch38masked.fa.gz.fai | grep 'chrX\|chrY' | cut -f 1,2 \
  > /lizardfs/guarracino/chromosome_communities/chm13#SEX.len.tsv

# Support
# guix install r-ggridges
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx todo

CONTINUE (IF NEEDED) - Statistics on removed regions: XXX

CONTINUE (IF NEEDED) - Statistics on untangled segment lengths: XXX

TO FINISH, IF NEEDED Estimate regions that can recombine using multi-hit untangled regions:

mkdir -p /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/recombinant_regions/

# Identify regions with multiple good enough hits
for e in 50000; do
  for m in 1000; do
    path_grounded_reliable_ALL_tsv_gz=/lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$prefix.untangle.ALL.e$e.m$m.grounded.reliable.tsv.gz
    PREFIX=$(basename $path_grounded_reliable_ALL_tsv_gz .tsv.gz)
   
    zgrep '^chm13\|^grch38\|^HG002#1\|HG002#2\|^HG01978#MAT\|^HG01978#PAT\|bakeoff' $path_grounded_reliable_ALL_tsv_gz -v | sed '1d' | cut -f 1 | sort | uniq \
      > /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/recombinant_regions/$PREFIX.query_to_consider.txt

    for sc in 0 1.1; do
      for eid in 0.900 0.950 0.975 0.995 1.000; do
        eid_str=$(echo $eid | sed 's/\.//g')
        sc_str=$(echo $sc | sed 's/\.//g')
        echo $e $m $sc $eid
          
        path_recombinant_regions_bed=/lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/recombinant_regions/$PREFIX.recombinant_regions.sc${sc_str}.eid${eid_str}.bed
        if [[ ! -s $path_recombinant_regions_bed ]]; then
          python3 /lizardfs/guarracino/chromosome_communities/scripts/recombination_proxy_ranges.py \
            $path_grounded_reliable_ALL_tsv_gz \
            $eid $sc \
            /lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/recombinant_regions/$PREFIX.query_to_consider.txt \
            > $path_recombinant_regions_bed
        fi
      done
    done
  done
done

# Collect values in grounded reference space (TO PUT IN A FILE AND RUN A JOB FOR IT)
########################################################################################################################
#!/bin/sh

cd /scratch
awk -v OFS='\t' '{print($1,"0",$2)}' /lizardfs/guarracino/chromosome_communities/chm13#SEX.len.tsv > chm13.bed

path_targets_txt=/lizardfs/guarracino/chromosome_communities/untangle_sex/chm13.SEX.target_paths.txt
path_input_og=/lizardfs/guarracino/chromosome_communities/graphs/chrSEX+refs.s50k.l250k.p98.n102/chrSEX+refs.fa.gz.2ed2c67.04f1c29.22fc5c8.smooth.final.og
prefix=$(basename $path_input_og .og)

for e in 50000; do
  for m in 1000; do
    path_grounded_reliable_ALL_tsv_gz=/lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/$prefix.untangle.ALL.e$e.m$m.grounded.reliable.tsv.gz
    PREFIX=$(basename $path_grounded_reliable_ALL_tsv_gz .tsv.gz)

    path_recombinant_regions_table_tsv=/lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/recombinant_regions/$PREFIX.recombinant_regions.table.tsv
    path_recombinant_regions_table_sizes_tsv=/lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/recombinant_regions/$PREFIX.recombinant_regions.table.sizes.tsv
    path_recombinant_regions_table_with_counts_tsv=/lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/recombinant_regions/$PREFIX.recombinant_regions.table.counts.tsv
    
    rm rg.txt
    rm $path_recombinant_regions_table_tsv $path_recombinant_regions_table_sizes_tsv $path_recombinant_regions_table_with_counts_tsv
    for sc in 0 1.1; do
      for eid in 0.900 0.950 0.975 0.995 1.000; do
        eid_str=$(echo $eid | sed 's/\.//g')
        sc_str=$(echo $sc | sed 's/\.//g')
        echo $e $m $sc $eid
          
        path_recombinant_regions_bed=/lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/recombinant_regions/$PREFIX.recombinant_regions.sc${sc_str}.eid${eid_str}.bed

        bedtools merge -i <(cut -f 4,5,6 $path_recombinant_regions_bed | sed '1d' | bedtools sort ) | \
          awk -v sc=$sc -v eid=$eid -v OFS='\t' '{print(sc,eid,$1,$2,$3)}' >> $path_recombinant_regions_table_tsv
          
        bedtools merge -i <(cut -f 4,5,6 $path_recombinant_regions_bed | sed '1d' | bedtools sort ) | \
          awk -v sc=$sc -v eid=$eid -v OFS='\t' '{SUM+=$3-$2}END{print(sc,eid,SUM)}' >> $path_recombinant_regions_table_sizes_tsv

        # For each sample, merge intervals with respect to the grounded reference
        sed '1d' $path_recombinant_regions_bed | cut -f 1 | sort | uniq | while read CONTIG; do
          grep "^$CONTIG" $path_recombinant_regions_bed | cut -f 4,5,6 | bedtools sort | bedtools merge >> rg.txt
        done
        
        # For each grounded reference position, count how many sample support it
        # -d: Report the depth at each position in each A feature.
        # The awk script is to get intervals where grounded reference and counts is constant.
        bedtools coverage -a chm13.bed -b rg.txt -d | \
          python3 /lizardfs/guarracino/chromosome_communities/scripts/compress_coverage_info.py | \
          awk -v sc=$sc -v eid=$eid -v OFS='\t' '{print(sc,eid,$1,$2,$3,$4)}' \
          >> $path_recombinant_regions_table_with_counts_tsv
        rm rg.txt
      done
    done
  done
done

rm chm13.bed
########################################################################################################################

# Plots
# Use the average counts as self coverage for the putative recombinant regions
cd /lizardfs/guarracino/chromosome_communities/
f=/lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/recombinant_regions/chrSEX+refs.fa.gz.2ed2c67.04f1c29.22fc5c8.smooth.final.untangle.ALL.e50000.m1000.grounded.reliable.recombinant_regions.table.counts.tsv
max_count=$(cut -f 6 $f | sort -n | tail -n 1)
echo -e "query\tquery.begin\tquery.end\ttarget\ttarget.begin\ttarget.end\tjaccard\tstrand\tself.coverage\tnth.best\tref\tref.begin\tref.end\tref.jaccard\tref.nth.best\tgrounded.target" > chrXY+recombinant.tsv
for chr in chrX chrY; do
  cat \
    <( grep $chr /lizardfs/guarracino/chromosome_communities/data/chm13_hg002.PARs.bed | sed 's/^/chm13#/g' | \
        awk -v OFS='\t' -v ref=chm13#$chr -v max=$max_count '{print $1,".",".",ref,".",".","1","+",max,"1",ref,$2,$3,"1","1",ref}' ) \
    <( grep $chr $f | \
        awk '$1 == 0 && $2 == 0.9' | cut -f 3,4,5,6 | awk '$4 > 0' | bedtools sort | bedtools merge -o mean -c 4 -d 10000 | \
        awk '$3 - $2 > 30000' | \
        awk -v OFS='\t' -v ref=chm13#$chr '{print $1,".",".",ref,".",".","1","+",$4,"1",ref,$2,$3,"1","1",ref}' ) | sort -r >> chrXY+recombinant.tsv
done

Rscript /lizardfs/guarracino/chromosome_communities/scripts/plot_recombinant_regions_chrXY.R \
  chrXY+recombinant.tsv \
  90 10 \
  "X" \
  /lizardfs/guarracino/chromosome_communities/chrX.recombinant_regions.eid0900.pdf # Supplementary figure
Rscript /lizardfs/guarracino/chromosome_communities/scripts/plot_recombinant_regions_chrXY.R \
  chrXY+recombinant.tsv \
  90 12 \
  "Y" \
  /lizardfs/guarracino/chromosome_communities/chrY.recombinant_regions.eid0900.pdf # Supplementary figure
  
# Merge chromosomes's PDF files
/gnu/store/d0njxcgymxvf8s7di32m9q4v9vibd11z-poppler-0.86.1/bin/pdfunite \
  /lizardfs/guarracino/chromosome_communities/chrX.recombinant_regions.eid0900.pdf /lizardfs/guarracino/chromosome_communities/chrY.recombinant_regions.eid0900.pdf \
  /lizardfs/guarracino/chromosome_communities/chrXY.recombinant_regions.eid0900.pdf
rm /lizardfs/guarracino/chromosome_communities/chrX.recombinant_regions.eid0900.pdf /lizardfs/guarracino/chromosome_communities/chrY.recombinant_regions.eid0900.pdf



f=/lizardfs/guarracino/chromosome_communities/untangle_sex/grounded/recombinant_regions/chrSEX+refs.fa.gz.2ed2c67.04f1c29.22fc5c8.smooth.final.untangle.ALL.e50000.m1000.grounded.reliable.recombinant_regions.table.counts.tsv
awk '$1 == 0 && $2 == 0.9 && $6 > 0' $f | cut -f 3,4,5,6 | awk '$4 > 0' | bedtools sort | bedtools merge | awk -v OFS='\t' '{SUM+=$3-$2}END{print(SUM)}'
awk '$1 == 0 && $2 == 0.9 && $6 > 0' $f | cut -f 3,4,5,6 | awk '$4 > 0' | bedtools sort | bedtools merge | grep chrX | awk -v OFS='\t' '{SUM+=$3-$2}END{print(SUM)}'
awk '$1 == 0 && $2 == 0.9 && $6 > 0' $f | cut -f 3,4,5,6 | awk '$4 > 0' | bedtools sort | bedtools merge | grep chrY | awk -v OFS='\t' '{SUM+=$3-$2}END{print(SUM)}'

awk '$1 == 0 && $2 == 0.9 && $6 > 19' $f | cut -f 3,4,5,6 | awk '$4 > 0' | bedtools sort | bedtools merge | awk -v OFS='\t' '{SUM+=$3-$2}END{print(SUM)}'
awk '$1 == 0 && $2 == 0.9 && $6 > 19' $f | cut -f 3,4,5,6 | awk '$4 > 0' | bedtools sort | bedtools merge | grep chrX | awk -v OFS='\t' '{SUM+=$3-$2}END{print(SUM)}'
awk '$1 == 0 && $2 == 0.9 && $6 > 19' $f | cut -f 3,4,5,6 | awk '$4 > 0' | bedtools sort | bedtools merge | grep chrY | awk -v OFS='\t' '{SUM+=$3-$2}END{print(SUM)}'