Skip to content

Latest commit

 

History

History
405 lines (348 loc) · 15.2 KB

1_Community.md

File metadata and controls

405 lines (348 loc) · 15.2 KB

Community detection

We evaluate the chromosome separation among the HPRCy1v2 assemblies.

We first apply all-to-all approximate mapping:

RUN_WFMASH=/home/guarracino/tools/wfmash/build/bin/wfmash-ad8aebae1be96847839778af534866bc9545adb9

mkdir -p /lizardfs/guarracino/chromosome_communities/mappings/HPRCy1v2genbank

# Mappings without references included
PATH_HPRCY1_FA_GZ=/lizardfs/guarracino/chromosome_communities/assemblies/HPRCy1v2genbank.fa.gz

p=95
s=50k
l=250k
n=93 # There are 94 haplotypes, so the max `-n` should be equal to `94 - 1`
sbatch -p workers -c 48 --job-name all-vs-all --wrap "hostname; cd /scratch; \time -v $RUN_WFMASH $PATH_HPRCY1_FA_GZ -t 48 -Y '#' -p $p -s $s -l $l -n $n -H 0.001 -m > HPRCy1v2genbank.self.s$s.l$l.p$p.n$n.h0001.paf && mv HPRCy1v2genbank.self.s$s.l$l.p$p.n$n.h0001.paf /lizardfs/guarracino/chromosome_communities/mappings/HPRCy1v2genbank/"


# Mappings with chm13v2 included
PATH_HPRCY1_REFS_FA_GZ=/lizardfs/guarracino/chromosome_communities/assemblies/HPRCy1v2genbank+chm13v2.fa.gz

w=800
for s in 50k; do
  for ln in 5; do
    s_no_k=${s::-1}
    l_no_k=$(echo $s_no_k '*' $ln | bc)
    l=${l_no_k}k
    
    for p in 98; do    
      if [[ $s == "100k" && $p == "98" ]]; then
        w=20000
      elif [[ $s == "100k" && $p == "95" ]]; then
        w=10000
      elif [[ $s == "50k" && $p == "98" ]]; then
        w=10000
      elif [[ $s == "50k" && $p == "95" ]]; then
        w=5000
      elif [[ $s == "20k" && $p == "98" ]]; then
        w=4000
      elif [[ $s == "20k" && $p == "95" ]]; then
        w=2000
      fi
    
      echo $s $p $l $w
      
      # Runs with stringent n values are for visualization
      for n in 5 3; do
        sbatch -p workers -c 48 --job-name all-vs-all --wrap "hostname; cd /scratch; \time -v $RUN_WFMASH $PATH_HPRCY1_REFS_FA_GZ -t 48 -Y '#' -p $p -s $s -l $l -n $n -H 0.001 -w $w -m > HPRCy1v2genbank+chm13v2.self.s$s.l$l.p$p.n$n.h0001.big_w.paf && mv HPRCy1v2genbank+chm13v2.self.s$s.l$l.p$p.n$n.h0001.big_w.paf /lizardfs/guarracino/chromosome_communities/mappings/HPRCy1v2genbank/"
      done
    done
  done
done

Then, we collect mappings between sequences at least l kbp long:

cd /lizardfs/guarracino/chromosome_communities/mappings/HPRCy1v2genbank/

for s in 50k; do
  for ln in 5; do
    s_no_k=${s::-1}
    l_no_k=$(echo $s_no_k '*' $ln | bc)
    l=${l_no_k}k
    
    for p in 95; do  
      for n in 93 5 3; do
        for L in 1000000; do
          for prefix in HPRCy1v2genbank+chm13v2 HPRCy1v2genbank; do
            for big_y in Y N; do
                PREFIX=$prefix.self.s$s.l$l.p$p.n$n.h0001
                if [ $big_y == Y ]; then
                    PREFIX=$prefix.self.s$s.l$l.p$p.n$n.h0001.big_w
                fi
                if [[ -s $PREFIX.paf && ! -s $PREFIX.l$L.paf ]]; then
                  echo $s $p $n $l $prefix $big_y
                  awk -v len=$L '$2 >= len && $7 >= len' $PREFIX.paf > $PREFIX.l$L.paf
                fi
            done
          done
        done
      done
    done
  done
done

To evaluate chromosome communities, we build a mapping graph from our mappings using the paf2net script delivered in pggb repository. In this graph, nodes are contigs and edges are mappings between them.

for s in 50k; do
  for ln in 5; do
    s_no_k=${s::-1}
    l_no_k=$(echo $s_no_k '*' $ln | bc)
    l=${l_no_k}k
    
    for p in 95; do  
      for n in 93 5 3; do
        for L in 1000000; do
          for prefix in HPRCy1v2genbank+chm13v2 HPRCy1v2genbank; do
            for big_y in Y N; do
              PAF=$prefix.self.s$s.l$l.p$p.n$n.h0001.l$L.paf
              if [ $big_y == Y ]; then
                PAF=$prefix.self.s$s.l$l.p$p.n$n.h0001.big_w.l$L.paf
              fi

              if [[ -s $PAF && ! -s $PAF.edges.list.txt ]]; then
                echo $s $l $p $n $L $prefix $big_y
                python3 /home/guarracino/tools/pggb/scripts/paf2net.py -p $PAF
                
                ID2NAME=$PAF.vertices.id2name.txt
                if [ $prefix == "HPRCy1v2genbank" ]; then
                  # Prepare labels by contig
                  join -1 2 -2 1 -a 1 -e 'unmapped' -o '1.1 1.2 2.2' \
                    <(sort -k 2,2 $PAF.vertices.id2name.txt) \
                    <(sort -k 1,1 /lizardfs/guarracino/chromosome_communities/assemblies/partitioning/pq_info/*.partitioning_with_pq.tsv) | cut -f 1,2,3 -d ' ' | \
                    awk '{print($1,$2"-"$3)}' > $PAF.vertices.id2name.chr.txt
                    
                  ID2NAME=$PAF.vertices.id2name.chr.txt
                fi
              fi
            done
          done
        done
      done
    done
  done
done

Then we identify the communities by applying the Leiden algorithm:

for s in 50k; do
  for ln in 5; do
    s_no_k=${s::-1}
    l_no_k=$(echo $s_no_k '*' $ln | bc)
    l=${l_no_k}k
    
    for p in 95; do  
      for n in 93; do
        for L in 1000000; do
          for prefix in HPRCy1v2genbank; do
            for big_y in Y N; do
                PAF=$prefix.self.s$s.l$l.p$p.n$n.h0001.l$L.paf
                if [ $big_y == Y ]; then
                    PAF=$prefix.self.s$s.l$l.p$p.n$n.h0001.big_w.l$L.paf
                fi

                if [[ -s $PAF && ! -s $PAF.edges.weights.txt.community.0.txt ]]; then
                  echo $s $l $p $n $L $prefix $big_y
                                 
                  ID2NAME=$PAF.vertices.id2name.txt
                  if [ $prefix == "HPRCy1v2genbank" ]; then                    
                    ID2NAME=$PAF.vertices.id2name.chr.txt
                  fi
                                 
                  # guix install python-igraph
                  python3 /home/guarracino/tools/pggb/scripts/net2communities.py \
                    -e $PAF.edges.list.txt \
                    -w $PAF.edges.weights.txt \
                    -n $ID2NAME --accurate-detection
                fi
            done          
          done
        done
      done
    done
  done
done

Finally, we check how chromosomes where partitioned:

cd /lizardfs/guarracino/chromosome_communities/mappings/HPRCy1v2genbank/

s=50k
l=250k
p=95
n=93
L=1000000
PAF=HPRCy1v2genbank.self.s$s.l$l.p$p.n$n.h0001.l$L.paf

# A look in the communities
ls $PAF.edges.weights.txt.community.*.txt | while read f; do echo $f; cat $f | cut -f 2 -d '-' | cut -f 2 -d '#' | sort | uniq -c | sort -k 1nr | awk '$1 > 0' ; done

# Compute community sizes
rm $PAF.community2size.tsv
ls $PAF.edges.weights.txt.community.*.txt | while read f; do 
  n=$(echo $f | rev | cut -f 2 -d '.' | rev)
  
  join -1 1 -2 1 \
    <( awk '{split($1, a, "-"); print(a[1], a[2])}' $f | sed 's/chm13#//g' | sed 's/grch38#//g' | sed 's/_pq//g' | sed 's/_p//g' | sed 's/_q//g' | sort -k 1,1) \
    <( cut -f 1,2 /lizardfs/guarracino/chromosome_communities/assemblies/HPRCy1v2genbank.fa.gz.fai | sort -k 1,1 ) |\
     awk -v OFS='\t' -v n=$n '{a[$2]+=$3}END { for (key in a) { print (n, key, a[key]) } }' >> $PAF.community2size.tsv
done

# Compute community composition, adding a column regarding their size (remove the first column for the Supplementary Table 1)
python3 /lizardfs/guarracino/chromosome_communities/scripts/get_table_about_communities.py $PAF.edges.weights.txt.community.*.txt | cut -f 1-26,28 > $PAF.community.leiden.tsv
cat $PAF.community.leiden.tsv | column -t

# Plot the community composition
# Total length of all contigs
TOTAL_SEQUENCE_BP=$(awk -F'\t' 'BEGIN{LEN=0}{ LEN+=$2 }END{print LEN}' /lizardfs/guarracino/chromosome_communities/assemblies/HPRCy1v2genbank.fa.gz.fai)
Rscript /lizardfs/guarracino/chromosome_communities/scripts/plot_community_table.R \
  $PAF.community.leiden.tsv \
  $PAF.community2size.tsv \
  $TOTAL_SEQUENCE_BP \
  $PAF.community.leiden.composition.pdf


# File for mapping contigs to communities (used later for creating files for gephi)
ls $PAF.edges.weights.txt.community.*.txt | while read f; do
  n=$(echo $f | rev | cut -f 2 -d '.' | rev)
  cut -f 1 $f -d '-' | awk -v OFS='\t' -v n=$n '{print($1,n)}' >> $PAF.contig2community.tsv
done

Info:

echo "Total number of contigs"
cat /lizardfs/guarracino/chromosome_communities/assemblies/HPRCy1v2genbank.fa.gz.fai | wc -l

echo "Total length of all contigs"
awk -F'\t' 'BEGIN{LEN=0}{ LEN+=$2 }END{print LEN}' /lizardfs/guarracino/chromosome_communities/assemblies/HPRCy1v2genbank.fa.gz.fai

echo "Mapped contigs"
cut -f 1,6 HPRCy1v2genbank.self.s$s.l$l.p$p.n$n.h0001.paf | tr '\t' '\n' | sort | uniq | wc -l

echo "Mapped filtered contigs ($L bps)"
cut -f 1,6 $PAF | tr '\t' '\n' | sort | uniq | wc -l

echo "Mapped filtered sequence"
cat \
  <( cut -f 1,2 $PAF | sort | uniq ) \
  <( cut -f 6,7 $PAF | sort | uniq ) | sort | uniq | \
  awk -F'\t' 'BEGIN{LEN=0}{ LEN+=$2 }END{print LEN}'


# Contigs of the communities (manually selected) containing only not partitioned contigs
grep -f <(cat \
  HPRCy1v2genbank.self.s50k.l250k.p95.n93.h0001.l1000000.paf.edges.weights.txt.community.28.txt \
  HPRCy1v2genbank.self.s50k.l250k.p95.n93.h0001.l1000000.paf.edges.weights.txt.community.29.txt \
  HPRCy1v2genbank.self.s50k.l250k.p95.n93.h0001.l1000000.paf.edges.weights.txt.community.30.txt | cut -f 1 -d '-'
  ) /lizardfs/guarracino/chromosome_communities/assemblies/HPRCy1v2genbank.fa.gz.fai | cut -f 1,2

Generate files for gephi:

cd /lizardfs/guarracino/chromosome_communities/mappings/HPRCy1v2genbank/

s=50k
l=250k
p=95
n=3
L=1000000
PAF=HPRCy1v2genbank+chm13v2.self.s$s.l$l.p$p.n$n.h0001.big_w.l$L.paf

# Add labels only to acrocentric chromosomes
( echo "Id,Label,Chromosome"; \
  join -1 2 -2 1 -a 1 -e 'unmapped' -o '1.1 1.2 2.2' \
    <(sort -k 2,2 $PAF.vertices.id2name.txt) \
    <(cat \
      <(sort -k 1,1 /lizardfs/guarracino/chromosome_communities/assemblies/partitioning/pq_info/*.partitioning_with_pq.tsv |\
          sed 's/chm13#//' | sed 's/grch38#//') \
      <(echo -e chm13#chr1"\t"chr1_pq) \
      <(echo -e chm13#chr2"\t"chr2_pq) \
      <(echo -e chm13#chr3"\t"chr3_pq) \
      <(echo -e chm13#chr4"\t"chr4_pq) \
      <(echo -e chm13#chr5"\t"chr5_pq) \
      <(echo -e chm13#chr6"\t"chr6_pq) \
      <(echo -e chm13#chr7"\t"chr7_pq) \
      <(echo -e chm13#chr8"\t"chr8_pq) \
      <(echo -e chm13#chr9"\t"chr9_pq) \
      <(echo -e chm13#chr10"\t"chr10_pq) \
      <(echo -e chm13#chr11"\t"chr11_pq) \
      <(echo -e chm13#chr12"\t"chr12_pq) \
      <(echo -e chm13#chr13"\t"chr13_pq) \
      <(echo -e chm13#chr14"\t"chr14_pq) \
      <(echo -e chm13#chr15"\t"chr15_pq) \
      <(echo -e chm13#chr16"\t"chr16_pq) \
      <(echo -e chm13#chr17"\t"chr17_pq) \
      <(echo -e chm13#chr18"\t"chr18_pq) \
      <(echo -e chm13#chr19"\t"chr19_pq) \
      <(echo -e chm13#chr20"\t"chr20_pq) \
      <(echo -e chm13#chr21"\t"chr21_pq) \
      <(echo -e chm13#chr22"\t"chr22_pq) \
      <(echo -e chm13#chrX"\t"chrX_pq) \
      <(echo -e chm13#chrY"\t"chrY_pq) \
      <(echo -e grch38#chr1"\t"chr1_pq) \
      <(echo -e grch38#chr2"\t"chr2_pq) \
      <(echo -e grch38#chr3"\t"chr3_pq) \
      <(echo -e grch38#chr4"\t"chr4_pq) \
      <(echo -e grch38#chr5"\t"chr5_pq) \
      <(echo -e grch38#chr6"\t"chr6_pq) \
      <(echo -e grch38#chr7"\t"chr7_pq) \
      <(echo -e grch38#chr8"\t"chr8_pq) \
      <(echo -e grch38#chr9"\t"chr9_pq) \
      <(echo -e grch38#chr10"\t"chr10_pq) \
      <(echo -e grch38#chr11"\t"chr11_pq) \
      <(echo -e grch38#chr12"\t"chr12_pq) \
      <(echo -e grch38#chr13"\t"chr13_pq) \
      <(echo -e grch38#chr14"\t"chr14_pq) \
      <(echo -e grch38#chr15"\t"chr15_pq) \
      <(echo -e grch38#chr16"\t"chr16_pq) \
      <(echo -e grch38#chr17"\t"chr17_pq) \
      <(echo -e grch38#chr18"\t"chr18_pq) \
      <(echo -e grch38#chr19"\t"chr19_pq) \
      <(echo -e grch38#chr20"\t"chr20_pq) \
      <(echo -e grch38#chr21"\t"chr21_pq) \
      <(echo -e grch38#chr22"\t"chr22_pq) \
      <(echo -e grch38#chrX"\t"chrX_pq)  \
      <(echo -e grch38#chrY"\t"chrY_pq) | sort)  | \
        awk '{print($1,$2,$3)}' | tr ' ' ',' \
  ) > $PAF.nodes.tmp.csv

# Add community information
join -1 1 -2 2 \
  <(cut -f 1,2 HPRCy1v2genbank.self.s$s.l$l.p$p.n93.h0001.l$L.paf.community.leiden.tsv | sort -k 1) \
  <(sort -k 2 HPRCy1v2genbank.self.s$s.l$l.p$p.n93.h0001.l$L.paf.contig2community.tsv) |  awk -v OFS='\t' '{print $3,$2}' > $PAF.contig2namedCommunity.tsv

(echo "Id,Label,Chromosome,Community"; \
  join -1 2 -2 1 \
    <(sed '1d' $PAF.nodes.tmp.csv | tr ',' ' ' | sort -k 2) \
    <(sort -k 1 $PAF.contig2namedCommunity.tsv) | \
      awk -v OFS=',' '{print $2,$1,$3,$4}') > $PAF.nodes.tmp2.csv

# Add color column (for the "givecolortonodes" plugin [that doesn't work!])
(echo "Id,Label,Chromosome,Community,ColorOfNode"; \
  join -1 3 -2 1 \
    <(sed '1d' $PAF.nodes.tmp2.csv | tr ',' ' ' | sort -k 3) \
    <(sed '1d' /lizardfs/guarracino/chromosome_communities/data/chromosome.colors.csv | tr ',' ' ' | sort -k 1) | \
      awk -v OFS=',' '{print $2,$3,$1,$4,$5}') \
     > $PAF.nodes.csv

rm $PAF.contig2namedCommunity.tsv $PAF.nodes.tmp*.csv

# Edges
( echo "Source,Target,Weight"; \
  paste -d ' ' $PAF.edges.list.txt $PAF.edges.weights.txt | tr ' ' ','
) > $PAF.edges.csv