Merge Olga's changes #260

Open
wants to merge 13 commits into main
145 changes: 77 additions & 68 deletions CPU/juicer.sh
@@ -628,62 +628,62 @@ fi

#Skip if post-processing only is required
if [ -z $postproc ]
then
# if we haven't already zipped up merged_dedup
if [ ! -s ${outputdir}/merged_dedup.bam ]
then
# Check that dedupping worked properly
# in an ideal world, we would check this in split_rmdups and not remove before we know they are correct
size1=$(samtools view $sthreadstring -h ${outputdir}/merged_sort.bam | wc -l | awk '{print $1}')
size2=$(wc -l ${outputdir}/merged_dedup.sam | awk '{print $1}')
if [ $size1 -ne $size2 ]
then
echo "***! Error! The sorted file and dups/no dups files do not add up, or were empty."
exit 1
fi
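# convert the deduplicated alignments to juicer_tools "pre" text format at MAPQ 1 and MAPQ 30;
# -F 1024 excludes reads flagged as duplicates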
samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.sam | awk -v mapq=1 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged1.txt
samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.sam | awk -v mapq=30 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged30.txt
else
if [ ! -s ${outputdir}/merged1.txt ]
then
samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.bam | awk -v mapq=1 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged1.txt
fi
if [ ! -s ${outputdir}/merged30.txt ]
then
samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.bam | awk -v mapq=30 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged30.txt
fi
fi

if [[ $threadsHic -gt 1 ]]
then
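# multi-threaded hic building: create per-chromosome indices of the merged text files
# (the 500000 argument is the chunk size used by index_by_chr.awk)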
if [ ! -s ${outputdir}/merged1_index.txt ]
then
time ${juiceDir}/scripts/common/index_by_chr.awk ${outputdir}/merged1.txt 500000 > ${outputdir}/merged1_index.txt
fi
if [ ! -s ${outputdir}/merged30_index.txt ]
then
time ${juiceDir}/scripts/common/index_by_chr.awk ${outputdir}/merged30.txt 500000 > ${outputdir}/merged30_index.txt
fi
fi

if [ ! -s ${outputdir}/merged_dedup.bam ]
then
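# compress the deduplicated SAM to BAM; the intermediates are removed only if the conversion succeeds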
if samtools view -b $sthreadstring ${outputdir}/merged_dedup.sam > ${outputdir}/merged_dedup.bam
then
rm ${outputdir}/merged_dedup.sam
rm ${outputdir}/merged_sort.bam
fi
fi

if [ "$methylation" = 1 ]
then
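# methylation mode: coordinate-sort the dedup BAM, run the configured extractor ($call_methyl)
# once for the standard output and once for a CpG/CHG/CHH cytosine report, then summarize
# bisulfite conversion with conversion.sh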
$load_methyl
samtools sort $sthreadstring ${outputdir}/merged_dedup.bam > ${outputdir}/merged_dedup_sort.bam
$call_methyl extract -F 1024 --keepSingleton --keepDiscordant $refSeq ${outputdir}/merged_dedup_sort.bam
$call_methyl extract -F 1024 --keepSingleton --keepDiscordant --cytosine_report --CHH --CHG $refSeq ${outputdir}/merged_dedup_sort.bam
${juiceDir}/scripts/common/conversion.sh ${outputdir}/merged_dedup_sort.cytosine_report.txt > ${outputdir}/conversion_report.txt
rm ${outputdir}/merged_dedup_sort.bam ${outputdir}/merged_dedup_sort.cytosine_report.txt*
fi


@@ -692,24 +692,24 @@ if [ -z $postproc ]
tail -n1 $headfile | awk '{printf"%-1000s\n", $0}' > $outputdir/inter.txt
if [ $singleend -eq 1 ]
then
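# single-end data: duplicates are split into regular and single-end duplicates;
# the rt:A:7 tag appears to mark the single-end case (counted as singdup)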
ret=$(samtools view $sthreadstring -f 1024 -F 256 $outputdir/merged_dedup.bam | awk '{if ($0~/rt:A:7/){singdup++}else{dup++}}END{print dup,singdup}')
dups=$(echo $ret | awk '{print $1}')
singdups=$(echo $ret | awk '{print $2}')
cat $splitdir/*.res.txt | awk -v dups=$dups -v singdups=$singdups -v ligation=$ligation -v singleend=1 -f ${juiceDir}/scripts/common/stats_sub.awk >> $outputdir/inter.txt
else
dups=$(samtools view -c -f 1089 -F 256 $sthreadstring $outputdir/merged_dedup.bam)
cat $splitdir/*.res.txt | awk -v dups=$dups -v ligation=$ligation -f ${juiceDir}/scripts/common/stats_sub.awk >> $outputdir/inter.txt
fi

cp $outputdir/inter.txt $outputdir/inter_30.txt

if [ $assembly -eq 1 ]
then
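# assembly mode has no genome path, so juicer_tools statistics is run with "none"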
${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter.txt $outputdir/merged1.txt none
${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter_30.txt $outputdir/merged30.txt none
else
${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter.txt $outputdir/merged1.txt $genomePath
${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter_30.txt $outputdir/merged30.txt $genomePath
fi

# if early exit, we stop here, once the stats are calculated
@@ -718,28 +718,28 @@ if [ -z $postproc ]
if [ $assembly -eq 1 ]
then
samtools view $sthreadstring -O SAM -F 1024 $outputdir/merged_dedup.*am | awk -v mnd=1 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged_nodups.txt
fi
export splitdir=${splitdir}; export outputdir=${outputdir}; export early=1;
if ${juiceDir}/scripts/common/check.sh && [ "$cleanup" = 1 ]
then
${juiceDir}/scripts/common/cleanup.sh
fi
exit
fi
export IBM_JAVA_OPTIONS="-Xmx150000m -Xgcthreads1"
export _JAVA_OPTIONS="-Xmx150000m -Xms150000m"
mkdir ${outputdir}"/HIC_tmp"
if [ "$qc" = 1 ] || [ "$insitu" = 1 ]
then
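# QC / in situ runs stop at 1 kb resolution; full runs add 500/200/100 bp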
resstr="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000"
else
resstr="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100"
fi
if [ "$nofrag" -eq 1 ]
then
fragstr=""
else
fragstr="-f $site_file"
fi

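# build the MAPQ>=1 .hic map; -s/-g attach the stats and histograms computed above,
# and -n is assumed to defer normalization (vectors can be added later with addNorm)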
time ${juiceDir}/scripts/common/juicer_tools pre -n -s $outputdir/inter.txt -g $outputdir/inter_hists.m $fragstr $resstr $threadHicString $outputdir/merged1.txt $outputdir/inter.hic $genomePath
@@ -755,8 +755,17 @@ if [ -z $postproc ]
# POSTPROCESSING
if [ "$qc" != 1 ]
then
${juiceDir}/scripts/common/juicer_postprocessing.sh -j ${juiceDir}/scripts/common/juicer_tools -i ${outputdir}/inter_30.hic -m ${juiceDir}/references/motif -g ${genomeID}

# build mapq 30 accessibility as part of postprocessing for Ultima
# needs kentUtils
if [ $singleend -eq 1 ] && [[ "$site" == "none" ]] && [[ ! -z "$genomePath" ]]
then
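# count read-end positions (columns 2-3 and 6-7 of merged30.txt are chr/pos pairs),
# write them out as a bedGraph, and convert to bigWig with bedGraphToBigWig (kentUtils)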
awk 'BEGIN{OFS="\t"}{cut[$2" "$3]++; cut[$6" "$7]++}END{for(i in cut){split(i, arr, " "); print arr[1], arr[2]-1, arr[2], cut[i]}}' merged30.txt | sort -k1,1 -k2,2n --parallel=$threads -S6G > merged30.bedgraph
bedGraphToBigWig merged30.bedgraph $genomePath inter_30.bw
fi
fi
fi
#CHECK THAT PIPELINE WAS SUCCESSFUL
export early=$earlyexit
49 changes: 42 additions & 7 deletions CPU/mega_from_bams_diploid.sh
@@ -1,6 +1,6 @@
#!/bin/bash
{
#### Description: Wrapper script to phase genomic variants from ENCODE DCC hic-pipeline.
#### Description: Wrapper script to calculate a "mega" hic file from a set of bams, with optional diploid portion.
#### Usage: bash ./mega.sh [-v|--vcf <path_to_vcf>] <path_to_merged_dedup_bam_1> ... <path_to_merged_dedup_bam_N>.
#### Input: list of merged_dedup.bam files corresponding to individual Hi-C experiments.
#### Output: "mega" hic map and "mega" chromatin accessibility bw file.
@@ -16,7 +16,7 @@ USAGE="
*****************************************************
Simplified mega script for ENCODE DCC hic-pipeline.

USAGE: ./mega.sh -c|--chrom-sizes <path_to_chrom_sizes_file> [-g|--genome-id genome_id] [-r|--resolutions resolutions_string] [-v|--vcf <path_to_vcf>] [-C|--exclude-chr-from-diploid chr_list] [--separate-homologs] [-p|--psf <path_to_psf>] [--reads-to-homologs <path_to_reads_to_homologs_file>] [--juicer-dir <path_to_juicer_dir>] [--phaser-dir <path_to_phaser_dir>] [-t|--threads thread_count] [-T|--threads-hic hic_thread_count] [--from-stage stage] [--to-stage stage] <path_to_merged_dedup_bam_1> ... <path_to_merged_dedup_bam_N>
USAGE: ./mega.sh -c|--chrom-sizes <path_to_chrom_sizes_file> [-g|--genome-id genome_id] [-r|--resolutions resolutions_string] [-v|--vcf <path_to_vcf>] [-C|--exclude-chr-from-diploid chr_list] [--separate-homologs] [-p|--psf <path_to_psf>] [--reads-to-homologs <path_to_reads_to_homologs_file>] [--juicer-dir <path_to_juicer_dir>] [--phaser-dir <path_to_phaser_dir>] [-t|--threads thread_count] [-T|--threads-hic hic_thread_count] [--shortcut-stats-1 inter.hic_1,...inter.hic_N] [--shortcut-stats-30 inter_30.hic_1,...inter_30.hic_N] [--from-stage <stage>] [--to-stage <stage>] <path_to_merged_dedup_bam_1> ... <path_to_merged_dedup_bam_N>

DESCRIPTION:
This is a simplified mega.sh script to produce aggregate Hi-C maps and chromatin accessibility tracks from multiple experiments. The pipeline includes optional diploid steps which produce diploid versions of the contact maps and chromatin accessibility tracks.
@@ -39,6 +39,12 @@ HAPLOID PORTION
-r|--resolutions [string]
Comma-separated resolutions at which to build the hic files. Default: 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100,50,20,10.

--shortcut-stats-1 [inter.hic_1,...inter.hic_N]
Comma-separated list of hic files used to quickly calculate the mega stats file at mapq=1.

--shortcut-stats-30 [inter_30.hic_1,...inter_30.hic_N]
Comma-separated list of hic files used to quickly calculate the mega stats file at mapq=30.

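Example (illustrative paths):
./mega.sh -c hg38.chrom.sizes --shortcut-stats-1 rep1/inter.hic,rep2/inter.hic --shortcut-stats-30 rep1/inter_30.hic,rep2/inter_30.hic rep1/merged_dedup.bam rep2/merged_dedup.bam
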
DIPLOID PORTION:
-v|--vcf [path_to_vcf]
Path to a Variant Call Format (vcf) file containing phased sequence variation data, e.g. as generated by the ENCODE DCC Hi-C variant calling & phasing pipeline. Passing a vcf file invokes a diploid section of the script.
@@ -78,6 +84,7 @@ WORKFLOW CONTROL:

# defaults:
resolutionsToBuildString="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100,50,20,10"
genome_id="hg38"
exclude_chr="Y|chrY|MT|chrM"
separate_homologs=false
mapq=1 ##lowest mapq of interest
@@ -138,6 +145,16 @@ while :; do
resolutionsToBuildString="-r "$OPTARG
shift
;;
--shortcut-stats-1) OPTARG=$2
echo "... --shortcut-stats-1 flag was triggered with $OPTARG value." >&1
shortcut_stats_1=$OPTARG
shift
;;
--shortcut-stats-30) OPTARG=$2
echo "... --shortcut-stats-30 flag was triggered with $OPTARG value." >&1
shortcut_stats_30=$OPTARG
shift
;;
## DIPLOID PORTION
-v|--vcf) OPTARG=$2
if [ -s $OPTARG ] && [[ $OPTARG == *.vcf ]]; then
@@ -332,7 +349,7 @@ if [ "$first_stage" == "hic" ]; then
export SHELL=$(type -p bash)
doit () {
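# for one chromosome region ($1), pull reads at or above the given MAPQ from reads.sorted.bam
# and emit pre-style records: intra-chromosomal pairs go to stdout, inter-chromosomal pairs
# to stderr; positions come from the ip:i:/mp:i: tags written upstream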
mapq=$2
samtools view -@ 2 -h -q $mapq reads.sorted.bam $1 | awk -F '\t' -v mapq=$mapq '{for(i=12;i<=NF;i++){if($i~/^ip:i:/){ip=substr($i,6)}else if ($i~/^mp:i:/){mp=substr($i,6)}else if ($i~/^MQ:i:/){mq=substr($i,6)}}}(mq<mapq){next}$7=="="{if(ip>mp){next}else if (ip==mp){keep[$1]=$0}else{print 0, $3, ip, 0, 0, $3, mp, 1};next}$7<$3{next}{print 0, $3, ip, 0, 0, $7, mp, 1 > "/dev/stderr"}END{for(i in keep){n=split(keep[i],a,"\t"); for(s=12;s<=n;s++){if(a[s]~"^ip:i:"){ip=substr(a[s],6)}}; print 0, a[3], ip, 0, 0, a[3], ip, 1}}'
samtools view -@ 2 -q $mapq reads.sorted.bam $1 | awk -F '\t' -v mapq=$mapq '{ip=0; mp=0; mq=-1; cb=0; chimeric=0; for(i=12;i<=NF;i++){if($i~/^ip:i:/){ip=substr($i,6)}else if ($i~/^mp:i:/){mp=substr($i,6)}else if ($i~/^MQ:i:/){mq=substr($i,6)}else if ($i~/cb:Z:/){cb=i}else if ($i~/SA:Z:/){chimeric=i}}}(mq<mapq){next}$7=="*"{if(!chimeric){next}else{split(substr($chimeric,6),a,","); $7=a[1]}}$7=="="{$7=$3}!cb{next}{cbchr1=gensub("_","","g",$3);cbchr2=gensub("_","","g",$7);testcb1="cb:Z:"cbchr1"_"cbchr2"_"; testcb2="cb:Z:"cbchr2"_"cbchr1"_"}($cb!~testcb1&&$cb!~testcb2){next}(!ip||!mp){next}$7==$3{if(ip>mp){next}else if (ip==mp){keep[$1]=$3" "ip}else{print 0, $3, ip, 0, 0, $7, mp, 1};next}$7<$3{next}{print 0, $3, ip, 0, 0, $7, mp, 1 > "/dev/stderr"}END{for(rd in keep){n=split(keep[rd], a, " "); print 0, a[1], a[2], 0, 0, a[1], a[2], 1}}'
}

export -f doit
@@ -365,8 +382,16 @@ if [ "$first_stage" == "hic" ]; then
# awk '{print NR}END{print NR+1}' $chrom_sizes | parallel --will-cite "rm out.{}.txt err.{}.txt"
# rm temp.log

touch inter.txt inter_hists.m inter_30.txt inter_hists_30.m

touch inter.txt inter_hists.m
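# optional shortcut: if per-experiment inter.hic files were supplied, combine their
# statistics with merge-stats.jar instead of recomputing them from the merged reads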
if [ ! -z ${shortcut_stats_1} ]; then
tmpstr=""
IFS=',' read -ra STATS <<< "$shortcut_stats_1"
for i in "${STATS[@]}"; do
[ -s $i ] && tmpstr=$tmpstr" "$i || { echo "One or more of the files listed for shortcutting stats calculation does not exist at expected location or is empty. Continuing without the stats!"; tmpstr=""; break; }
done
[ "$tmpstr" != "" ] && java -jar ${juicer_dir}/scripts/merge-stats.jar inter ${tmpstr:1}
fi

if [[ $threadsHic -gt 1 ]] && [[ ! -s merged1_index.txt ]]
then
"${juicer_dir}"/scripts/index_by_chr.awk merged1.txt 500000 > merged1_index.txt
@@ -376,9 +401,19 @@ if [ "$first_stage" == "hic" ]; then
threadHicString=""
fi

touch inter_30.txt inter_30_hists.m
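# same shortcut as above, for the MAPQ>=30 statistics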
if [ ! -z ${shortcut_stats_30} ]; then
tmpstr=""
IFS=',' read -ra STATS <<< "$shortcut_stats_30"
for i in "${STATS[@]}"; do
[ -s $i ] && tmpstr=$tmpstr" "$i || { echo "One or more of the files listed for shortcutting stats calculation does not exist at expected location or is empty. Continuing without the stats!"; tmpstr=""; break; }
done
[ "$tmpstr" != "" ] && java -jar ${juicer_dir}/scripts/merge-stats.jar inter_30 ${tmpstr:1}
fi

if [[ $threadsHic -gt 1 ]] && [[ ! -s merged30_index.txt ]]
then
"${juicer_dir}"/scripts/common/index_by_chr.awk merged30.txt 500000 > merged30_index.txt
"${juicer_dir}"/scripts/index_by_chr.awk merged30.txt 500000 > merged30_index.txt
tempdirPre30="HIC30_tmp" && mkdir "${tempdirPre30}"
threadHic30String="--threads $threadsHic -i merged30_index.txt -t ${tempdirPre30}"
else
Expand All @@ -396,7 +431,7 @@ if [ "$first_stage" == "hic" ]; then
## TODO: check for failure

"${juicer_dir}"/scripts/juicer_tools pre -n -s inter_30.txt -g inter_30_hists.m -q 30 "$resolutionsToBuildString" "$threadHic30String" merged30.txt inter_30.hic "$chrom_sizes"
"${juicer_dir}"/scripts/juicer_tools addNorm --threads $threadsHic "${outputDir}"/inter_30.hic
"${juicer_dir}"/scripts/juicer_tools addNorm --threads $threadsHic inter_30.hic
rm -Rf "${tempdirPre30}"
## TODO: check for failure

Expand Down
Loading