Merge Olga's changes #260

Open
wants to merge 13 commits into main
145 changes: 77 additions & 68 deletions CPU/juicer.sh
@@ -628,62 +628,62 @@ fi

#Skip if post-processing only is required
if [ -z $postproc ]
then
# if we haven't already zipped up merged_dedup
if [ ! -s ${outputdir}/merged_dedup.bam ]
then
# Check that dedupping worked properly
# in an ideal world, we would check this in split_rmdups and not remove before we know they are correct
size1=$(samtools view $sthreadstring -h ${outputdir}/merged_sort.bam | wc -l | awk '{print $1}')
size2=$(wc -l ${outputdir}/merged_dedup.sam | awk '{print $1}')
if [ $size1 -ne $size2 ]
then
echo "***! Error! The sorted file and dups/no dups files do not add up, or were empty."
exit 1
fi
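# convert the deduplicated alignments to juicer_tools "pre" text format at MAPQ 1 and MAPQ 30;
# -F 1024 excludes reads flagged as duplicates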
samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.sam | awk -v mapq=1 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged1.txt
samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.sam | awk -v mapq=30 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged30.txt
else
if [ ! -s ${outputdir}/merged1.txt ]
then
samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.bam | awk -v mapq=1 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged1.txt
fi
if [ ! -s ${outputdir}/merged30.txt ]
then
samtools view $sthreadstring -F 1024 -O sam ${outputdir}/merged_dedup.bam | awk -v mapq=30 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged30.txt
fi
fi

if [[ $threadsHic -gt 1 ]]
then
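# multi-threaded hic building: create per-chromosome indices of the merged text files
# (the 500000 argument is the chunk size used by index_by_chr.awk)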
if [ ! -s ${outputdir}/merged1_index.txt ]
then
time ${juiceDir}/scripts/common/index_by_chr.awk ${outputdir}/merged1.txt 500000 > ${outputdir}/merged1_index.txt
fi
if [ ! -s ${outputdir}/merged30_index.txt ]
then
time ${juiceDir}/scripts/common/index_by_chr.awk ${outputdir}/merged30.txt 500000 > ${outputdir}/merged30_index.txt
fi
fi

if [ ! -s ${outputdir}/merged_dedup.bam ]
then
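# compress the deduplicated SAM to BAM; the intermediates are removed only if the conversion succeeds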
if samtools view -b $sthreadstring ${outputdir}/merged_dedup.sam > ${outputdir}/merged_dedup.bam
then
rm ${outputdir}/merged_dedup.sam
rm ${outputdir}/merged_sort.bam
fi
fi

if [ "$methylation" = 1 ]
then
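# methylation mode: coordinate-sort the dedup BAM, run the configured extractor ($call_methyl)
# once for the standard output and once for a CpG/CHG/CHH cytosine report, then summarize
# bisulfite conversion with conversion.sh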
$load_methyl
samtools sort $sthreadstring ${outputdir}/merged_dedup.bam > ${outputdir}/merged_dedup_sort.bam
$call_methyl extract -F 1024 --keepSingleton --keepDiscordant $refSeq ${outputdir}/merged_dedup_sort.bam
$call_methyl extract -F 1024 --keepSingleton --keepDiscordant --cytosine_report --CHH --CHG $refSeq ${outputdir}/merged_dedup_sort.bam
${juiceDir}/scripts/common/conversion.sh ${outputdir}/merged_dedup_sort.cytosine_report.txt > ${outputdir}/conversion_report.txt
rm ${outputdir}/merged_dedup_sort.bam ${outputdir}/merged_dedup_sort.cytosine_report.txt*
fi


@@ -692,24 +692,24 @@ if [ -z $postproc ]
tail -n1 $headfile | awk '{printf"%-1000s\n", $0}' > $outputdir/inter.txt
if [ $singleend -eq 1 ]
then
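# single-end data: duplicates are split into regular and single-end duplicates;
# the rt:A:7 tag appears to mark the single-end case (counted as singdup)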
ret=$(samtools view $sthreadstring -f 1024 -F 256 $outputdir/merged_dedup.bam | awk '{if ($0~/rt:A:7/){singdup++}else{dup++}}END{print dup,singdup}')
dups=$(echo $ret | awk '{print $1}')
singdups=$(echo $ret | awk '{print $2}')
cat $splitdir/*.res.txt | awk -v dups=$dups -v singdups=$singdups -v ligation=$ligation -v singleend=1 -f ${juiceDir}/scripts/common/stats_sub.awk >> $outputdir/inter.txt
else
dups=$(samtools view -c -f 1089 -F 256 $sthreadstring $outputdir/merged_dedup.bam)
cat $splitdir/*.res.txt | awk -v dups=$dups -v ligation=$ligation -f ${juiceDir}/scripts/common/stats_sub.awk >> $outputdir/inter.txt
fi

cp $outputdir/inter.txt $outputdir/inter_30.txt

if [ $assembly -eq 1 ]
then
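# assembly mode has no genome path, so juicer_tools statistics is run with "none"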
${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter.txt $outputdir/merged1.txt none
${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter_30.txt $outputdir/merged30.txt none
else
${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter.txt $outputdir/merged1.txt $genomePath
${juiceDir}/scripts/common/juicer_tools statistics $site_file $outputdir/inter_30.txt $outputdir/merged30.txt $genomePath
fi

# if early exit, we stop here, once the stats are calculated
@@ -718,28 +718,28 @@ if [ -z $postproc ]
if [ $assembly -eq 1 ]
then
samtools view $sthreadstring -O SAM -F 1024 $outputdir/merged_dedup.*am | awk -v mnd=1 -f ${juiceDir}/scripts/common/sam_to_pre.awk > ${outputdir}/merged_nodups.txt
fi
export splitdir=${splitdir}; export outputdir=${outputdir}; export early=1;
if ${juiceDir}/scripts/common/check.sh && [ "$cleanup" = 1 ]
then
${juiceDir}/scripts/common/cleanup.sh
fi
exit
fi
export IBM_JAVA_OPTIONS="-Xmx150000m -Xgcthreads1"
export _JAVA_OPTIONS="-Xmx150000m -Xms150000m"
mkdir ${outputdir}"/HIC_tmp"
if [ "$qc" = 1 ] || [ "$insitu" = 1 ]
then
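# QC / in situ runs stop at 1 kb resolution; full runs add 500/200/100 bp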
resstr="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000"
else
resstr="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100"
fi
if [ "$nofrag" -eq 1 ]
then
fragstr=""
else
fragstr="-f $site_file"
fi

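# build the MAPQ>=1 .hic map; -s/-g attach the stats and histograms computed above,
# and -n is assumed to defer normalization (vectors can be added later with addNorm)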
time ${juiceDir}/scripts/common/juicer_tools pre -n -s $outputdir/inter.txt -g $outputdir/inter_hists.m $fragstr $resstr $threadHicString $outputdir/merged1.txt $outputdir/inter.hic $genomePath
@@ -755,8 +755,17 @@ if [ -z $postproc ]
# POSTPROCESSING
if [ "$qc" != 1 ]
then
${juiceDir}/scripts/common/juicer_postprocessing.sh -j ${juiceDir}/scripts/common/juicer_tools -i ${outputdir}/inter_30.hic -m ${juiceDir}/references/motif -g ${genomeID}

# build mapq 30 accessibility as part of postprocessing for Ultima
# needs kentUtils
if [ $singleend -eq 1 ] && [[ "$site" == "none" ]] && [[ ! -z "$genomePath" ]]
then
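# count read-end positions (columns 2-3 and 6-7 of merged30.txt are chr/pos pairs),
# write them out as a bedGraph, and convert to bigWig with bedGraphToBigWig (kentUtils)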
awk 'BEGIN{OFS="\t"}{cut[$2" "$3]++; cut[$6" "$7]++}END{for(i in cut){split(i, arr, " "); print arr[1], arr[2]-1, arr[2], cut[i]}}' merged30.txt | sort -k1,1 -k2,2n --parallel=$threads -S6G > merged30.bedgraph
bedGraphToBigWig merged30.bedgraph $genomePath inter_30.bw
fi
fi
fi
#CHECK THAT PIPELINE WAS SUCCESSFUL
export early=$earlyexit
49 changes: 42 additions & 7 deletions CPU/mega_from_bams_diploid.sh
@@ -1,6 +1,6 @@
#!/bin/bash
{
#### Description: Wrapper script to phase genomic variants from ENCODE DCC hic-pipeline.
#### Description: Wrapper script to calculate a "mega" hic file from a set of bams, with optional diploid portion.
#### Usage: bash ./mega.sh [-v|--vcf <path_to_vcf>] <path_to_merged_dedup_bam_1> ... <path_to_merged_dedup_bam_N>.
#### Input: list of merged_dedup.bam files corresponding to individual Hi-C experiments.
#### Output: "mega" hic map and "mega" chromatin accessibility bw file.
@@ -16,7 +16,7 @@ USAGE="
*****************************************************
Simplified mega script for ENCODE DCC hic-pipeline.

USAGE: ./mega.sh -c|--chrom-sizes <path_to_chrom_sizes_file> [-g|--genome-id genome_id] [-r|--resolutions resolutions_string] [-v|--vcf <path_to_vcf>] [-C|--exclude-chr-from-diploid chr_list] [--separate-homologs] [-p|--psf <path_to_psf>] [--reads-to-homologs <path_to_reads_to_homologs_file>] [--juicer-dir <path_to_juicer_dir>] [--phaser-dir <path_to_phaser_dir>] [-t|--threads thread_count] [-T|--threads-hic hic_thread_count] [--from-stage stage] [--to-stage stage] <path_to_merged_dedup_bam_1> ... <path_to_merged_dedup_bam_N>
USAGE: ./mega.sh -c|--chrom-sizes <path_to_chrom_sizes_file> [-g|--genome-id genome_id] [-r|--resolutions resolutions_string] [-v|--vcf <path_to_vcf>] [-C|--exclude-chr-from-diploid chr_list] [--separate-homologs] [-p|--psf <path_to_psf>] [--reads-to-homologs <path_to_reads_to_homologs_file>] [--juicer-dir <path_to_juicer_dir>] [--phaser-dir <path_to_phaser_dir>] [-t|--threads thread_count] [-T|--threads-hic hic_thread_count] [--shortcut-stats-1 inter.hic_1,...inter.hic_N] [--shortcut-stats-30 inter_30.hic_1,...inter_30.hic_N] [--from-stage <stage>] [--to-stage <stage>] <path_to_merged_dedup_bam_1> ... <path_to_merged_dedup_bam_N>

DESCRIPTION:
This is a simplified mega.sh script to produce aggregate Hi-C maps and chromatin accessibility tracks from multiple experiments. The pipeline includes optional diploid steps which produce diploid versions of the contact maps and chromatin accessibility tracks.
@@ -39,6 +39,12 @@ HAPLOID PORTION
-r|--resolutions [string]
Comma-separated resolutions at which to build the hic files. Default: 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100,50,20,10.

--shortcut-stats-1 [inter.hic_1,...inter.hic_N]
Comma-separated list of hic files used to quickly calculate the mega stats file at mapq=1.

--shortcut-stats-30 [inter_30.hic_1,...inter_30.hic_N]
Comma-separated list of hic files used to quickly calculate the mega stats file at mapq=30.

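Example (illustrative paths):
./mega.sh -c hg38.chrom.sizes --shortcut-stats-1 rep1/inter.hic,rep2/inter.hic --shortcut-stats-30 rep1/inter_30.hic,rep2/inter_30.hic rep1/merged_dedup.bam rep2/merged_dedup.bam
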
DIPLOID PORTION:
-v|--vcf [path_to_vcf]
Path to a Variant Call Format (vcf) file containing phased sequence variation data, e.g. as generated by the ENCODE DCC Hi-C variant calling & phasing pipeline. Passing a vcf file invokes a diploid section of the script.
@@ -78,6 +84,7 @@ WORKFLOW CONTROL:

# defaults:
resolutionsToBuildString="-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000,2000,1000,500,200,100,50,20,10"
genome_id="hg38"
exclude_chr="Y|chrY|MT|chrM"
separate_homologs=false
mapq=1 ##lowest mapq of interest
@@ -138,6 +145,16 @@ while :; do
resolutionsToBuildString="-r "$OPTARG
shift
;;
--shortcut-stats-1) OPTARG=$2
echo "... --shortcut-stats-1 flag was triggered with $OPTARG value." >&1
shortcut_stats_1=$OPTARG
shift
;;
--shortcut-stats-30) OPTARG=$2
echo "... --shortcut-stats-30 flag was triggered with $OPTARG value." >&1
shortcut_stats_30=$OPTARG
shift
;;
## DIPLOID PORTION
-v|--vcf) OPTARG=$2
if [ -s $OPTARG ] && [[ $OPTARG == *.vcf ]]; then
@@ -332,7 +349,7 @@ if [ "$first_stage" == "hic" ]; then
export SHELL=$(type -p bash)
doit () {
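# for one chromosome region ($1), pull reads at or above the given MAPQ from reads.sorted.bam
# and emit pre-style records: intra-chromosomal pairs go to stdout, inter-chromosomal pairs
# to stderr; positions come from the ip:i:/mp:i: tags written upstream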
mapq=$2
samtools view -@ 2 -h -q $mapq reads.sorted.bam $1 | awk -F '\t' -v mapq=$mapq '{for(i=12;i<=NF;i++){if($i~/^ip:i:/){ip=substr($i,6)}else if ($i~/^mp:i:/){mp=substr($i,6)}else if ($i~/^MQ:i:/){mq=substr($i,6)}}}(mq<mapq){next}$7=="="{if(ip>mp){next}else if (ip==mp){keep[$1]=$0}else{print 0, $3, ip, 0, 0, $3, mp, 1};next}$7<$3{next}{print 0, $3, ip, 0, 0, $7, mp, 1 > "/dev/stderr"}END{for(i in keep){n=split(keep[i],a,"\t"); for(s=12;s<=n;s++){if(a[s]~"^ip:i:"){ip=substr(a[s],6)}}; print 0, a[3], ip, 0, 0, a[3], ip, 1}}'
samtools view -@ 2 -q $mapq reads.sorted.bam $1 | awk -F '\t' -v mapq=$mapq '{ip=0; mp=0; mq=-1; cb=0; chimeric=0; for(i=12;i<=NF;i++){if($i~/^ip:i:/){ip=substr($i,6)}else if ($i~/^mp:i:/){mp=substr($i,6)}else if ($i~/^MQ:i:/){mq=substr($i,6)}else if ($i~/cb:Z:/){cb=i}else if ($i~/SA:Z:/){chimeric=i}}}(mq<mapq){next}$7=="*"{if(!chimeric){next}else{split(substr($chimeric,6),a,","); $7=a[1]}}$7=="="{$7=$3}!cb{next}{cbchr1=gensub("_","","g",$3);cbchr2=gensub("_","","g",$7);testcb1="cb:Z:"cbchr1"_"cbchr2"_"; testcb2="cb:Z:"cbchr2"_"cbchr1"_"}($cb!~testcb1&&$cb!~testcb2){next}(!ip||!mp){next}$7==$3{if(ip>mp){next}else if (ip==mp){keep[$1]=$3" "ip}else{print 0, $3, ip, 0, 0, $7, mp, 1};next}$7<$3{next}{print 0, $3, ip, 0, 0, $7, mp, 1 > "/dev/stderr"}END{for(rd in keep){n=split(keep[rd], a, " "); print 0, a[1], a[2], 0, 0, a[1], a[2], 1}}'
}

export -f doit
@@ -365,8 +382,16 @@ if [ "$first_stage" == "hic" ]; then
# awk '{print NR}END{print NR+1}' $chrom_sizes | parallel --will-cite "rm out.{}.txt err.{}.txt"
# rm temp.log

touch inter.txt inter_hists.m inter_30.txt inter_hists_30.m

touch inter.txt inter_hists.m
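# optional shortcut: if per-experiment inter.hic files were supplied, combine their
# statistics with merge-stats.jar instead of recomputing them from the merged reads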
if [ ! -z ${shortcut_stats_1} ]; then
tmpstr=""
IFS=',' read -ra STATS <<< "$shortcut_stats_1"
for i in "${STATS[@]}"; do
[ -s $i ] && tmpstr=$tmpstr" "$i || { echo "One or more of the files listed for shortcutting stats calculation does not exist at expected location or is empty. Continuing without the stats!"; tmpstr=""; break; }
done
[ "$tmpstr" != "" ] && java -jar ${juicer_dir}/scripts/merge-stats.jar inter ${tmpstr:1}
fi

if [[ $threadsHic -gt 1 ]] && [[ ! -s merged1_index.txt ]]
then
"${juicer_dir}"/scripts/index_by_chr.awk merged1.txt 500000 > merged1_index.txt
@@ -376,9 +401,19 @@ if [ "$first_stage" == "hic" ]; then
threadHicString=""
fi

touch inter_30.txt inter_30_hists.m
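# same shortcut as above, for the MAPQ>=30 statistics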
if [ ! -z ${shortcut_stats_30} ]; then
tmpstr=""
IFS=',' read -ra STATS <<< "$shortcut_stats_30"
for i in "${STATS[@]}"; do
[ -s $i ] && tmpstr=$tmpstr" "$i || { echo "One or more of the files listed for shortcutting stats calculation does not exist at expected location or is empty. Continuing without the stats!"; tmpstr=""; break; }
done
[ "$tmpstr" != "" ] && java -jar ${juicer_dir}/scripts/merge-stats.jar inter_30 ${tmpstr:1}
fi

if [[ $threadsHic -gt 1 ]] && [[ ! -s merged30_index.txt ]]
then
"${juicer_dir}"/scripts/common/index_by_chr.awk merged30.txt 500000 > merged30_index.txt
"${juicer_dir}"/scripts/index_by_chr.awk merged30.txt 500000 > merged30_index.txt
tempdirPre30="HIC30_tmp" && mkdir "${tempdirPre30}"
threadHic30String="--threads $threadsHic -i merged30_index.txt -t ${tempdirPre30}"
else
Expand All @@ -396,7 +431,7 @@ if [ "$first_stage" == "hic" ]; then
## TODO: check for failure

"${juicer_dir}"/scripts/juicer_tools pre -n -s inter_30.txt -g inter_30_hists.m -q 30 "$resolutionsToBuildString" "$threadHic30String" merged30.txt inter_30.hic "$chrom_sizes"
"${juicer_dir}"/scripts/juicer_tools addNorm --threads $threadsHic "${outputDir}"/inter_30.hic
"${juicer_dir}"/scripts/juicer_tools addNorm --threads $threadsHic inter_30.hic
rm -Rf "${tempdirPre30}"
## TODO: check for failure

Expand Down
Loading