Skip to content

Commit

Permalink
This commit is primary of a revert of commit: 32cd457 , which marked …
Browse files Browse the repository at this point in the history
…duplicates on all libraries as one. This new commit aligns and marks duplicates on each library separately. it is also more robust to malformed BAM headers for realignment
  • Loading branch information
cc2qe committed Jan 18, 2015
1 parent f947a16 commit b2f19e1
Show file tree
Hide file tree
Showing 6 changed files with 293 additions and 39 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ bin/bamtofastq.py
bin/lumpyToBedpe
bin/bamgroupreads.py
bin/bamfilterrg.py
bin/bamcleanheader.py

# Compiled source #
###################
Expand Down
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ scripts:
cp $(SCRIPTS_DIR)/bamheadrg.py $(TARGET_BIN)
cp $(SCRIPTS_DIR)/bamgroupreads.py $(TARGET_BIN)
cp $(SCRIPTS_DIR)/bamfilterrg.py $(TARGET_BIN)
cp $(SCRIPTS_DIR)/bamcleanheader.py $(TARGET_BIN)

clean:
rm -f \
Expand All @@ -105,6 +106,7 @@ clean:
bin/bamheadrg.py \
bin/bamgroupreads.py \
bin/bamfilterrg.py \
bin/bamcleanheader.py \
bin/cnvnator-multi \
bin/annotate_rd.py
$(MAKE) -C $(BWA_DIR) clean
Expand Down
179 changes: 142 additions & 37 deletions bin/speedseq
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ function source_binaries() {
BAMTOFASTQ=`which bamtofastq.py || true`
MBUFFER=`which mbuffer || true`
BAMHEADRG=`which bamheadrg.py || true`
BAMCLEANHEADER=`which bamcleanheader.py || true`
fi
}

Expand Down Expand Up @@ -1304,6 +1305,11 @@ global options:
sv_usage
echo -e "Error: bamheadrg.py executable not found. Please set path in $SPEEDSEQ_DIR/speedseq.config file\n"
exit 1
elif [[ -z "$BAMCLEANHEADER" ]] && [[ "$REALIGN" -eq 1 ]]
then
sv_usage
echo -e "Error: bamcleanheader.py executable not found. Please set path in $SPEEDSEQ_DIR/speedseq.config file\n"
exit 1
elif [[ -z "$MBUFFER" ]] && [[ "$REALIGN" -eq 1 ]]
then
sv_usage
Expand All @@ -1312,7 +1318,7 @@ global options:
elif [[ -z "$BAMFILTERRG" ]]
then
sv_usage
echo -e "Error: bamheadrg.py executable not found. Please set path in $SPEEDSEQ_DIR/speedseq.config file\n"
echo -e "Error: bamfilterrg.py executable not found. Please set path in $SPEEDSEQ_DIR/speedseq.config file\n"
exit 1
fi

Expand Down Expand Up @@ -1431,7 +1437,7 @@ global options:
READ_LENGTH=`$SAMBAMBA view $FULL_BAM | head -n 10000 | awk 'BEGIN { MAX_LEN=0 } { LEN=length($10); if (LEN>MAX_LEN) MAX_LEN=LEN } END { print MAX_LEN }'`

# parse the libraries in the BAM header to extract readgroups from the same library
LIB_RG_LIST=(`$SAMBAMBA view -H $FULL_BAM \
LIB_RG_LIST=(`$BAMCLEANHEADER $FULL_BAM \
| grep "^@RG" \
| awk \
'{
Expand Down Expand Up @@ -1600,7 +1606,7 @@ global options:

# LIB_RG_LIST contains an element for each library in the BAM file.
# These elements are comma delimited strings for the readgroups for each library.
LIB_RG_LIST=(`$SAMBAMBA view -H ${FULL_BAM_LIST[$i]} \
LIB_RG_LIST=(`$BAMCLEANHEADER ${FULL_BAM_LIST[$i]} \
| grep "^@RG" \
| awk \
'{
Expand All @@ -1626,6 +1632,13 @@ global options:
}
}'`)

if [[ ${#REALIGN_RG_LIST[@]} -eq 0 ]]
then
realign_usage
echo -e "Error: $IN_BAM lacks read group fields (@RG) in BAM header\n"
exit 1
fi

# generate the histo, stats, and config files
echo "Calculating insert distributions... "
for j in $( seq 0 $(( ${#LIB_RG_LIST[@]}-1 )) )
Expand Down Expand Up @@ -2071,6 +2084,11 @@ global options:
realign_usage
echo -e "Error: bamheadrg.py executable not found. Please set path in $SPEEDSEQ_DIR/speedseq.config file\n"
exit 1
elif [[ -z "$BAMCLEANHEADER" ]]
then
realign_usage
echo -e "Error: bamcleanheader.py executable not found. Please set path in $SPEEDSEQ_DIR/speedseq.config file\n"
exit 1
elif [[ -z "$PARALLEL" ]]
then
realign_usage
Expand All @@ -2082,6 +2100,44 @@ global options:
# check for required python modules (pysam, numpy, scipy, etc)
check_python_modules $PYTHON27

# parse the libraries in the BAM header to extract readgroups from the same library
if [[ -z $REALIGN_RG_LIST ]]
then
REALIGN_RG_LIST=(`$BAMCLEANHEADER $IN_BAM \
| grep "^@RG" \
| awk \
'{
for (j=1;j<=NF;++j) {
if ($j~"^ID") {
id=$j
gsub("^ID:","",id)
}
else if ($j~"^LB:") {
lib=$j
gsub("^LB:","",lib);
}
}
lib_list[lib][length(lib_list[lib])+1] = id
}
END {
for (i in lib_list) {
RG_STRING=lib_list[i][1]
for (j=2;j<=length(lib_list[i]);++j) {
RG_STRING=RG_STRING","lib_list[i][j]
}
print RG_STRING
}
}'`)
fi

echo $REALIGN_RG_LIST
if [[ ${#REALIGN_RG_LIST[@]} -eq 0 ]]
then
realign_usage
echo -e "Error: $IN_BAM lacks read group fields (@RG) in BAM header\n"
exit 1
fi

# set the output name
OUTBASE=`basename "$OUTPUT"`

Expand Down Expand Up @@ -2123,43 +2179,92 @@ global options:
fi
FQ="$TEMP_DIR/fq_pipe"

# alignment command
if [[ $VERBOSE -eq 1 ]]
then
for i in $( seq 0 $(( ${#REALIGN_RG_LIST[@]}-1 )) )
do
REALIGN_RG=${REALIGN_RG_LIST[i]}
echo " Re-aligning readgroups: $REALIGN_RG"

# alignment command
if [[ $VERBOSE -eq 1 ]]
then
echo -e "
$PYTHON27 $BAMTOFASTQ -r $REALIGN_RG $RENAME -i $IN_BAM | $MBUFFER -q -m 1G > $FQ
$BWA mem -t $THREADS -C -M -p $INS_DIST $RG_FMT $REF $FQ \\
| $PYTHON27 $BAMHEADRG -d $IN_BAM -r $REALIGN_RG \\
| $SAMBLASTER $INCLUDE_DUPS --addMateTags --maxSplitCount $MAX_SPLIT_COUNT --minNonOverlap $MIN_NON_OVERLAP --splitterFile $TEMP_DIR/spl_pipe --discordantFile $TEMP_DIR/disc_pipe \\
| $SAMBAMBA view -S -f bam -l 0 /dev/stdin \\
| $SAMBAMBA sort -t $THREADS -m $((${SORT_MEM}-2))G --tmpdir=$TEMP_DIR/full -o $TEMP_DIR/$OUTBASE.$(($i+1)).bam /dev/stdin
$SAMBAMBA view -S -f bam -l 0 $TEMP_DIR/spl_pipe \\
| $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/spl -o $TEMP_DIR/$OUTBASE.$(($i+1)).splitters.bam /dev/stdin
$SAMBAMBA view -S -f bam $TEMP_DIR/disc_pipe \\
| $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/disc -o $TEMP_DIR/$OUTBASE.$(($i+1)).discordants.bam /dev/stdin"
fi

echo -e "
$PYTHON27 $BAMTOFASTQ $RENAME -i $IN_BAM | $MBUFFER -q -m 1G > $FQ
$BWA mem -t $THREADS -C -M -p $INS_DIST $REF $FQ | \\
$PYTHON27 $BAMHEADRG -d $IN_BAM | \\
$SAMBLASTER $INCLUDE_DUPS --addMateTags --maxSplitCount $MAX_SPLIT_COUNT --minNonOverlap $MIN_NON_OVERLAP --splitterFile $TEMP_DIR/spl_pipe --discordantFile $TEMP_DIR/disc_pipe | \\
$SAMBAMBA view -S -f bam -l 0 /dev/stdin | \\
$SAMBAMBA sort -t $THREADS -m $((${SORT_MEM}-2))G --tmpdir=$TEMP_DIR/full -o $OUTPUT.bam /dev/stdin
$SAMBAMBA view -S -f bam -l 0 $TEMP_DIR/spl_pipe | \\
$SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/spl -o $OUTPUT.splitters.bam /dev/stdin
$SAMBAMBA view -S -f bam $TEMP_DIR/disc_pipe | \\
$SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/disc -o $OUTPUT.discordants.bam /dev/stdin"
fi

echo -e "
$PYTHON27 $BAMTOFASTQ $RENAME -i $IN_BAM | $MBUFFER -q -m 1G > $FQ
$BWA mem -t $THREADS -C -M -p $INS_DIST $REF $FQ | \
$PYTHON27 $BAMHEADRG -d $IN_BAM | \
$SAMBLASTER $INCLUDE_DUPS --addMateTags --maxSplitCount $MAX_SPLIT_COUNT --minNonOverlap $MIN_NON_OVERLAP --splitterFile $TEMP_DIR/spl_pipe --discordantFile $TEMP_DIR/disc_pipe | \
$SAMBAMBA view -S -f bam -l 0 /dev/stdin | \
$SAMBAMBA sort -t $THREADS -m $((${SORT_MEM}-2))G --tmpdir=$TEMP_DIR/full -o $OUTPUT.bam /dev/stdin
$SAMBAMBA view -S -f bam -l 0 $TEMP_DIR/spl_pipe | \
$SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/spl -o $OUTPUT.splitters.bam /dev/stdin
$SAMBAMBA view -S -f bam $TEMP_DIR/disc_pipe | \
$SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/disc -o $OUTPUT.discordants.bam /dev/stdin
" | $PARALLEL -j 4
$PYTHON27 $BAMTOFASTQ -r $REALIGN_RG $RENAME -i $IN_BAM | $MBUFFER -q -m 1G > $FQ
$BWA mem -t $THREADS -C -M -p $INS_DIST $RG_FMT $REF $FQ \
| $PYTHON27 $BAMHEADRG -d $IN_BAM -r $REALIGN_RG \
| $SAMBLASTER $INCLUDE_DUPS --addMateTags --maxSplitCount $MAX_SPLIT_COUNT --minNonOverlap $MIN_NON_OVERLAP --splitterFile $TEMP_DIR/spl_pipe --discordantFile $TEMP_DIR/disc_pipe \
| $SAMBAMBA view -S -f bam -l 0 /dev/stdin \
| $SAMBAMBA sort -t $THREADS -m $((${SORT_MEM}-2))G --tmpdir=$TEMP_DIR/full -o $TEMP_DIR/$OUTBASE.$(($i+1)).bam /dev/stdin
$SAMBAMBA view -S -f bam -l 0 $TEMP_DIR/spl_pipe \
| $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/spl -o $TEMP_DIR/$OUTBASE.$(($i+1)).splitters.bam /dev/stdin
$SAMBAMBA view -S -f bam $TEMP_DIR/disc_pipe \
| $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/disc -o $TEMP_DIR/$OUTBASE.$(($i+1)).discordants.bam /dev/stdin
" | $PARALLEL -j 4
done

echo "
$SAMBAMBA index $OUTPUT.bam
$SAMBAMBA index $OUTPUT.discordants.bam
$SAMBAMBA index $OUTPUT.splitters.bam
" | $PARALLEL -j 3
# if only 1 library, then rename files and index
if [[ ${#REALIGN_RG_LIST[@]} -eq 1 ]]
then
mv $TEMP_DIR/$OUTBASE.1.bam $OUTPUT.bam
mv $TEMP_DIR/$OUTBASE.1.discordants.bam $OUTPUT.discordants.bam
mv $TEMP_DIR/$OUTBASE.1.splitters.bam $OUTPUT.splitters.bam

echo "
$SAMBAMBA index $OUTPUT.bam
$SAMBAMBA index $OUTPUT.discordants.bam
$SAMBAMBA index $OUTPUT.splitters.bam
" | $PARALLEL -j 3

else
MERGE_FULL=""
MERGE_DISCORDANTS=""
MERGE_SPLITTERS=""
for i in $( seq 0 $(( ${#REALIGN_RG_LIST[@]}-1 )) )
do
MERGE_FULL="$MERGE_FULL $TEMP_DIR/$OUTBASE.$(($i+1)).bam"
MERGE_DISCORDANTS="$MERGE_DISCORDANTS $TEMP_DIR/$OUTBASE.$(($i+1)).discordants.bam"
MERGE_SPLITTERS="$MERGE_SPLITTERS $TEMP_DIR/$OUTBASE.$(($i+1)).splitters.bam"
done

if [[ $VERBOSE -eq 1 ]]
then
echo "
$SAMBAMBA merge -t $THREADS $OUTPUT.bam $MERGE_FULL
$SAMBAMBA merge -t $THREADS $OUTPUT.discordants.bam $MERGE_DISCORDANTS
$SAMBAMBA merge -t $THREADS $OUTPUT.splitters.bam $MERGE_SPLITTERS
rm $MERGE_FULL $MERGE_DISCORDANTS $MERGE_SPLITTERS"
fi
$SAMBAMBA merge -t $THREADS $OUTPUT.bam $MERGE_FULL
$SAMBAMBA merge -t $THREADS $OUTPUT.discordants.bam $MERGE_DISCORDANTS
$SAMBAMBA merge -t $THREADS $OUTPUT.splitters.bam $MERGE_SPLITTERS
rm $MERGE_FULL $MERGE_DISCORDANTS $MERGE_SPLITTERS

# index the files
if [[ $VERBOSE -eq 1 ]]
then
echo -e "
$SAMBAMBA index $OUTPUT.bam
$SAMBAMBA index $OUTPUT.discordants.bam
$SAMBAMBA index $OUTPUT.splitters.bam"
fi
echo "
$SAMBAMBA index $OUTPUT.bam
$SAMBAMBA index $OUTPUT.discordants.bam
$SAMBAMBA index $OUTPUT.splitters.bam
" | $PARALLEL -j 3
fi

# clean up
rm -r $TEMP_DIR
Expand Down
1 change: 1 addition & 0 deletions bin/speedseq.config
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ CNVNATOR_CHROMS_DIR=/gscmnt/gc2719/halllab/genomes/human/GRCh37/hs37_ebv/chroms
BAMTOFASTQ=$SPEEDSEQ_HOME/bin/bamtofastq.py
MBUFFER=$SPEEDSEQ_HOME/bin/mbuffer
BAMHEADRG=$SPEEDSEQ_HOME/bin/bamheadrg.py
BAMCLEANHEADER=$SPEEDSEQ_HOME/bin/bamcleanheader.py

# TGI cluster
# capture the git commit hash of SpeedSeq
Expand Down
Loading

0 comments on commit b2f19e1

Please sign in to comment.