prepare_bacterial_reference

#!/bin/bash 

# Abort on the first failing command, and let a failure in any stage of a
# pipeline fail the whole pipeline.
set -eo pipefail

# ANSI colour escape sequences used to highlight parts of the usage text.
GRN='\033[1;32m'
GRN2='\033[0;32m'
RED='\033[1;31m'
NC='\033[0m' # No Color

#######################################
# Print version, synopsis and usage information.
# Globals:   GRN, GRN2, RED, NC (read)
# Arguments: none
# Outputs:   writes the help text to stdout
#######################################
print_usage() {
  echo
  echo "Version:"
  echo "  v0.8.0, Alexander Predeus (predeus@gmail.com), 2020"
  echo "Synopsis:"
  echo "  One-command bacterial RNA-Seq reference preparation."
  echo
  # The colour variables hold backslash escapes, so they have to be part of
  # the printf format string to be interpreted.
  printf "prepare multi-reference for all ${RED}study${NC} and ${GRN}reference${NC} strains listed in the config file, or\n"
  printf "prepare simple reference using reference genome fasta and GFF annotation.\n"
  echo
  echo "=========================================================================================="
  echo
  printf "Usage: ${GRN}prepare_bacterial_reference ${GRN2}--multi  <working_directory> <config> [-p CPUs] [-r <fasta.fa>]${NC}\n"
  echo   "         - to use reference ncRNA/extra CDS fasta file (see below), or"
  printf "       ${GRN}prepare_bacterial_reference ${GRN2}--multi  <working_directory> <config> [-p CPUs]${NC}\n"
  echo   "         - to use Prokka to predict non-coding RNAs with Rfam, or "
  printf "       ${GRN}prepare_bacterial_reference ${GRN2}--simple <working_directory>  <tag>   [-p CPUs]${NC}\n"
  echo   "         - to use simple workflow with one reference genome FASTA and GFF annotation."
  echo
  echo "Positional arguments:"
  echo "  <working_dir>  Directory containing sub-directories named fastqs, study_strains, and ref_strains"
  echo "  <config>       (multi-strain only) Tab-separated file listing each sample and strain tag,"
  echo "                 as well as reference strain tags"
  echo "  <tag>          (single-strain only) Strain tag for the study strain"
  echo
  echo "Options:"
  echo "  -p [X]         Number of cores for parallel execution (default '4')"
  echo "  -r [X]         (multi-strain only) Additional reference nucleotide sequences to be added to the final annotation via Blastn"
  echo "                 If -r prokka is selected, ncRNAs would be predicted using Rfam functionality in Prokka"
  echo "                 If an external fasta file is provided, each sequence should be named using ><name>.<type> convention"
  echo "                 Types can be \"CDS\", \"ncRNA\", or \"misc\" - e.g. >RygC.ncRNA, >trpL.CDS, or >fdnG.misc"
  echo
  echo
}

# Every valid invocation carries a mode flag plus two positional arguments,
# so anything shorter gets the help text and a non-zero exit status.
# The comparison has to be arithmetic: inside [[ ]] the "<" operator compares
# strings, so an argument count of 10 or more would sort before "3" and
# wrongly trigger the usage message.
if (( $# < 3 ))
then
  print_usage
  exit 1
fi

# Settings holders. All start out empty; the option loop below fills in
# CPUS, REF and PTYPE, the rest are assigned later in the script.
WDIR="" CONFIG="" TAG=""
CPUS="" REF="" NEWREF="" PTYPE=""

# The script insists that "getopt --test" exits with status 4 (the way
# enhanced getopt identifies itself). The status is captured through an
# "||" list so that the expected non-zero exit does not trip "set -e".
getopt_status=0
getopt --test > /dev/null || getopt_status=$?
if (( getopt_status != 4 )); then
    echo "ERROR: \"getopt --test\" failed in this environment" >&2
    echo "Please make sure you have the most up-to-date version of getopt!" >&2
    exit 1
fi

# Let getopt normalise the command line: short options -p and -r take a
# value, long options --simple and --multi are plain switches.
# getopt prints its own diagnostics, so on failure we only exit.
PARSED=$(getopt --options=p:r: --longoptions=simple,multi --name "$0" -- "$@") || exit 2

# Re-install getopt's (shell-quoted) output as the positional parameters.
eval set -- "$PARSED"

# Consume options until the "--" separator; whatever follows it stays in
# "$@" as the positional arguments.
while :; do
    opt=$1
    case "$opt" in
        --simple|--multi)
            # workflow type is the flag name without the leading dashes
            PTYPE=${opt#--}
            shift
            ;;
        -p)
            CPUS=$2
            shift 2
            ;;
        -r)
            REF=$2
            shift 2
            ;;
        --)
            shift
            break
            ;;
        *)
            # getopt only emits the options declared above, so getting here
            # means the option string and this case statement are out of sync
            echo "Programming error"
            exit 3
            ;;
    esac
done

###################################################################
#### some sanity checks now 
###################################################################

# Pick up the two positional arguments according to the selected workflow
# and echo the resulting settings back to the user. Without --simple or
# --multi there is nothing sensible to do, so bail out.
if [[ $PTYPE == "multi" ]]
then
  WDIR=$1
  CONFIG=$2
  echo "==> Initiating bacpipe reference preparation for MULTI-STRAIN workflow!"
  echo "==> Following variables were set:"
  echo
  echo "              WDIR: $WDIR"
  echo "            CONFIG: $CONFIG"
  echo "              CPUS: $CPUS"
  echo " ncRNA/smCDS FASTA: $REF"
  echo
elif [[ $PTYPE == "simple" ]]
then
  WDIR=$1
  TAG=$2
  echo "==> Initiating bacpipe reference preparation for SIMPLE SINGLE-STRAIN workflow!"
  echo "==> Following variables were set:"
  echo
  echo "              WDIR: $WDIR"
  echo "               TAG: $TAG"
  echo "              CPUS: $CPUS"
  echo
else
  >&2 echo "ERROR: please set either --simple or --multi option!"
  exit 1
fi

# The working directory must be given and must already exist.
if [[ -z $WDIR || ! -d $WDIR ]]
then
  >&2 echo "ERROR: you must identify an existing working directory!"
  exit 1
fi

# The simple workflow needs a tag plus non-empty <TAG>.fa and <TAG>.gff
# files under study_strains/.
if [[ $PTYPE == "simple" ]] && [[ -z $TAG || ! -s $WDIR/study_strains/$TAG.fa || ! -s $WDIR/study_strains/$TAG.gff ]]
then
  >&2 echo "ERROR: you must specify the strain TAG and provide genome  <TAG>.fa and annotation <TAG>.gff in /study_strains sub-directory!"
  exit 1
fi

# Canonicalise the working directory so that later "cd" calls cannot
# invalidate it. The argument is quoted so that a path containing spaces
# is not split into several words.
WDIR=$(readlink -f -- "$WDIR")

if [[ -z $CONFIG && -z $TAG ]]
then
  >&2 echo "ERROR: please provide config file (for --multi) or strain tag (for --simple)!"
  exit 1
fi

# Default to 4 cores; reject anything that is not a positive integer here
# instead of handing it on to the downstream tools.
if [[ -z $CPUS ]]
then
  echo "==> Parallel jobs will be ran on 4 cores (default)."
  CPUS=4
elif [[ ! $CPUS =~ ^[1-9][0-9]*$ ]]
then
  >&2 echo "ERROR: number of cores (-p) must be a positive integer, got \"$CPUS\"!"
  exit 1
else
  echo "==> Parallel jobs will be ran on $CPUS cores."
fi

# Extra reference sequences are only used by the multi-strain workflow.
if [[ -n $REF && $PTYPE == "simple" ]]
then
  >&2 echo "ERROR: reference option can only be set for --multi workflow!"
  exit 1
fi

# Absolute path of the directory holding this script; the helper scripts
# are invoked from its script/ sub-directory.
SDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"


###################################################################################
###                                                                             ###  
###             arguments are set and checked, let's make ref files!            ###
###                                                                             ###  
###################################################################################

# Build the reference files for the selected workflow.
#   multi : per-strain files for every study/reference strain, a Roary
#           pan-genome run over all of them, and an ortholog table
#   simple: files for the single study strain plus an auto-generated
#           simple.cfg listing the samples found in fastqs/
# Reads WDIR, SDIR, CONFIG, TAG, CPUS, REF; sets NEWREF when REF is given.
if [[ $PTYPE == "multi" ]]
then
  ## case 1: make multi-strain reference using Roary
  ## config file has to be in $WDIR (only its base name is used); throw an error otherwise
  CONFIG=$(basename -- "$CONFIG")

  if [[ ! -s $WDIR/$CONFIG ]]
  then
    >&2 echo "ERROR: Non-empty config file must be provided in the working directory!"
    exit 1
  fi

  ## TODO: check if all works with Rfam workflow (names don't get extracted)

  ## Column 2 of the config is the strain tag; lines starting with "Reference"
  ## list reference strains, all other lines list study samples.
  ## grep exits non-zero when nothing matches, which under "set -eo pipefail"
  ## would abort the script without any message - hence the "|| true" here
  ## and the explicit emptiness checks right below.
  STUDY=$(grep -v "^Reference" "$WDIR/$CONFIG" | cut -f 2 | sort | uniq) || true
  REFSTR=$(grep   "^Reference" "$WDIR/$CONFIG" | cut -f 2 | sort | uniq) || true

  if [[ -z $STUDY ]]
  then
    >&2 echo "ERROR: no study strains found in config file $WDIR/$CONFIG!"
    exit 1
  fi

  if [[ -z $REFSTR ]]
  then
    >&2 echo "ERROR: no reference strains found in config file $WDIR/$CONFIG!"
    exit 1
  fi

  echo -e "Following study strains will be processed:\n\n$STUDY\n"
  echo -e "Following reference strains will be processed:\n\n$REFSTR\n"


  ## prepare individual strain references for each study strain
  ## you have to have a genome file and prophage file in place

  if [[ ! -d $WDIR/roary ]]
  then
    echo "==> $WDIR/roary/ not found and will be created."
    mkdir "$WDIR/roary"
  elif [[ -d $WDIR/roary/output ]]
  then
    echo "==> Found directory $WDIR/roary/; $WDIR/roary/output will be erased and re-created."
    ## ${WDIR:?} aborts instead of deleting /roary/output should WDIR ever be empty
    rm -rf -- "${WDIR:?}/roary/output"
  else
    echo "==> Found directory $WDIR/roary/; $WDIR/roary/output will be created."
  fi

  if [[ -n $REF ]]
  then
    ## this bit is to check for frameshift alignments of external ref CDS vs study genomes
    ## if at least one study genome generates a frameshift, CDS in the ref becomes pseudogene
    cd "$WDIR"
    REFNAME=$(basename -- "$REF")
    NEWREF=$WDIR/${REFNAME%%.fa*}.check.fa
    "$SDIR/script/check_extra_ref.sh" "$SDIR" "$WDIR" "$REF" "$NEWREF"
    ## we are going to use the modified ref fasta from now on
  fi

  cd "$WDIR/study_strains"

  ## $STUDY / $REFSTR are deliberately left unquoted in the loops below:
  ## word splitting turns the newline-separated tag list into loop items.
  ## ${NEWREF:+"$NEWREF"} passes the checked reference fasta as an extra
  ## argument only when -r was given, and passes nothing at all otherwise.
  for i in $STUDY
  do
    "$SDIR/script/make_study_strain_files.sh" "$SDIR" "$WDIR" "$i" "$CPUS" ${NEWREF:+"$NEWREF"}
  done

  ## now check if that all worked out OK
  for i in $STUDY
  do
    "$SDIR/script/check_study_strain_files.sh" "$WDIR" "$i"
  done
  echo
  echo "ALL STUDY STRAIN FILES AND DIRS ARE OK!"
  echo


  cd "$WDIR/ref_strains"

  ## Roary input for a strain = its CDS lines + "##FASTA" + genome sequence
  for i in $REFSTR
  do
    "$SDIR/script/make_reference_strain_files.sh" "$SDIR" "$WDIR" "$i" ${NEWREF:+"$NEWREF"}
    if ! grep -P "\tCDS\t" "$WDIR/ref_strains/$i/$i.clean.gff" > "$WDIR/roary/1_$i.roary.gff"
    then
      >&2 echo "ERROR: could not extract CDS features from $WDIR/ref_strains/$i/$i.clean.gff!"
      exit 1
    fi
    echo "##FASTA" >> "$WDIR/roary/1_$i.roary.gff"
    cat "$WDIR/ref_strains/$i/$i.genome.fa" >> "$WDIR/roary/1_$i.roary.gff"
  done

  ## check if reference strain GTF files in refstr are in Roary-friendly format (e.g. all NCBI files were cleaned up).
  ## reqs: 1) unique IDs that are locus tags; 2) no names that are equal to ID; 3) only CDS features; 4) ##FASTA and genomic fa are present.

  for i in $REFSTR
  do
    "$SDIR/script/check_reference_strain_files.sh" "$WDIR" "$i"
  done
  echo
  echo "ALL REFERENCE STRAIN FILES ARE OK!"
  echo

  ## make study strain GFF to run roary
  for i in $STUDY
  do
    if ! grep -P "\tCDS\t" "$WDIR/study_strains/$i/$i.united.gff" > "$WDIR/roary/0_$i.roary.gff"
    then
      >&2 echo "ERROR: could not extract CDS features from $WDIR/study_strains/$i/$i.united.gff!"
      exit 1
    fi
    echo "##FASTA"                            >> "$WDIR/roary/0_$i.roary.gff"
    cat "$WDIR/study_strains/$i/$i.genome.fa" >> "$WDIR/roary/0_$i.roary.gff"
  done

  ## run Roary on all the strains present in the refstr

  cd "$WDIR/roary"
  echo "==> Running Roary on all strains, using $CPUS cores..."
  ## all Roary output goes to roary.log, so point the user there on failure
  if ! roary -i 90 -p "$CPUS" -v -f output *.gff &> roary.log
  then
    >&2 echo "ERROR: Roary failed; see $WDIR/roary/roary.log for details!"
    exit 1
  fi

  ## edit the presence-absence file. You should have dos2unix in your $PATH

  echo "==> Roary pan-genome analysis is done!"
  echo "==>  Performing Roary output reformatting; results will be written to roary/presence_absence_unix.csv"
  ## basically we replace all commas with tabs, lose all quote marks, and make sure strains are named correctly
  perl -ne 's/\r\n/\n/g; s/\"0_(.*?).roary/\"$1/g; s/\"1_(.*?).roary/\"$1/g; s/","/\t/g; s/"//g; print' output/gene_presence_absence.csv > presence_absence_unix.csv

  ## make annotated CDS and ncRNA tables
  ## we will use blast output files for all study & reference strains.

  cd "$WDIR"
  echo "==> Generating a table of CDS orthologs."
  ## the awk step keeps the header line first and sorts the rest on column 4
  "$SDIR/script/make_ortholog_table.pl" "$WDIR" roary/presence_absence_unix.csv "$CONFIG" ${NEWREF:+"$NEWREF"} \
    | awk '{if (NR==1) {print} else {print | "sort -k4,4"}}' > orthologs.tsv


  ## TODO: print stats - how many genes are common for all study strains, how many are unique, how many are seen in 2 and more strains.

  echo "==> DONE generating multi-strain reference!"
else
  ## case 2: simple prep
  cd "$WDIR/study_strains"
  "$SDIR/script/make_simple_reference_files.sh" "$SDIR" "$WDIR" "$TAG" "$CPUS"

  "$SDIR/script/check_study_strain_files.sh" "$WDIR" "$TAG"

  cd "$WDIR/fastqs"
  echo "==> Writing simple config file, simple.cfg"
  ## sample name = file name minus ".fastq.gz" and minus a trailing ".R?"
  ## read suffix; skip the literal pattern the glob leaves when nothing matches
  KK=$(
    for i in *fastq.gz
    do
      [[ -e $i ]] || continue
      TAG1=${i%%.fastq.gz}
      TAG2=${TAG1%%.R?}
      printf '%s\n' "$TAG2"
    done | sort | uniq
  )

  cd "$WDIR"
  if [[ -z $KK ]]
  then
    >&2 echo "WARNING: no *fastq.gz files found in $WDIR/fastqs; simple.cfg will be empty!"
  fi

  ## one "<sample><TAB><strain tag>" line per sample
  for i in $KK
  do
    printf '%s\t%s\n' "$i" "$TAG"
  done > simple.cfg
  echo "==> DONE generating simple single-strain reference!"
fi