progenome-classifier

#!/usr/bin/env python

from __future__ import division
import os
import errno
import sys
import argparse
import shlex
import shutil
import time
import subprocess
import glob
import multiprocessing
import tempfile
from collections import Counter


# Directory that contains this script (symlinks resolved), with a trailing "/".
# All bundled resources are addressed relative to it.
path_DB = os.path.realpath(__file__)
path_array = path_DB.split("/")
relative_path = "/".join(path_array[:-1]) + "/"

# The specI_DB directory must exist next to the script; the error message
# below points the user to setup.py when it is missing.
if not os.path.isdir("%sspecI_DB" % relative_path):
    sys.stderr.write("[E::main] Error: database has not been downloaded. Run setup.py before using the progenomes classifier\n\n")
    sys.exit(1)


# ------------------------------------------------------------------------------
#       print the help information
# ------------------------------------------------------------------------------
class CapitalisedHelpFormatter(argparse.HelpFormatter):
    """Help formatter whose usage section has an empty prefix by default.

    When no prefix is passed, an empty string is forwarded to
    argparse.HelpFormatter.add_usage so that the usage text is emitted as given.
    An explicitly passed prefix is forwarded unchanged.
    """

    def add_usage(self, usage, actions, groups, prefix=None):
        # None means "caller did not choose a prefix": use an empty one
        effective_prefix = '' if prefix is None else prefix
        parent = super(CapitalisedHelpFormatter, self)
        return parent.add_usage(usage, actions, groups, effective_prefix)


def msg(name=None):
    """Return the usage/help text of the program.

    The `name` argument is accepted but not used. The returned text starts
    with a newline followed by a NUL character line, and ends with a line made
    of eight spaces.
    """
    usage_lines = [
        '',
        '\00',
        'Program: progenome-classifier - a tool for genome taxonomy annotation',
        '',
        'Usage: progenome-classifier <fasta_file> [options]',
        '',
        'Options:',
        '  -t  INT     number of threads [1]',
        '  -v  INT     verbose level: 1=error, 2=warning, 3=message, 4+=debugging [3]',
        '  -o  FILE    output file name [stdout]',
        '  -m  FILE    save the 10 marker genes [None]',
        '  -M  FILE    save the 10 marker genes and exit [None]',
        '  -s          short output format (one-line output)',
        '  -H          print the result in HTML format (used on the progenomes website)',
        '        ',
    ]
    return '\n'.join(usage_lines)

# ------------------------------------------------------------------------------
# function to check if a specific tool exists
def is_tool(name):
    """Return whether the executable `name` can be started.

    The program is launched without arguments, with stdout and stderr
    discarded, and is waited for.

    Args:
        name: name (or path) of the executable to try.

    Returns:
        False when the operating system reports that the executable does not
        exist (ENOENT); True otherwise. Note that any other OSError raised
        while launching is also reported as True.
    """
    try:
        # devnull must be opened for writing because it receives the child's
        # output; the context manager closes the handle again
        with open(os.devnull, "w") as devnull:
            subprocess.Popen([name], stdout=devnull, stderr=devnull).communicate()
    except OSError as e:
        if e.errno == errno.ENOENT:
            return False
    return True


# ------------------------------------------------------------------------------
# MAIN
# ------------------------------------------------------------------------------
def main(argv=None):
    """Classify one FASTA file against the specI database and print the result.

    Steps: parse the command line, check that the external tools (cdbyank,
    prodigal, hmmsearch, vsearch) can be started, extract the marker genes
    with makePaddedMG.py, map them to the specI database with vsearch, derive
    the agreeing specI cluster(s) and their taxonomy, and write the report
    (plain text, one-line, or HTML) to stdout or to the file given with -o.

    Args:
        argv: list of command line arguments without the program name.
            When None, argparse reads sys.argv[1:].

    Returns:
        0 on success.

    Raises:
        SystemExit: with status 1 after a reported error, and with status 0
            once the marker genes have been saved when -M is used.
    """
    parser = argparse.ArgumentParser(usage=msg(), formatter_class=CapitalisedHelpFormatter,add_help=False)
    parser.add_argument('fasta_file', action="store", default=None, help='fasta file')
    parser.add_argument('-v', action='store', type=int, default=None, dest='verbose', help='Verbose levels')
    parser.add_argument('-t', type=int, action="store", dest='threads', default=None, help='Number of threads to be used.')
    parser.add_argument('-o', action="store", dest='output', default=None, help='name of output file')
    parser.add_argument('-m', action="store", dest='save_10_MGs', default=None, help='save the 10 marker genes')
    parser.add_argument('-M', action="store", dest='save_10_MGs_only', default=None, help='save the 10 marker genes, and do not run vsearch')
    parser.add_argument('-s', action='store_true', default=None, dest='short_output', help='print the output in only one line')
    parser.add_argument('-H', action='store_true', default=None, dest='print_html', help='print the output in html format')
    # honour an explicitly passed argv; parse_args(None) reads sys.argv[1:]
    args = parser.parse_args(argv)

    # set the defaults
    if args.verbose is None: args.verbose = 3
    if args.output is None: args.output = ""
    if args.threads is None: args.threads = 1
    if args.save_10_MGs is None: args.save_10_MGs = ""
    if args.save_10_MGs_only is None: args.save_10_MGs_only = ""
    if args.short_output is None: args.short_output = False
    if args.print_html is None: args.print_html = False

    # name of the fasta file (without directories), used in messages and output
    fasta_name = args.fasta_file.split("/")[-1]

    # ---------------------- check general parameters --------------------------
    if args.verbose < 1:
        sys.stderr.write("[E::main] Error: verbose level (-v) is less than 1\n")
        sys.exit(1)
    if args.threads < 1:
        sys.stderr.write("[E::main] Error: number of threads (-t) is less than 1\n")
        sys.exit(1)

    # COGs that we evaluate (only the keys are used)
    marker_genes = {"COG0012":0, "COG0049":1, "COG0052":2, "COG0048":3, "COG0016":4,
                    "COG0018":5, "COG0080":6, "COG0088":7, "COG0081":8, "COG0087":9,
                    "COG0090":10, "COG0085":11, "COG0091":12, "COG0092":13, "COG0093":14,
                    "COG0094":15, "COG0096":16, "COG0097":17, "COG0098":18, "COG0099":19,
                    "COG0100":20, "COG0102":21, "COG0103":22, "COG0124":23, "COG0172":24,
                    "COG0184":25, "COG0185":26, "COG0186":27, "COG0197":28, "COG0200":29,
                    "COG0201":30, "COG0202":31, "COG0215":32, "COG0256":33, "COG0522":34,
                    "COG0495":35, "COG0533":36, "COG0525":37, "COG0552":38, "COG0541":39}

    # --------------------------------------------------------------------------
    #   0. CHECK DEPENDENCIES
    # --------------------------------------------------------------------------
    # (executable to try, name shown in the message, package to install)
    dependencies = (("cdbyank", "cdbtools", "cdbtools"),
                    ("prodigal", "prodigal", "prodigal"),
                    ("hmmsearch", "hmmsearch", "HMMER"),
                    ("vsearch", "vsearch", "vsearch"))
    for executable, shown_name, package in dependencies:
        if not is_tool(executable):
            sys.stderr.write("[E::main] Error: "+shown_name+" is not in the path. Please install "+package+".\n")
            sys.exit(1)

    # --------------------------------------------------------------------------
    #   0. CHECK FILE
    # --------------------------------------------------------------------------
    # Only the read is guarded: the format check happens outside the try so
    # that its sys.exit() is not mistaken for a failure to open the file.
    try:
        with open(args.fasta_file,"r") as fasta_handle:
            first_line = fasta_handle.readline()
    except (IOError, OSError, UnicodeError):
        sys.stderr.write("[E::main] Error: Cannot open file: "+fasta_name+"\n")
        sys.exit(1)
    if not first_line.startswith(">"):
        sys.stderr.write("[E::main] Error. Not a fasta file: "+fasta_name+"\n")
        sys.stderr.write("          Fasta file is expected to start with '>'\n")
        sys.exit(1)

    # --------------------------------------------------------------------------
    #   1. EXTRACT GENES
    # --------------------------------------------------------------------------
    if args.verbose > 2: sys.stderr.write("Extract genes...")
    start = time.time()

    # number of sequences found for each marker gene
    founded_genes = dict.fromkeys(marker_genes, 0)

    # temp dir for the intermediate results; removed even when extraction fails
    genes = tempfile.mkdtemp()
    try:
        # argument list (no shell-style splitting) so that paths containing
        # spaces or quotes are passed through unchanged
        find_genesCMD = ["python", relative_path+"makePaddedMG.py", args.fasta_file,
                         genes+"/tmp1", "tmp2", "--padLength", "0",
                         "--numThreads", str(args.threads)]
        if args.verbose > 3:
            sys.stderr.write("call: "+" ".join(find_genesCMD)+"\n")
        if _run_command(find_genesCMD, args.verbose > 3):
            sys.stderr.write("[E::main] Error: extracting genes failed.\n")
            sys.exit(1)

        # collect all the marker genes in one fasta file; every sequence is
        # renamed to <COG>_<n> so the COG can be read back from the vsearch result
        genes_temp_file = tempfile.NamedTemporaryFile(delete=False, mode="w")
        os.chmod(genes_temp_file.name, 0o644)
        if args.verbose > 3:
            sys.stderr.write("temp file with genes: "+genes_temp_file.name+"\n")

        for cog in marker_genes:
            with open(genes+"/tmp1/tmp2_markerGenes/"+cog+".fna","r") as file_g:
                for line in file_g:
                    if line.startswith(">"):
                        founded_genes[cog] = founded_genes[cog] + 1
                        genes_temp_file.write(">"+cog+"_"+str(founded_genes[cog])+"\n")
                    else:
                        genes_temp_file.write(line)

        # close the temp file (it is not deleted on close)
        genes_temp_file.flush()
        os.fsync(genes_temp_file.fileno())
        genes_temp_file.close()
    finally:
        shutil.rmtree(genes, ignore_errors=True)

    if args.verbose > 4:
        sys.stderr.write("\nfounded_genes:\n")
        for cog in founded_genes:
            sys.stderr.write(str(cog)+": "+str(founded_genes[cog])+"\n")

    _report_elapsed(start, args.verbose)

    # check if we only want to extract the genes and we do not care about vsearch
    if args.save_10_MGs_only != "":
        try:
            shutil.move(genes_temp_file.name, args.save_10_MGs_only) # not atomic across filesystems
        except (IOError, OSError, shutil.Error):
            sys.stderr.write("[E::main] Error: failed to save the marker genes sequences\n")
            _remove_quietly(genes_temp_file.name)
            sys.exit(1)
        sys.exit(0) # correct and exit

    # how many marker genes have multiple copies
    n_MG_multiple_copies = sum(1 for n_copies in founded_genes.values() if n_copies > 1)
    n_detected_genes = sum(founded_genes.values())

    # values used by the report; defined here so that they also exist when
    # no gene is detected or no gene maps to the database
    matches = dict()              # query gene -> matched database sequence id
    all_matched_perc_id = dict()  # query gene -> percentage identity (as text)
    list_n_max = list()           # specI cluster(s) supported by most genes
    n_agreeing_genes = "-"
    perc_agreeing_genes = "-"

    if n_detected_genes != 0:
        # ----------------------------------------------------------------------
        #   2. MAP GENES TO DATABASE
        # ----------------------------------------------------------------------
        if args.verbose > 2: sys.stderr.write("Map genes to specI database...")
        start = time.time()

        # load COG cutoffs
        if args.verbose > 4: sys.stderr.write("  load cutoffs\n")
        all_cutoffs = dict()
        with open(relative_path+"lib/cutoffs_specI_40MGs_new.txt","r") as cutoff_f:
            for line in cutoff_f:
                all_v = line.rstrip().split("\t")
                all_cutoffs[all_v[0]] = float(all_v[1])

        # only the name is needed: vsearch writes the file itself
        vsearch_res = tempfile.NamedTemporaryFile(delete=False, mode="w")
        vsearch_res.close()
        os.chmod(vsearch_res.name, 0o644)
        if args.verbose > 3:
            sys.stderr.write("temp file with result from vsearch: "+vsearch_res.name+"\n")

        map_genesCMD = ["vsearch", "--usearch_global", genes_temp_file.name,
                        "--db", relative_path+"specI_DB/progenomes2_40MGs.fasta.gz",
                        "--id", "0.94", "--blast6out", vsearch_res.name,
                        "--minqt", "0.7", "--maxrejects", "10000",
                        "--threads", str(args.threads)]
        if _run_command(map_genesCMD, args.verbose > 3):
            sys.stderr.write("Error: vsearch failed\n")
            _remove_quietly(vsearch_res.name)
            _remove_quietly(genes_temp_file.name)
            sys.exit(1)

        # parse result from vsearch: keep the hits above the COG-specific cutoff
        with open(vsearch_res.name,"r") as tp:
            for line in tp:
                vals = line.rstrip().split("\t")
                all_matched_perc_id[vals[0]] = vals[2]
                i_cog_type = vals[0].split("_")[0]
                if float(vals[2]) > all_cutoffs[i_cog_type]:
                    matches[vals[0]] = vals[1]

        _report_elapsed(start, args.verbose)

        # remove file with the results from vsearch
        _remove_quietly(vsearch_res.name)

        # save (-m) or delete the genes file
        _dispose_genes_file(genes_temp_file.name, args.save_10_MGs)

        # ----------------------------------------------------------------------
        #   3. FIND CONGRUENCES BETWEEN THE SPECIs
        # ----------------------------------------------------------------------
        if args.verbose > 2: sys.stderr.write("Find assignemnt from genes...")
        start = time.time()

        if len(matches) == 0:
            best = "No match"
        else:
            # the specI id is the second to last '#'-separated field of the hit
            all_tax = Counter(matches[query].split("#")[-2] for query in matches)
            max_agree = max(all_tax.values())
            list_n_max = [speci for speci in all_tax if all_tax[speci] == max_agree]
            if len(list_n_max) > 1:
                best = "Inconsistent"
            else:
                best = "Consistent"
                perc_agreeing_genes = str( ( max_agree/(sum(all_tax.values())) ) * 100 )+"%"
                n_agreeing_genes = str(max_agree)

        _report_elapsed(start, args.verbose)
    else:
        # no genes detected: no need to run vsearch and map to specI
        best = "No genes"
        _dispose_genes_file(genes_temp_file.name, args.save_10_MGs)

    # ARRIVED HERE: we know what type of match it is
    n_mapped_genes = len(matches)

    # if it is consistent or inconsistent we find the taxonomy
    tax_levels_class = ["-","-","-","-","-","-","-","-"]
    if best != "No genes" and best != "No match":
        tax = dict()
        with open(relative_path+"specI_DB/specI_taxonomy.tsv","r") as f_ref_tax:
            f_ref_tax.readline() # remove header
            for line in f_ref_tax:
                all_v = line.rstrip().split("\t")
                # drop the first space-separated token of every level (NCBI id)
                tax[all_v[0]] = [" ".join(level.split(" ")[1:]) for level in all_v[1:8]]

        # taxonomy agreement between the best specI cluster(s); sorted so that
        # the output does not depend on set iteration order
        for level in range(7):
            possible_vals = set(tax[speci][level] for speci in list_n_max)
            tax_levels_class[level] = "/".join(sorted(possible_vals))
        # merge the specI
        tax_levels_class[7] = "/".join(sorted(set(list_n_max)))

    # Find the accuracy of the assignment
    accuracy_v = "-"
    if best == "Consistent":
        # best effort: a problem with the confidence table must not prevent
        # the classification from being reported
        try:
            accu = dict()
            with open(relative_path+"specI_DB/confidence_values.tsv","r") as f_accu:
                f_accu.readline() # remove header
                for line in f_accu:
                    all_v = line.rstrip().split("\t")
                    accu[str(all_v[0])+"-"+str(all_v[1])] = all_v[2]+" %"
            accuracy_v = accu.get(str(n_agreeing_genes)+"-"+str(n_mapped_genes), "-")
        except Exception:
            sys.stderr.write("Error while calculating accuracy.\n")
            accuracy_v = "-"

    # SET UP THE PRINTING ######################################################
    if args.output != "":
        outfile = tempfile.NamedTemporaryFile(delete=False, mode="w")
        os.chmod(outfile.name, 0o644)
    else:
        outfile = sys.stdout

    counts = (n_detected_genes, n_MG_multiple_copies, n_mapped_genes,
              n_agreeing_genes, perc_agreeing_genes)
    if args.print_html:
        _write_html_report(outfile, tax_levels_class, best, accuracy_v, counts,
                           matches, all_matched_perc_id)
    elif args.short_output:
        _write_short_report(outfile, fasta_name, tax_levels_class, best, accuracy_v,
                            counts, matches, all_matched_perc_id)
    else:
        _write_plain_report(outfile, fasta_name, tax_levels_class, best, accuracy_v,
                            counts, matches, all_matched_perc_id)

    # CLOSE UP THE PRINTING ####################################################
    if args.output != "":
        try:
            outfile.flush()
            os.fsync(outfile.fileno())
            outfile.close()
        except (IOError, OSError, ValueError):
            sys.stderr.write("[E::main] Error: failed to save the output file\n")
            sys.exit(1)
        try:
            shutil.move(outfile.name,args.output) # not atomic across filesystems
        except (IOError, OSError, shutil.Error):
            sys.stderr.write("[E::main] Error: failed to save the output file\n")
            sys.stderr.write("[E::main] you can find the file here:\n"+outfile.name+"\n")
            sys.exit(1)

    return 0        # success


# names of the eight entries of tax_levels_class, in order
_TAX_LEVEL_NAMES = ("kingdom", "phylum", "class", "order", "family", "genus", "species", "specI")

# labels of the five "classification information" values, in order
_COUNT_LABELS = ("Number of detected genes",
                 "Unique marker genes found in multiple copies [chimera?]",
                 "Number of mapped genes",
                 "Number of agreeing genes",
                 "Percentage of agreeing genes")


def _run_command(command, show_output):
    """Run `command` (an argument list), wait for it and return its exit status.

    When `show_output` is false, stdout and stderr of the child are discarded.
    """
    if show_output:
        return subprocess.call(command)
    with open(os.devnull, "w") as sink:
        return subprocess.call(command, stdout=sink, stderr=sink)


def _report_elapsed(start, verbose):
    """Write the seconds elapsed since `start` to stderr when verbose > 2."""
    if verbose > 2:
        sys.stderr.write(str( float("{0:.2f}".format(time.time() - start)) )+" seconds\n")


def _remove_quietly(path):
    """Delete the file `path`, ignoring the error if it cannot be removed."""
    try:
        os.remove(path)
    except OSError:
        # clean-up only: a file that is already gone is not worth a crash
        pass


def _dispose_genes_file(genes_path, destination):
    """Move the temporary genes file to `destination`, or delete it.

    The file is deleted when `destination` is empty or when the move fails
    (a failed move is reported on stderr).
    """
    if destination != "":
        try:
            shutil.move(genes_path, destination) # not atomic across filesystems
            return
        except (IOError, OSError, shutil.Error):
            sys.stderr.write("[E::main] Error: failed to save the marker genes sequences\n")
    _remove_quietly(genes_path)


def _write_html_report(outfile, tax_levels_class, best, accuracy_v, counts,
                       matches, all_matched_perc_id):
    """Write the result to `outfile` as an HTML fragment."""
    outfile.write("<h2>Result</h2>\n")
    for level_name, level_value in zip(_TAX_LEVEL_NAMES[:7], tax_levels_class[:7]):
        outfile.write("<strong>"+level_name+": </strong><em>"+level_value+"</em><br>\n")
    outfile.write("<strong>specI: </strong>")
    if tax_levels_class[7] != "-":
        link_to_specI = ['<a href="http://progenomes.embl.de/index.cgi?search='+speci+'">'+speci+'</a>'
                         for speci in tax_levels_class[7].split("/")]
        outfile.write( ("/".join(link_to_specI)) + "<br>\n")
    else:
        outfile.write("-<br>\n")

    outfile.write("<br>\n")

    outfile.write("<strong>Annotation:</strong> "+str(best)+"<br>\n")
    outfile.write("<strong>Confidence:</strong> "+str(accuracy_v)+"<br>\n")
    outfile.write("<br>\n")

    outfile.write("<h3>Classification information</h3>\n")
    for label, value in zip(_COUNT_LABELS, counts):
        outfile.write("<strong>"+label+":</strong> "+str(value)+"<br>\n")
    outfile.write("<br>\n")

    outfile.write("<h3>Single genes classification</h3>\n")
    outfile.write("<table>\n")
    outfile.write(" <tr>\n")
    outfile.write("   <th style=\"text-align:center\">Gene from fasta file</th>\n")
    outfile.write("   <th style=\"text-align:center\">SpecI gene</th>\n")
    outfile.write("   <th style=\"text-align:center\">Percentage identity</th>\n")
    outfile.write(" </tr>\n")
    for query in matches:
        outfile.write(" <tr>\n")
        outfile.write("   <td style=\"text-align:center\">"+query+"</td>\n")
        outfile.write("   <td style=\"text-align:center\">"+matches[query]+"</td>\n")
        outfile.write("   <td style=\"text-align:center\">"+str(all_matched_perc_id[query])+"</td>\n")
        outfile.write(" </tr>\n")
    outfile.write("</table>\n")


def _write_plain_report(outfile, fasta_name, tax_levels_class, best, accuracy_v,
                        counts, matches, all_matched_perc_id):
    """Write the result to `outfile` in the multi-line plain text format."""
    outfile.write("RESULT\n")
    outfile.write("fasta sequence: "+fasta_name+"\n\n")

    for level_name, level_value in zip(_TAX_LEVEL_NAMES, tax_levels_class):
        outfile.write(level_name+": "+level_value+"\n")
    outfile.write("\n")

    outfile.write("Annotation: "+best+"\n")
    outfile.write("Confidence: "+accuracy_v+"\n\n")

    outfile.write("Classification information\n")
    for label, value in zip(_COUNT_LABELS, counts):
        outfile.write(label+": "+str(value)+"\n")
    outfile.write("\n")

    outfile.write("Single genes classification\n")
    for query in matches:
        outfile.write(query+"\t"+matches[query]+"\t"+str(all_matched_perc_id[query])+"\n")


def _write_short_report(outfile, fasta_name, tax_levels_class, best, accuracy_v,
                        counts, matches, all_matched_perc_id):
    """Write the result to `outfile` as one tab-separated line.

    Every mapped gene is appended as its own field in the form
    query#match#identity.
    """
    fields = [fasta_name] + list(tax_levels_class) + [best, accuracy_v]
    fields.extend(str(value) for value in counts)
    fields.extend(query+"#"+matches[query]+"#"+str(all_matched_perc_id[query])
                  for query in matches)
    outfile.write("\t".join(fields)+"\n")


#-------------------------------- run main -------------------------------------
if __name__ == '__main__':
    status = main()
    sys.exit(status)