motus_genomes_download

#!/usr/bin/env python

import sys
import argparse
import os
from urllib.request import Request, urlopen, urlretrieve
import gzip
import tempfile
import shutil
import time
import hashlib
import tarfile
import shlex
import subprocess

import ssl
# NOTE(review): this swaps urllib's default HTTPS context factory for the
# unverified one, so HTTPS requests issued through urllib in this process are
# made without TLS certificate verification. Presumably added to work around a
# certificate problem on one of the download hosts -- confirm whether it is
# still needed, since it removes protection against tampered downloads.
ssl._create_default_https_context = ssl._create_unverified_context

# position of the script -------------------------------------------------------
# Resolved (symlink-free) path of this script file.
path_mOTUs = os.path.realpath(__file__)
# The same path broken into its "/"-separated components.
path_array = path_mOTUs.split("/")
# Everything up to the last "/" of the script path, with a trailing slash.
relative_path = path_mOTUs.rpartition("/")[0] + "/"
# Directory, next to the script, where downloaded genomes are stored.
result_repo = "".join([relative_path, "motus_v3_genomes"])


# ------------------------------------------------------------------------------
#       print the help information
# ------------------------------------------------------------------------------
class CapitalisedHelpFormatter(argparse.HelpFormatter):
    """Help formatter that never lets the usage prefix stay ``None``.

    A missing prefix is replaced by an empty string before delegating to
    ``argparse.HelpFormatter``, so the base class receives an explicit
    (empty) prefix instead of ``None``.
    """

    def add_usage(self, usage, actions, groups, prefix=None):
        """Forward the usage section to the base class with a non-None prefix."""
        effective_prefix = '' if prefix is None else prefix
        return super().add_usage(usage, actions, groups, effective_prefix)


def msg(name=None):
    """Return the help text shown to the user.

    The same text is passed to ``argparse`` as the custom usage string and
    printed when help is requested.

    Args:
        name: unused; kept so existing callers that pass it keep working.

    Returns:
        str: the multi-line help message.
    """
    # The second line of the literal is the octal escape for a NUL character.
    # NOTE(review): presumably a trick to keep the leading blank line when the
    # text is rendered -- confirm before removing it.
    #
    # The usage line lists only options that the argument parser in this
    # script accepts (there is no "-p" option).
    str_msg = '''
\00
Program: motus_genomes_download - Download genomes used in mOTUs 3
Usage: motus_genomes_download -m <ID> [-f]
Options:
  -m  STR     ID of the genome or mOTU to download, use "all" to download all genomes
  -f          force to re-download the data
  -l          list all motus and the number of associated genomes
  -g          list all genomes and the associated motus
  -h          print this help

Examples:
If you want to download the genome LIAN20-1_SAMN11649416_METAG_000035, you need to run:
motus_genomes_download -m LIAN20-1_SAMN11649416_METAG_000035

If you want to download all genomes in the motu ref_mOTU_v3_00006, you need to run:
motus_genomes_download -m ref_mOTU_v3_00006

If you want to download all genomes, run:
motus_genomes_download -m all
        '''
    return str_msg


# ------------------------------------------------------------------------------
# function to return the md5 of a file
# ------------------------------------------------------------------------------
def md5(fname):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is opened in binary mode and fed to the hash in 4096-byte
    pieces, so the whole content is never read in a single call.
    """
    digest = hashlib.md5()
    with open(fname, "rb") as handle:
        while True:
            block = handle.read(4096)
            if not block:
                # an empty read marks the end of the file
                break
            digest.update(block)
    return digest.hexdigest()


# ------------------------------------------------------------------------------
# print all motus and their information
# ------------------------------------------------------------------------------
def print_all_motus():
    """Print a tab-separated table with one row per mOTU.

    The columns are the mOTU name, the number of genomes mapped to it, the
    URL of its archive and the archive's md5. Both the URL table and the
    genome-to-mOTU map are fetched by the helper functions of this script.
    A mOTU present in the URL table but absent from the genome map is
    reported with 0 genomes instead of aborting the listing.
    """
    all_url_info = download_url_links()
    # dict like: 'ext_mOTU_v3_31652': [md5, url]
    genome_2_motus = load_map_genome_motus()
    # it's a dict genome --> mOTU

    # we find the number of genomes per mOTU
    motus_count = dict()
    for motu in genome_2_motus.values():
        motus_count[motu] = motus_count.get(motu, 0) + 1

    # print the result
    print("motu\tnumber_of_genomes\turl_file\tmd5")
    for motu, info in all_url_info.items():
        # info[0] is the md5, info[1] is the URL
        n_genomes = motus_count.get(motu, 0)
        print(motu + "\t" + str(n_genomes) + "\t" + info[1] + "\t" + info[0])

# ------------------------------------------------------------------------------
# print all genomes and to which motus they map to
# ------------------------------------------------------------------------------
def print_all_genomes():
    """Print every genome with the mOTU it belongs to, one tab-separated row each.

    The genome-to-mOTU map comes from ``load_map_genome_motus``; a header
    line is printed first.
    """
    mapping = load_map_genome_motus()

    print("genome\tmotus")
    for genome_id, motu_id in mapping.items():
        print("\t".join((genome_id, motu_id)))


# ------------------------------------------------------------------------------
# download all genomes
# ------------------------------------------------------------------------------
def download_all_genomes(all_url_info, force_redownload):
    """Download every mOTU listed in ``all_url_info`` and index the files.

    Args:
        all_url_info: dict mapping a mOTU name to ``[md5, url]``.
        force_redownload: when true, an already present mOTU directory is
            deleted and downloaded again.

    For each mOTU the per-mOTU ``1.list_files.txt`` is read, and all of its
    lines are concatenated into ``1.list_all_files.txt`` inside the result
    directory.
    """
    sys.stderr.write("Download all genomes\n")
    # we save also the position of the genomes
    pos_all_genomes = list()
    # we go through each motu
    for motu in all_url_info:
        motu_dir = result_repo + "/" + motu
        # do we force the re-download?
        if force_redownload and os.path.isdir(motu_dir):
            shutil.rmtree(motu_dir)
        # download if necessary
        if os.path.isdir(motu_dir):
            sys.stderr.write("mOTUs "+motu+" already downloaded\n")
        else:
            download_motu(motu, all_url_info)
        # add to list of all genomes downloaded; the context manager closes
        # the file even if reading fails
        with open(motu_dir + "/1.list_files.txt", "r") as listing:
            pos_all_genomes.extend(listing)
    # save position of all genomes ---------------------------------------------
    with open(result_repo + "/1.list_all_files.txt", "w") as out:
        out.writelines(pos_all_genomes)
    # print info ---------------------------------------------------------------
    sys.stderr.write("\nResult in: "+result_repo+"\n")
    sys.stderr.write("List of all genomes in: "+result_repo+"/1.list_all_files.txt\n")


# ------------------------------------------------------------------------------
# download one mOTU
# ------------------------------------------------------------------------------
# progress bar function --------------------------------------------------------
def reporthook(count, block_size, total_size):
    """Progress callback for ``urlretrieve``; rewrites one status line on stdout.

    Args:
        count: number of blocks transferred so far. The call with
            ``count == 0`` only records the start time and prints nothing;
            it is expected to happen before any other call.
        block_size: size of one block in bytes.
        total_size: total size of the download in bytes. Values <= 0 mean
            the size is unknown, in which case no percentage is printed.
    """
    global start_time
    if count == 0:
        start_time = time.time()
        return
    duration = time.time() - start_time
    progress_size = int(count * block_size)
    # the clock may not have advanced yet on the first blocks
    speed = int(progress_size / (1024 * duration)) if duration > 0 else 0
    size_mb = progress_size / (1024 * 1024)
    if total_size > 0:
        # the last block usually overshoots the real size, so cap at 100
        percent = min(100, int(progress_size * 100 / total_size))
        sys.stdout.write("\r %d%%, %d MB, %d KB/s, %d seconds passed" %
                        (percent, size_mb, speed, duration))
    else:
        sys.stdout.write("\r %d MB, %d KB/s, %d seconds passed" %
                        (size_mb, speed, duration))
    sys.stdout.flush()

# main function ----------------------------------------------------------------
def download_motu(ID, all_url_info):
    """Download, verify and unpack the archive of one mOTU.

    The work happens in ``<result_repo>/temp_dir/<ID>``; only after every
    step succeeded is that directory moved to ``<result_repo>/<ID>``.

    Args:
        ID: name of the mOTU; must be a key of ``all_url_info``.
        all_url_info: dict mapping a mOTU name to ``[md5, url]``.

    Returns:
        str: path of the final directory, ``<result_repo>/<ID>``.

    Exits the process with status 1 when the md5 check or the
    decompression of a file fails.
    """
    sys.stderr.write("Download mOTU "+ID+"\n")

    # we create a dir for this motu --------------------------------------------
    where_to_save = result_repo + "/temp_dir/" + ID
    if os.path.isdir(where_to_save):
        # there is a repo in the temp dir, so it means it failed, I remove it
        shutil.rmtree(where_to_save)
    os.mkdir(where_to_save)

    # download the data --------------------------------------------------------
    path_result = where_to_save + "/" + ID + ".tar"
    urlretrieve(all_url_info[ID][1], path_result, reporthook)

    # check md5 ----------------------------------------------------------------
    sys.stderr.write("\n Check md5: ")
    if md5(path_result) == all_url_info[ID][0]:
        sys.stderr.write("MD5 verified\n")
    else:
        sys.stderr.write("MD5 verification failed!\n")
        shutil.rmtree(where_to_save)
        sys.exit(1)

    # untar the file -----------------------------------------------------------
    # NOTE(review): extractall is called without an extraction filter, so the
    # member paths stored in the archive are trusted as they are.
    with tarfile.open(path_result) as my_tar:
        my_tar.extractall(where_to_save)

    # move all files -----------------------------------------------------------
    # the archive is expected to contain one directory named like the mOTU
    source_dir = where_to_save + "/" + ID
    for file_name in os.listdir(source_dir):
        shutil.move(os.path.join(source_dir, file_name), where_to_save)

    # remove unused file and repo ----------------------------------------------
    shutil.rmtree(source_dir)
    os.remove(path_result)

    # unzip the genome files ---------------------------------------------------
    for file_name in os.listdir(where_to_save):
        # the path is passed as a single argument, so spaces or quotes in
        # the installation directory cannot split or corrupt the command
        return_code = subprocess.call(["gunzip", where_to_save + "/" + file_name])
        if return_code:
            sys.stderr.write("Error: unzipping failed.\n")
            sys.exit(1)

    # create a list of all the files -------------------------------------------
    # the directory is listed before the list file is created, so the list
    # does not contain itself
    file_names = os.listdir(where_to_save)
    with open(where_to_save + "/1.list_files.txt", "w") as w:
        for file_name in file_names:
            # paths point to the final destination, not to the temp dir
            w.write(result_repo + "/" + ID + "/" + file_name + "\n")
    sys.stderr.write(" Number of genomes downloaded: "+str(len(file_names))+"\n")

    # move repository to the final destination ---------------------------------
    shutil.move(where_to_save, result_repo)

    # return position of the repository ----------------------------------------
    return result_repo+"/"+ID


# ------------------------------------------------------------------------------
# download one genome
# ------------------------------------------------------------------------------
def download_genome(ID, genome_2_motus, all_url_info,force_redownload):
    """Make sure the mOTU containing genome ``ID`` is available locally.

    Args:
        ID: genome identifier; must be a key of ``genome_2_motus``.
        genome_2_motus: dict mapping a genome to its mOTU.
        all_url_info: dict mapping a mOTU name to ``[md5, url]``.
        force_redownload: when true, an existing mOTU directory is removed
            and fetched again.

    The whole mOTU is downloaded (unless already present) and the expected
    location of the genome file is reported on stderr.
    """
    sys.stderr.write("Download genome %s\n" % ID)
    # the genome is shipped inside the archive of its mOTU
    motu = genome_2_motus[ID]
    motu_dir = "%s/%s" % (result_repo, motu)

    # a forced re-download starts by discarding the previous copy
    if force_redownload and os.path.isdir(motu_dir):
        shutil.rmtree(motu_dir)

    # fetch the mOTU only when it is not on disk
    if not os.path.isdir(motu_dir):
        download_motu(motu, all_url_info)
    else:
        sys.stderr.write("mOTUs %s already downloaded\n" % motu)

    # tell the user where the genome is expected to be
    sys.stderr.write("\nResult in: %s/%s.fa\n" % (motu_dir, ID))


# ------------------------------------------------------------------------------
# download map of genomes to mOTUs
# ------------------------------------------------------------------------------
def load_map_genome_motus():
    """Download the genome-to-mOTU table and return it as a dict.

    The gzip-compressed, tab-separated file is fetched from the URL below and
    decompressed in memory; the first column is used as key (genome) and the
    second as value (mOTU). Blank lines are skipped.

    Returns:
        dict: genome --> mOTU.
    """
    file_map="https://zenodo.org/record/6975139/files/genome_2_mOTU_v3.0.1.tsv.gz?download=1"

    req = Request(file_map)
    req.add_header('Accept-Encoding', 'gzip')
    # the context manager closes the connection even if reading fails
    with urlopen(req) as response:
        content = gzip.decompress(response.read())

    res = dict()
    for line in content.splitlines():
        text = line.decode('utf-8').rstrip()
        if not text:
            # a blank line carries no mapping
            continue
        vals = text.split("\t")
        res[vals[0]] = vals[1]

    return res

# ------------------------------------------------------------------------------
# download the info about the mOTUs on the ETH archive
# ------------------------------------------------------------------------------
def download_url_links():
    """Download the table with the md5 and the URL of every mOTU archive.

    Each non-blank line of the tab-separated file is read as: mOTU name,
    md5, URL.

    Returns:
        dict: mOTU name --> ``[md5, url]``.
    """
    file_URLs="https://sunagawalab.ethz.ch/share/TEMP_MOTUS_DB/data_download_genomes/motu_md5_libdrive.tsv"

    res = dict()

    # the context manager closes the connection even if parsing fails
    with urlopen(file_URLs) as data:
        for line in data:
            text = line.decode('utf8').rstrip()
            if not text:
                # a blank line carries no entry
                continue
            vals = text.split("\t")
            res[vals[0]] = [vals[1], vals[2]]

    return res


# ------------------------------------------------------------------------------
# MAIN
# ------------------------------------------------------------------------------
def main(argv=None):
    """Command-line entry point.

    Args:
        argv: list of command-line arguments (without the program name).
            ``None`` makes argparse read ``sys.argv[1:]``.

    Returns:
        int: 0 on success (including when the help or a listing is printed),
        1 when ``-m`` is missing or the requested ID is unknown.
    """
    parser = argparse.ArgumentParser(usage=msg(), formatter_class=CapitalisedHelpFormatter,add_help=False)
    parser.add_argument('-m', action="store", dest='ID', default=None, help='what needs to be saved')
    parser.add_argument('-f', action='store_true', default=None, dest='force_redownload', help='redo the download')
    parser.add_argument('-l', action='store_true', default=None, dest='print_list_motus', help='print all motus')
    parser.add_argument('-g', action='store_true', default=None, dest='print_list_genomes', help='print all genomes')
    parser.add_argument('-h', action='store_true', default=None, dest='print_help', help='print the help')
    parser.add_argument('-help', action='store_true', default=None, dest='print_help2', help='print the help')
    parser.add_argument('--help', action='store_true', default=None, dest='print_help3', help='print the help')
    # honour the argv given by the caller instead of always reading sys.argv
    args = parser.parse_args(argv)

    # the flags default to None, so "is None" means "not given"
    nothing_requested = (
        args.print_list_motus is None
        and args.force_redownload is None
        and args.ID is None
        and args.print_list_genomes is None
    )
    help_requested = not (
        args.print_help is None
        and args.print_help2 is None
        and args.print_help3 is None
    )
    if nothing_requested or help_requested:
        print(msg())
        return 0

    # set default for the -f (None when the flag was not given)
    force_redownload = bool(args.force_redownload)

    # print the motus and exit -------------------------------------------------
    if args.print_list_motus is not None:
        print_all_motus()
        return 0
    # print the genomes and exit -----------------------------------------------
    if args.print_list_genomes is not None:
        print_all_genomes()
        return 0

    # identify what needs to be downloaded and run the correct function --------
    if args.ID is None:
        sys.stderr.write("Error: missing -m\n")
        return 1

    # create directory with the result -----------------------------------------
    if os.path.isdir(result_repo):
        sys.stderr.write("Warning: output directory exists already ("+result_repo+")\n")
    else:
        sys.stderr.write("Creating output directory ("+result_repo+")\n")
        os.mkdir(result_repo)

    # create a temp dir where to save the files while they are being downloaded
    if not os.path.isdir(result_repo+"/temp_dir"):
        os.mkdir(result_repo+"/temp_dir")

    # download the info about the URLs from the ETH archive --------------------
    all_url_info = download_url_links()
    # dict like: 'ext_mOTU_v3_31652': [md5, url]

    # download all genomes -----------------------------------------------------
    if args.ID == "all":
        download_all_genomes(all_url_info, force_redownload)
        return 0

    # download all genomes from a mOTU -----------------------------------------
    if args.ID.startswith(("ref_mOTU", "meta_mOTU", "ext_mOTU")) or args.ID == "no_mOTU":
        if args.ID not in all_url_info:
            sys.stderr.write("Error: mOTUs not found\n")
            return 1
        motu_dir = result_repo + "/" + args.ID
        # do we force the re-download:
        if force_redownload and os.path.isdir(motu_dir):
            shutil.rmtree(motu_dir)
        # download if necessary
        if os.path.isdir(motu_dir):
            sys.stderr.write("mOTUs already downloaded before\n")
            path_result = motu_dir
        else:
            path_result = download_motu(args.ID, all_url_info)
        sys.stderr.write("\nResult in: "+path_result+"\n")
        sys.stderr.write("List of all genomes in: "+path_result+"/1.list_files.txt\n")
        return 0

    # download one genome ------------------------------------------------------
    genome_2_motus = load_map_genome_motus()   # it's a dict genome --> mOTU
    # check if the genome is a correct ID
    if args.ID not in genome_2_motus:
        sys.stderr.write("Error: genome not found\n")
        return 1
    download_genome(args.ID, genome_2_motus, all_url_info, force_redownload)
    return 0


#-------------------------------- run main -------------------------------------
if __name__ == '__main__':
    # the value returned by main() becomes the exit status of the process
    sys.exit(main())