From 4e9ffc6bbef33a314e2a2bfc9fc8a58c08486506 Mon Sep 17 00:00:00 2001 From: Laura Luebbert Date: Sat, 14 Oct 2023 14:07:04 -0700 Subject: [PATCH] Clean up gget diamond --- gget/gget_diamond.py | 214 ++++++++++++++++++------------------------- gget/utils.py | 39 +++++++- 2 files changed, 126 insertions(+), 127 deletions(-) diff --git a/gget/gget_diamond.py b/gget/gget_diamond.py index 7ef4b6b5..eb523130 100644 --- a/gget/gget_diamond.py +++ b/gget/gget_diamond.py @@ -5,12 +5,10 @@ import os import pandas as pd import uuid - -# DIAMOND and ELM id for temporary files -RANDOM_ID = str(uuid.uuid4()) - +import json as json_package from .compile import PACKAGE_PATH +from .utils import tsv_to_df, create_tmp_fasta, remove_temp_files # Path to precompiled diamond binary if platform.system() == "Windows": @@ -23,146 +21,97 @@ ) -def tsv_to_df(tsv_file, headers=None): - """ - Convert tsv file to dataframe format - - Args: - tsv_file - file to be converted - - Returns: - df - dataframe - - """ - - try: - df = pd.DataFrame() - if headers: - df = pd.read_csv(tsv_file, sep="\t", names=headers) - else: - # ELM Instances.tsv file had 5 lines before headers and data - df = pd.read_csv(tsv_file, sep="\t", skiprows=5) - return df - - except pd.errors.EmptyDataError: - logging.warning(f"Query did not result in any matches.") - return None - - -def create_input_file(sequences): - """ - Copy sequences to a temporary fasta file for DIAMOND alignment - - Args: - sequences - list of user input amino acid sequences - - Returns: input file absolute path - """ - # print(f"sequences for input file{sequences}") - if type(sequences) == str: - sequences = [sequences] - - with open(f"tmp_{RANDOM_ID}.fa", "w") as f: - for idx, seq in enumerate(sequences): - f.write(f">Seq {idx}\n") - f.write(f"{seq}\n") - - return f"tmp_{RANDOM_ID}.fa" - # check if correct sequences are written to file - # try: - # with open(f"{os.getcwd()}tmp_{RANDOM_ID}.fa", 'r') as f: - # print(f.read()) - # except: - # continue - - -def remove_temp_files(): - """ - Delete temporary files - - Args: - input - Input fasta file containing amino acid sequences - out - Output tsv file containing the output returned by DIAMOND - reference - Reference database binary file produced by DIAMOND - - Returns: - None - """ - if os.path.exists(f"tmp_{RANDOM_ID}_out.tsv"): - os.remove(f"tmp_{RANDOM_ID}_out.tsv") - if os.path.exists("reference.dmnd"): - os.remove("reference.dmnd") - if os.path.exists("tmp_{RANDOM_ID}.fa"): - os.remove("tmp_{RANDOM_ID}.fa") - - def diamond( - sequences, + query, reference, + diamond_db=None, + out=None, json=False, verbose=True, - out=None, sensitivity="very-sensitive", + threads=1, ): """ Perform protein sequence alignment using DIAMOND for multiple sequences Args: - - input Input sequences path and file name (include ,fa) in FASTA file format - - reference Reference file path and file name (include .fa) in FASTA file format - - json If True, returns results in json format instead of data frame. Default: False. - - out folder name to save two resulting csv files. Default: results (default: None). - - verbose True/False whether to print progress information. Default True. - - sensitivity The sensitivity can be adjusted using the options --fast, --mid-sensitive, --sensitive, --more-sensitive, --very-sensitive and --ultra-sensitive. - - Returns DIAMOND output in tsv format + - query Sequences (str or list) or path to FASTA file containing sequences to be aligned against the reference. + - reference Reference sequences (str or list) or path to FASTA file containing reference sequences. + - diamond_db Path to save DIAMOND database created from reference. + Default: None -> Temporary db file will be deleted after alingment or saved in out. + - out Path to folder to save DIAMOND results in. Default: None. + - json If True, returns results in json format instead of data frame. Default: False. + - verbose True/False whether to print progress information. Default True. + - sensitivity One of the following: fast, mid-sensitive, sensitive, more-sensitive, very-sensitive or ultra-sensitive. + Default: "very-sensitive" + - threads Number of threads to use for alignment. Default: 1. + + Returns a data frame with the DIAMOND alignment results. (Or JSON formatted dictionary if json=True.) """ - # TODO: --very_sensitive and makedb --in as args - # if out is None, create temp file and delete once get dataframe - # if make + # Check argument validity + supported_sens = [ + "fast", + "mid-sensitive", + "sensitive", + "more-sensitive", + "very-sensitive", + "ultra-sensitive", + ] + if sensitivity not in supported_sens: + raise ValueError( + f"'sensitivity' argument specified as {sensitivity}. Expected one of: {', '.join(supported_sens)}" + ) + + # Define paths to query/reference/db/output files + files_to_delete = [] + if "." in query: + input_file = os.path.abspath(query) + else: + input_file = create_tmp_fasta(query) + files_to_delete.append(input_file) - input_file = create_input_file(sequences) - output = f"tmp_{RANDOM_ID}_out.tsv" + if "." in reference: + reference_file = os.path.abspath(reference) + else: + reference_file = create_tmp_fasta(reference) + files_to_delete.append(reference_file) - if out is None: - command = f"{PRECOMPILED_DIAMOND_PATH} makedb --quiet --in {reference} -d reference \ - && {PRECOMPILED_DIAMOND_PATH} blastp --quiet -q {input_file} -d reference -o {output} --{sensitivity} --ignore-warnings" + if out: + out = os.path.abspath(out) + output = f"{out}/DIAMOND_results.tsv" else: - output = out - # The double-quotation marks allow white spaces in the path, but this does not work for Windows - command = f"{PRECOMPILED_DIAMOND_PATH} makedb --quiet --in {reference} -d reference \ - && {PRECOMPILED_DIAMOND_PATH} blastp --quiet -q {input_file} -d reference -o {out}.tsv --{sensitivity} --ignore-warnings" - # Run diamond command and write command output - with subprocess.Popen(command, shell=True, stderr=subprocess.PIPE) as process_2: - stderr_2 = process_2.stderr.read().decode("utf-8") + output = os.path.abspath(f"tmp_{str(uuid.uuid4())}_out.tsv") + files_to_delete.append(output) + + if not diamond_db and out: + diamond_db = f"{out}/DIAMOND_db" + elif not diamond_db: + diamond_db = os.path.abspath(f"tmp_db_{str(uuid.uuid4())}") + files_to_delete.append(diamond_db + ".dmnd") + + command = f"{PRECOMPILED_DIAMOND_PATH} makedb --in {reference_file} --db {diamond_db} --threads {threads} \ + && {PRECOMPILED_DIAMOND_PATH} blastp --query {input_file} --db {reference_file} --out {output} --{sensitivity} --threads {threads}" + + # Run DIAMOND + if verbose: + logging.info(f"Creating DIAMOND database and initiating alignment...") + + with subprocess.Popen(command, shell=True, stderr=subprocess.PIPE) as process: + stderr = process.stderr.read().decode("utf-8") # Log the standard error if it is not empty - if stderr_2: - sys.stderr.write(stderr_2) - # Exit system if the subprocess returned wstdout = sys.stdout + if stderr: + sys.stderr.write(stderr) - if process_2.wait() != 0: - #TODO: change error message - logging.error( - """ - DIAMOND failed. - """ - ) - return + # Exit system if the subprocess returned wstdout = sys.stdout + if process.wait() != 0: + raise RuntimeError("DIAMOND alignment failed.") else: if verbose: - logging.info(f"DIAMOND run complete.") - # try: - # with open(f"{os.getcwd()}/out.fa", 'r') as f: - # print(f.read()) - # except: - # pass - # print(f"Input file: {input_file}") - # print(f"Reference file: {reference}") - # print(f"Output file: {output}") + logging.info(f"DIAMOND alignment complete.") df_diamond = tsv_to_df( output, - [ + headers=[ "query_accession", "target_accession", "Per. Ident", @@ -177,5 +126,20 @@ def diamond( "bit_score", ], ) - remove_temp_files() - return df_diamond + + # Delete temporary files + if files_to_delete: + remove_temp_files(files_to_delete) + + if json: + results_dict = json_package.loads(df_diamond.to_json(orient="records")) + if out: + with open(f"{out}/gget_diamond_results.json", "w", encoding="utf-8") as f: + json_package.dump(results_dict, f, ensure_ascii=False, indent=4) + + return results_dict + + else: + if out: + df_diamond.to_csv(f"{out}/gget_diamond_results.csv", index=False) + return df_diamond diff --git a/gget/utils.py b/gget/utils.py index 7cb14f28..6a1b44ca 100644 --- a/gget/utils.py +++ b/gget/utils.py @@ -4,6 +4,8 @@ # from requests.adapters import HTTPAdapter, Retry # import time import re +import os +import uuid import pandas as pd import numpy as np from IPython.display import display, HTML @@ -767,5 +769,38 @@ def tsv_to_df(tsv_file, headers=None, skiprows=None): return df except pd.errors.EmptyDataError: - logging.error(f"tsv to data frame reformatting failed.") - return None + raise RuntimeError(f"tsv to data frame reformatting failed.") + + +def create_tmp_fasta(sequences): + """ + Create temporary FASTA file from str or list of sequences. + + Args: + - sequences List of user input amino acid sequences + + Returns: Absolute path to temoprary FASTA file. + """ + # Generate random ID + random_id = str(uuid.uuid4()) + + if type(sequences) == str: + sequences = [sequences] + + with open(f"tmp_{random_id}.fa", "w") as f: + for idx, seq in enumerate(sequences): + f.write(f">Seq {idx}\n" + seq + "\n") + + return os.path.abspath(f"tmp_{random_id}.fa") + + +def remove_temp_files(files_to_delete): + """ + Delete temporary files. + + Args: + - files_to_delete List of paths to files to delete. + """ + for file in files_to_delete: + if os.path.exists(file): + os.remove(file)