Skip to content

Commit

Permalink
adding hallmark taxonomy db
Browse files Browse the repository at this point in the history
  • Loading branch information
mtisza1 committed Mar 19, 2024
1 parent f3b2ad8 commit 624a325
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 41 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.Rhistory

blast_DBs/
cdd_rps_db/
hmmscan_DBs/
Expand Down
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ conda 23.7.4

`cd ..`

`get_ct3_dbs -o ct3_DBs --hmm T --mmseqs_tax T --mmseqs_cdd T --domain_list T`
`get_ct3_dbs -o ct3_DBs --hmm T --hallmark_tax T --mmseqs_cdd T --domain_list T`

<details>

Expand All @@ -107,7 +107,7 @@ conda 23.7.4
| pdb70 | 56 GB |

```
get_ct3_dbs -o ct3_DBs --hmm T --mmseqs_tax T --mmseqs_cdd T --domain_list T --hhCDD T --hhPFAM T --hhPDB T
get_ct3_dbs -o ct3_DBs --hmm T --hallmark_tax T --mmseqs_cdd T --domain_list T --hhCDD T --hhPFAM T --hhPDB T
```

</details>
Expand Down Expand Up @@ -142,7 +142,7 @@ conda 23.7.4

`cd ..`

`get_ct3_dbs -o ct3_DBs --hmm T --mmseqs_tax T --mmseqs_cdd T --domain_list T`
`get_ct3_dbs -o ct3_DBs --hmm T --hallmark_tax T --mmseqs_cdd T --domain_list T`

<details>

Expand All @@ -161,7 +161,7 @@ conda 23.7.4
| pdb70 | 56 GB |

```
get_ct3_dbs -o ct3_DBs --hmm T --mmseqs_tax T --mmseqs_cdd T --domain_list T --hhCDD T --hhPFAM T --hhPDB T
get_ct3_dbs -o ct3_DBs --hmm T --hallmark_tax T --mmseqs_cdd T --domain_list T --hhCDD T --hhPFAM T --hhPDB T
```

</details>
Expand Down
19 changes: 3 additions & 16 deletions src/cenote/cenote_main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ echo "Cenote scripts directory: $CENOTE_SCRIPTS" >> ${C_OUTDIR}/run_arg
echo "Template file: $TEMPLATE_FILE" >> ${C_OUTDIR}/run_arguments.txt
echo "read file(s): $READS" >> ${C_OUTDIR}/run_arguments.txt
echo "HHsuite tool: $HHSUITE_TOOL" >> ${C_OUTDIR}/run_arguments.txt
echo "Taxonomy DB: $TAXDBV" >> ${C_OUTDIR}/run_arguments.txt


cat ${C_OUTDIR}/run_arguments.txt

Expand Down Expand Up @@ -1232,22 +1234,7 @@ if [ -n "$FSA_FILES" ] && [ "$GENBANK" == "True" ] ; then
${TEMP_DIR}/mapping_reads/oriented_hallmark_contigs.pruned.coverage.tsv\
${C_OUTDIR}/sequin_and_genome_maps\
$ASSEMBLER $SEQTECH
# for REC in $FSA_FILES ; do
# if [ -s ${TEMP_DIR}/mapping_reads/oriented_hallmark_contigs.pruned.coverage.tsv ] ; then
# COVERAGE=$( awk -v SEQNAME="${REC%.fsa}" '{OFS=FS="\t"}{ if ($1 == SEQNAME) {print $7}}' \
# ${TEMP_DIR}/mapping_reads/oriented_hallmark_contigs.pruned.coverage.tsv | head -n1 )
#
# else
# COVERAGE=1
# fi
#
# echo "StructuredCommentPrefix ##Genome-Assembly-Data-START##" > ${REC%.fsa}.cmt
# echo "Assembly Method ${ASSEMBLER}" >> ${REC%.fsa}.cmt
# echo "Genome Coverage "$COVERAGE"x" >> ${REC%.fsa}.cmt
# echo "Sequencing Technology Illumina" >> ${REC%.fsa}.cmt
# echo "Annotation Pipeline Cenote-Taker 3" >> ${REC%.fsa}.cmt
# echo "URL https://github.com/mtisza1/Cenote-Taker3" >> ${REC%.fsa}.cmt
# done

fi

### Merge final virus seqs
Expand Down
11 changes: 7 additions & 4 deletions src/cenote/cenotetaker3.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,11 @@ def cenotetaker3():
the first nucleotide in the contig/genome')
optional_args.add_argument("--genbank", dest="GENBANK", type=str2bool, default="True",
help='Default: True -- Make GenBank files (.gbf, .sqn, .fsa, .tbl, .cmt, etc)?')
optional_args.add_argument("--taxdb", dest="TAXDB", type=str, choices=['refseq', 'nr90'],
default='nr90',
help='Default: nr90 -- Which taxonomy database to use, refseq virus OR nr clustered at 90 percent AAI and filtered down to virus hallmark genes')
optional_args.add_argument("--taxdb", dest="TAXDB", type=str, choices=['refseq', 'hallmark'],
default='hallmark',
help='Default: hallmark -- Which taxonomy database to use, just refseq virus OR virus hallmark genes \
from nr virus containing genus, family, and class taxonomy labels and clustered at 90 percent \
AAI plus all hallmark genes from refseq virus')
optional_args.add_argument("--seqtech", dest="SEQTECH", type=str,
default='Illumina',
help='Default: Illumina -- Which sequencing technology produced the reads?')
Expand Down Expand Up @@ -292,7 +294,8 @@ def cenotetaker3():
TAXDBV = 'refseq_virus_prot_taxDB'
elif args.TAXDB == 'nr90':
TAXDBV = 'ct3_hallmark_nr_virus.cd90.taxDB'

elif args.TAXDB == 'hallmark':
TAXDBV = 'ct3_hallmark.taxDB'

## check that all DBs exist
def check_ct3_dbs():
Expand Down
55 changes: 40 additions & 15 deletions src/cenote/get_ct3_databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ def get_ct3_dbs():
#print(cenote_script_path)

parser = argparse.ArgumentParser(description='Update and/or download databases associated with \
Cenote-Taker 3. HMM (hmmer) databases: updated October 8th 2023.\
RefSeq Virus taxonomy DB compiled July 31, 2023.')
Cenote-Taker 3. HMM (hmmer) databases: updated January 10th, 2024.\
RefSeq Virus taxonomy DB compiled July 31, 2023.\
hallmark taxonomy database added March 19th, 2024')

required_args = parser.add_argument_group(' REQUIRED ARGUMENTS')

Expand All @@ -32,7 +33,9 @@ def get_ct3_dbs():

optional_args.add_argument("--hmm", dest="HMM_DB", type=str2bool, default=False,
help=' Default: False -- choose: True -or- False')
optional_args.add_argument("--mmseqs_tax", dest="MMSEQS_TAX", type=str2bool, default=False,
optional_args.add_argument("--refseq_tax", dest="REFSEQ_TAX", type=str2bool, default=False,
help=' Default: False -- choose: True -or- False')
optional_args.add_argument("--hallmark_tax", dest="HALLMARK_TAX", type=str2bool, default=False,
help=' Default: False -- choose: True -or- False')
optional_args.add_argument("--mmseqs_cdd", dest="MMSEQS_CDD", type=str2bool, default=False,
help=' Default: False -- choose: True -or- False')
Expand Down Expand Up @@ -61,44 +64,66 @@ def is_tool(name):
sys.exit()

if str(args.HMM_DB) == "True":
# https://zenodo.org/records/10480890/files/hmmscan_DBs.tgz
# https://zenodo.org/records/10840546/files/hmmscan_DBs.tgz
print ("running HMM database update/install")
HMM_outdir = os.path.join(str(args.C_DBS), "hmmscan_DBs")
if not os.path.isdir(HMM_outdir):
os.makedirs(HMM_outdir, exist_ok=True)
subprocess.call(['wget', '--directory-prefix=' + str(HMM_outdir),
'https://zenodo.org/records/10480890/files/hmmscan_DBs.tgz'])
'https://zenodo.org/records/10840546/files/hmmscan_DBs.tgz'])
subprocess.call(['tar', '-xvf', os.path.join(HMM_outdir, 'hmmscan_DBs.tgz'),
'-C', str(HMM_outdir)])
subprocess.call(['rm', '-f', os.path.join(HMM_outdir, 'hmmscan_DBs.tgz')])

if str(args.MMSEQS_TAX) == "True":
# https://zenodo.org/records/10480890/files/refseq_virus_prot.fasta.gz
# https://zenodo.org/records/10480890/files/refseq_virus_prot_taxids.mmseqs_fmt.tsv
print ("running mmseqs taxdb database update/install")
if str(args.REFSEQ_TAX) == "True":
# https://zenodo.org/records/10840546/files/refseq_virus_prot.fasta.gz
# https://zenodo.org/records/10840546/files/ct3_hallmark_nr_cd90_refseq.prot_taxids.mmseqs_fmt.tsv
print ("running mmseqs refseq taxdb database update/install")
if not is_tool("mmseqs") :
print("mmseqs is not found. Exiting. Is conda environment activated?")
sys.exit()
MMSEQS_outdir = os.path.join(str(args.C_DBS), "mmseqs_DBs")
if not os.path.isdir(MMSEQS_outdir):
os.makedirs(MMSEQS_outdir, exist_ok=True)
subprocess.call(['wget', '--directory-prefix=' + str(MMSEQS_outdir),
'https://zenodo.org/records/10480890/files/refseq_virus_prot.fasta.gz'])
'https://zenodo.org/records/10840546/files/refseq_virus_prot.fasta.gz'])
subprocess.call(['gunzip', '-d', os.path.join(MMSEQS_outdir, 'refseq_virus_prot.fasta.gz')])
#subprocess.call(['rm', '-f', os.path.join(MMSEQS_outdir, 'refseq_virus_prot.fasta.gz')])
subprocess.call(['wget', '--directory-prefix=' + str(MMSEQS_outdir),
'https://zenodo.org/records/10480890/files/refseq_virus_prot_taxids.mmseqs_fmt.tsv'])
'https://zenodo.org/records/10840546/files/ct3_hallmark_nr_cd90_refseq.prot_taxids.mmseqs_fmt.tsv'])
subprocess.call(['mmseqs', 'createdb', os.path.join(MMSEQS_outdir, 'refseq_virus_prot.fasta'),
os.path.join(MMSEQS_outdir, 'refseq_virus_prot_taxDB')])
subprocess.call(['mmseqs', 'createtaxdb', os.path.join(MMSEQS_outdir, 'refseq_virus_prot_taxDB'),
os.path.join(MMSEQS_outdir, 'tmp'), '--tax-mapping-file',
os.path.join(MMSEQS_outdir, 'refseq_virus_prot_taxids.mmseqs_fmt.tsv')])

if str(args.HALLMARK_TAX) == "True":
# https://zenodo.org/records/10840546/files/ct3_hallmark_nr_cd90_refseq.faa.gz
# https://zenodo.org/records/10840546/files/ct3_hallmark_nr_cd90_refseq.prot_taxids.mmseqs_fmt.tsv
print ("running mmseqs hallmark taxdb database update/install")
if not is_tool("mmseqs") :
print("mmseqs is not found. Exiting. Is conda environment activated?")
sys.exit()
MMSEQS_outdir = os.path.join(str(args.C_DBS), "mmseqs_DBs")
if not os.path.isdir(MMSEQS_outdir):
os.makedirs(MMSEQS_outdir, exist_ok=True)
subprocess.call(['wget', '--directory-prefix=' + str(MMSEQS_outdir),
'https://zenodo.org/records/10840546/files/ct3_hallmark_nr_cd90_refseq.faa.gz'])
subprocess.call(['gunzip', '-d', os.path.join(MMSEQS_outdir, 'ct3_hallmark_nr_cd90_refseq.faa.gz')])
#subprocess.call(['rm', '-f', os.path.join(MMSEQS_outdir, 'ct3_hallmark_nr_cd90_refseq.faa.gz')])
subprocess.call(['wget', '--directory-prefix=' + str(MMSEQS_outdir),
'https://zenodo.org/records/10840546/files/ct3_hallmark_nr_cd90_refseq.prot_taxids.mmseqs_fmt.tsv'])
subprocess.call(['mmseqs', 'createdb', os.path.join(MMSEQS_outdir, 'ct3_hallmark_nr_cd90_refseq.faa'),
os.path.join(MMSEQS_outdir, 'ct3_hallmark.taxDB')])
subprocess.call(['mmseqs', 'createtaxdb', os.path.join(MMSEQS_outdir, 'ct3_hallmark.taxDB'),
os.path.join(MMSEQS_outdir, 'tmp'), '--tax-mapping-file',
os.path.join(MMSEQS_outdir, 'ct3_hallmark_nr_cd90_refseq.prot_taxids.mmseqs_fmt.tsv')])

if str(args.MMSEQS_CDD) == "True":
# https://zenodo.org/records/10480890/files/cddid_all.tbl
# https://zenodo.org/records/10840546/files/cddid_all.tbl
print ("running mmseqs CDD database update/install")
subprocess.call(['wget', '--directory-prefix=' + str(MMSEQS_outdir),
'https://zenodo.org/records/10480890/files/cddid_all.tbl'])
'https://zenodo.org/records/10840546/files/cddid_all.tbl'])
if not is_tool("mmseqs") :
print("mmseqs is not found. Exiting. Is conda environment activated?")
sys.exit()
Expand All @@ -110,10 +135,10 @@ def is_tool(name):
os.path.join(MMSEQS_outdir, 'tmp')])


# https://zenodo.org/records/10480890/files/viral_cdds_and_pfams_191028.txt
# https://zenodo.org/records/10840546/files/viral_cdds_and_pfams_191028.txt
if str(args.DOM_LIST) == "True":
subprocess.call(['wget', '--directory-prefix=' + str(args.C_DBS),
'https://zenodo.org/records/10480890/files/viral_cdds_and_pfams_191028.txt'])
'https://zenodo.org/records/10840546/files/viral_cdds_and_pfams_191028.txt'])

if str(args.HHCDD) == "True":
print ("running hhsuite CDD database update/install")
Expand Down
6 changes: 5 additions & 1 deletion src/cenote/python_modules/summary_statement.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@

gene_df = pd.read_csv(genes_sum, sep = '\t', header = 0)

num_virus_contigs = len(gene_df['contig'].drop_duplicates())
gene_df['chunk_name'] = gene_df['chunk_name'].infer_objects(copy=False).fillna("NaN")

gene_df['contig_chunk'] = gene_df['contig'] + "@" + gene_df['chunk_name']

num_virus_contigs = len(gene_df['contig_chunk'].drop_duplicates())

all_genes = len(gene_df['gene_name'].drop_duplicates())
nonhypo_genes = len(gene_df.query('evidence_description != "hypothetical protein"')['gene_name'].drop_duplicates())
Expand Down
2 changes: 1 addition & 1 deletion src/cenote/python_modules/virus_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@
if not sam_df.empty:
merge_df = pd.merge(merge_df, sam_df, on = ["contig", "chunk_name"], how = "left")
else:
merge_df['coverage'] = 0
merge_df['coverage'] = "NaN"

merge_df['taxon'] = merge_df['taxon'].infer_objects(copy=False).fillna("unclassified virus")

Expand Down

0 comments on commit 624a325

Please sign in to comment.