Add --input_variant parameter for handling single variant #112

Merged: 5 commits, Dec 18, 2024
6 changes: 3 additions & 3 deletions Dockerfile
@@ -9,7 +9,7 @@
#FROM ubuntu:18.04
FROM ensemblorg/ensembl-vep:release_104.3
USER root
ENV DEBIAN_FRONTEND noninteractive
ENV DEBIAN_FRONTEND=noninteractive

# Install dependencies
RUN apt-get update && apt-get install -y \
@@ -38,7 +38,7 @@ RUN pip3 install -r /opt/requirements.txt
RUN pip3 install bgzip

# Install bcftools
RUN wget https://github.com/samtools/bcftools/releases/download/1.20/bcftools-1.20.tar.bz2
RUN curl -OL https://github.com/samtools/bcftools/releases/download/1.20/bcftools-1.20.tar.bz2
RUN mv bcftools-1.20.tar.bz2 /opt/bcftools-1.20.tar.bz2
RUN tar -xf /opt/bcftools-1.20.tar.bz2 -C /opt/ && \
rm /opt/bcftools-1.20.tar.bz2 && \
@@ -49,7 +49,7 @@ RUN tar -xf /opt/bcftools-1.20.tar.bz2 -C /opt/ && \
rm -rf /opt/bcftools-1.20

# Install bedtools
RUN wget https://github.com/arq5x/bedtools2/releases/download/v2.30.0/bedtools.static.binary
RUN curl -OL https://github.com/arq5x/bedtools2/releases/download/v2.30.0/bedtools.static.binary
RUN mv bedtools.static.binary /run/bedtools
RUN chmod a+x /run/bedtools

37 changes: 18 additions & 19 deletions bin/add_c_nc.py
@@ -10,13 +10,13 @@ def add_c_nc(score, ref):
if ref == "hg38":
clin_c = pd.read_csv("merge_expand/hg38/clin_c.tsv.gz", sep="\t")
clin_nc = pd.read_csv("merge_expand/hg38/clin_nc.tsv.gz", sep="\t")
hgmd_nc = pd.read_csv("merge_expand/hg38/hgmd_nc.tsv.gz", sep="\t")
hgmd_c = pd.read_csv("merge_expand/hg38/hgmd_c.tsv.gz", sep="\t")
hgmd_nc = pd.read_csv("merge_expand/hg38/hgmd_nc.tsv.gz", sep="\t")
else:
clin_c = pd.read_csv("merge_expand/hg19/clin_c.tsv.gz", sep="\t")
clin_nc = pd.read_csv("merge_expand/hg19/clin_nc.tsv.gz", sep="\t")
hgmd_nc = pd.read_csv("merge_expand/hg19/hgmd_nc.tsv.gz", sep="\t")
hgmd_c = pd.read_csv("merge_expand/hg19/hgmd_c.tsv.gz", sep="\t")
hgmd_nc = pd.read_csv("merge_expand/hg19/hgmd_nc.tsv.gz", sep="\t")

a = score.pos.values
ac = score.chrom.values
@@ -28,7 +28,7 @@ def add_c_nc(score, ref):

i, j = np.where((a[:, None] >= bl) & (a[:, None] <= bh) & (ac[:, None] == bc))

cln = pd.concat(
clin = pd.concat(
[
temp.loc[i, :].reset_index(drop=True),
clin_nc.loc[j, :].reset_index(drop=True),
@@ -38,7 +38,7 @@

# Take into account When HGMD data base is empty
if hgmd_nc.shape[0] == 0:
pass
hgmd = hgmd_nc

else:
# merge by region
@@ -58,29 +58,28 @@
axis=1,
)

merged = score.merge(cln, how="left", on="varId")
if hgmd_nc.shape[0] == 0:
merged["nc_HGMD_Exp"] = np.NaN
merged["nc_RANKSCORE"] = np.NaN
else:
merged = merged.merge(hgmd, how="left", on="varId")
merged = score.merge(
clin_c.rename(columns={'new_chr': 'chrom', 'new_pos': 'pos'}),
how="left",
on=["chrom", "pos", "ref", "alt"],
)
merged = merged.merge(clin, how="left", on="varId")

if hgmd_c.shape[0] == 0:
merged["c_HGMD_Exp"] = np.NaN
merged["c_RANKSCORE"] = np.NaN
merged["CLASS"] = np.NaN
else:
merged = merged.merge(
hgmd_c,
hgmd_c.rename(columns={'new_chr': 'chrom', 'new_pos': 'pos'}),
how="left",
left_on=["chrom", "pos", "ref", "alt"],
right_on=["new_chr", "new_pos", "ref", "alt"],
on=["chrom", "pos", "ref", "alt"],
)
merged = merged.merge(
clin_c,
how="left",
left_on=["chrom", "pos", "ref", "alt"],
right_on=["new_chr", "new_pos", "ref", "alt"],
)

if hgmd_nc.shape[0] == 0:
merged["nc_HGMD_Exp"] = np.NaN
merged["nc_RANKSCORE"] = np.NaN
else:
merged = merged.merge(hgmd, how="left", on="varId")

return merged
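
A minimal sketch of the coding-table merge and the empty-HGMD fallback used above, with toy frames and the column names taken from the diff (the CLNSIG column is just a placeholder), not the pipeline's real data:

import numpy as np
import pandas as pd

score = pd.DataFrame({"varId": ["1-100-A-T"], "chrom": ["1"], "pos": [100],
                      "ref": ["A"], "alt": ["T"]})
clin_c = pd.DataFrame({"new_chr": ["1"], "new_pos": [100], "ref": ["A"],
                       "alt": ["T"], "CLNSIG": ["Pathogenic"]})
hgmd_c = pd.DataFrame(columns=["new_chr", "new_pos", "ref", "alt",
                               "c_HGMD_Exp", "c_RANKSCORE", "CLASS"])

# Coding tables join on exact coordinates once new_chr/new_pos are renamed,
# so a single "on" list replaces the earlier left_on/right_on pair.
merged = score.merge(
    clin_c.rename(columns={"new_chr": "chrom", "new_pos": "pos"}),
    how="left", on=["chrom", "pos", "ref", "alt"],
)

# An empty HGMD table contributes NaN columns instead of a join.
if hgmd_c.shape[0] == 0:
    merged["c_HGMD_Exp"] = np.nan
    merged["c_RANKSCORE"] = np.nan
    merged["CLASS"] = np.nan
else:
    merged = merged.merge(
        hgmd_c.rename(columns={"new_chr": "chrom", "new_pos": "pos"}),
        how="left", on=["chrom", "pos", "ref", "alt"],
    )
print(merged)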
5 changes: 4 additions & 1 deletion bin/extraModel/generate_bivar_data.py
@@ -68,7 +68,7 @@ def process_sample(data_folder, sample_id, default_pred, labeling=False):
]

# Remove all the duplicated variant pairs.
gene_var_pairs_df = pd.DataFrame(gene_var_pairs)
gene_var_pairs_df = pd.DataFrame(gene_var_pairs, columns=['geneEnsId', 'varId1', 'varId2'])
# gene_var_pairs_df = gene_var_pairs_df.drop_duplicates(['varId1', 'varId2'])

# Use only subset columns of features
@@ -130,4 +130,7 @@ def process_sample(data_folder, sample_id, default_pred, labeling=False):
# Sort before saving
recessive_feature_df = recessive_feature_df.sort_index()

if recessive_feature_df.shape[0] == 0:
return

recessive_feature_df.to_csv(f"{recessive_folder}/{sample_id}.csv")
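
A small illustration of why the explicit column names matter when a sample has no candidate pairs (a toy sketch, not the pipeline's data): without columns=, an empty list yields a DataFrame with no columns at all, so any later selection such as df[["varId1", "varId2"]] raises KeyError.

import pandas as pd

pairs = []  # e.g. no gene/variant pairs found for this sample

# Explicit columns keep the expected schema even when the list is empty.
gene_var_pairs_df = pd.DataFrame(pairs, columns=["geneEnsId", "varId1", "varId2"])
print(gene_var_pairs_df.columns.tolist())  # ['geneEnsId', 'varId1', 'varId2']
print(gene_var_pairs_df.empty)             # True, so the caller can return early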
2 changes: 1 addition & 1 deletion bin/feature.py
@@ -258,7 +258,7 @@ def main():
print("input annoatated varFile:", args.varFile)
t1 = time.time()
varDf = pd.read_csv(
args.varFile, sep="\t", skiprows=numHeaderSkip, error_bad_lines=False
args.varFile, sep="\t", skiprows=numHeaderSkip,
)
# varDf=varDf[0:10]
# #do this if need to have a small test
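
Note on the dropped keyword: error_bad_lines was deprecated in pandas 1.3 and removed in pandas 2.0, so deleting it keeps this call working on newer pandas. If skipping malformed rows is still desired, a hedged sketch of the modern equivalent (placeholder path; pandas >= 1.3 assumed):

import pandas as pd

varDf = pd.read_csv(
    "annotated_variants.tsv",  # placeholder, not the pipeline's actual file
    sep="\t",
    skiprows=0,                # numHeaderSkip in the real script
    on_bad_lines="skip",       # replacement for error_bad_lines=False
)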
35 changes: 35 additions & 0 deletions bin/generate_input_vcf.py
@@ -0,0 +1,35 @@
#!/usr/bin/env python3.8

import string
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("variant", type=str)

args = parser.parse_args()

vcf_template = string.Template("""
##fileformat=VCFv4.2
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##FILTER=<ID=PASS,Description="All filters passed">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
$chrom $pos . $ref $alt . . . GT:AD:DP:GQ:PL 0/1:7,5:12:99:142,0,214
""".strip())

def main(variant):
chrom, pos, ref, alt = variant.split("-")
with open("input.vcf", "w", encoding="ascii") as text_file:
text_file.write(vcf_template.substitute(
chrom=chrom,
pos=pos,
ref=ref,
alt=alt,
))

if __name__ == "__main__":
main(**vars(args))
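
For orientation, a toy illustration of the variant-string convention this script expects and the single record it writes to input.vcf (the variant below is made up):

# chr-pos-ref-alt, split on "-" exactly as in main() above
variant = "1-55516888-G-GA"
chrom, pos, ref, alt = variant.split("-")
# Substituted into the template, this yields one VCF data line:
#   1  55516888  .  G  GA  .  .  .  GT:AD:DP:GQ:PL  0/1:7,5:12:99:142,0,214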
1 change: 1 addition & 0 deletions docs/source/nf_usage.txt
@@ -18,6 +18,7 @@ Args:
--outdir Output directory.
--run_id Unique identifier for this run. (default: 1)
--ref_ver Reference genome version [hg38 or hg19] (default: hg19)
--input_variant Variant ID formatted as 'chr-pos-ref-alt' (optional)
--bed_filter Path to a BED file to perform an analysis only for regions of interest (optional)
--exome_filter Enable exonic filter. Addition will filter out variants outside of exonic region (default: false)
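
For illustration, a hypothetical single-variant invocation (the entry script name, paths, and variant string are assumptions, not taken from this PR; other required inputs are omitted):

nextflow run main.nf --run_id 1 --ref_ver hg19 --outdir results --input_variant 1-55516888-G-GA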
