Skip to content

Commit

Permalink
fixup: uncomment commented-out pipeline code
Browse files (browse the repository at this point in the history)
  • Loading branch information
phildarnowsky-broad committed Jun 5, 2024
1 parent 6439818 commit 2c91e86
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 82 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -34,5 +34,4 @@ def annotate_vrs_ids(variants_path, exome_variants_path, genome_variants_path):
vrs = exome_vrs.union(genome_vrs)
vrs = vrs.group_by(vrs.locus, vrs.alleles).aggregate(vrs=hl.agg.collect(vrs.vrs))
ds = ds.join(vrs)
ds.describe()
return ds
Original file line number | Diff line number | Diff line change
Expand Up @@ -120,11 +120,7 @@ def add_liftover_document_id(ds):
##############################################################################################################
"gnomad_v4_variants": {
"get_table": lambda: subset_table(
add_variant_document_id(
hl.read_table(
"gs://gnomad-browser-data-pipeline/phil-scratch/output/gnomad_v4/gnomad_v4_variants_annotated_4.ht"
)
)
add_variant_document_id(hl.read_table(gnomad_v4_variants_pipeline.get_output("variants").get_output_path()))
),
"args": {
"index": "gnomad_v4_variants",
Expand Down
150 changes: 74 additions & 76 deletions data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,7 +11,7 @@
)


# from data_pipeline.pipelines.genes import pipeline as genes_pipeline
from data_pipeline.pipelines.genes import pipeline as genes_pipeline

from data_pipeline.datasets.gnomad_v4.gnomad_v4_validation import (
validate_exome_globals_input,
Expand All @@ -25,9 +25,9 @@
)

from data_pipeline.data_types.variant import (
# annotate_variants,
# annotate_transcript_consequences,
# annotate_caids,
annotate_variants,
annotate_transcript_consequences,
annotate_caids,
annotate_vrs_ids,
)

Expand All @@ -39,59 +39,57 @@

config = PipelineConfig(
name=pipeline_name,
# input_root="gs://gnomad-v4-data-pipeline/input",
# output_root="gs://gnomad-v4-data-pipeline/output",
input_root="gs://gnomad-browser-data-pipeline/phil-scratch/input",
output_root="gs://gnomad-browser-data-pipeline/phil-scratch/output",
input_root="gs://gnomad-v4-data-pipeline/input",
output_root="gs://gnomad-v4-data-pipeline/output",
)


pipeline = Pipeline(config=config)

# pipeline.add_task(
# name="prepare_gnomad_v4_variants",
# task_function=prepare_gnomad_v4_variants,
# output_path=f"{output_sub_dir}/gnomad_v4_variants_base.ht",
# inputs={
# "exome_variants_path": "gs://gcp-public-data--gnomad/release/4.1/ht/exomes/gnomad.exomes.v4.1.sites.ht",
# "genome_variants_path": "gs://gcp-public-data--gnomad/release/4.1/ht/genomes/gnomad.genomes.v4.1.sites.ht",
# "variants_joint_frequency_path": "gs://gcp-public-data--gnomad/release/4.1/ht/joint/gnomad.joint.v4.1.sites.ht",
# },
# )
#
# pipeline.add_task(
# name="annotate_gnomad_v4_variants",
# task_function=annotate_variants,
# output_path=f"{output_sub_dir}/gnomad_v4_variants_annotated_1.ht",
# inputs=(
# {
# "variants_path": pipeline.get_task("prepare_gnomad_v4_variants"),
# "exome_coverage_path": "gs://gcp-public-data--gnomad/release/4.0/coverage/exomes/gnomad.exomes.v4.0.coverage.ht",
# "genome_coverage_path": "gs://gcp-public-data--gnomad/release/3.0.1/coverage/genomes/gnomad.genomes.r3.0.1.coverage.ht",
# }
# ),
# )
#
# pipeline.add_task(
# name="annotate_gnomad_v4_transcript_consequences",
# task_function=annotate_transcript_consequences,
# output_path=f"{output_sub_dir}/gnomad_v4_variants_annotated_2.ht",
# inputs={
# "variants_path": pipeline.get_task("annotate_gnomad_v4_variants"),
# "transcripts_path": genes_pipeline.get_output("base_transcripts_grch38"),
# "mane_transcripts_path": genes_pipeline.get_output("mane_select_transcripts"),
# },
# )
#
# pipeline.add_task(
# name="annotate_gnomad_v4_caids",
# task_function=annotate_caids,
# output_path=f"{output_sub_dir}/gnomad_v4_variants_annotated_3.ht",
# inputs={
# "variants_path": pipeline.get_task("annotate_gnomad_v4_transcript_consequences"),
# "caids_path": "gs://gnomad-browser-data-pipeline/caids/gnomad_v4_caids.ht",
# },
# )
pipeline.add_task(
name="prepare_gnomad_v4_variants",
task_function=prepare_gnomad_v4_variants,
output_path=f"{output_sub_dir}/gnomad_v4_variants_base.ht",
inputs={
"exome_variants_path": "gs://gcp-public-data--gnomad/release/4.1/ht/exomes/gnomad.exomes.v4.1.sites.ht",
"genome_variants_path": "gs://gcp-public-data--gnomad/release/4.1/ht/genomes/gnomad.genomes.v4.1.sites.ht",
"variants_joint_frequency_path": "gs://gcp-public-data--gnomad/release/4.1/ht/joint/gnomad.joint.v4.1.sites.ht",
},
)

pipeline.add_task(
name="annotate_gnomad_v4_variants",
task_function=annotate_variants,
output_path=f"{output_sub_dir}/gnomad_v4_variants_annotated_1.ht",
inputs=(
{
"variants_path": pipeline.get_task("prepare_gnomad_v4_variants"),
"exome_coverage_path": "gs://gcp-public-data--gnomad/release/4.0/coverage/exomes/gnomad.exomes.v4.0.coverage.ht",
"genome_coverage_path": "gs://gcp-public-data--gnomad/release/3.0.1/coverage/genomes/gnomad.genomes.r3.0.1.coverage.ht",
}
),
)

pipeline.add_task(
name="annotate_gnomad_v4_transcript_consequences",
task_function=annotate_transcript_consequences,
output_path=f"{output_sub_dir}/gnomad_v4_variants_annotated_2.ht",
inputs={
"variants_path": pipeline.get_task("annotate_gnomad_v4_variants"),
"transcripts_path": genes_pipeline.get_output("base_transcripts_grch38"),
"mane_transcripts_path": genes_pipeline.get_output("mane_select_transcripts"),
},
)

pipeline.add_task(
name="annotate_gnomad_v4_caids",
task_function=annotate_caids,
output_path=f"{output_sub_dir}/gnomad_v4_variants_annotated_3.ht",
inputs={
"variants_path": pipeline.get_task("annotate_gnomad_v4_transcript_consequences"),
"caids_path": "gs://gnomad-browser-data-pipeline/caids/gnomad_v4_caids.ht",
},
)

pipeline.add_task(
name="annotate_vrs_ids",
Expand All @@ -118,27 +116,27 @@
if RUN:
run_pipeline(pipeline)

# write_schemas(
# [pipeline],
# os.path.expanduser("~/schemas"),
# task_names=[
# "prepare_gnomad_v4_variants",
# "annotate_gnomad_v4_variants",
# "annotate_gnomad_v4_transcript_consequences",
# "annotate_gnomad_v4_caids",
# "annotate_vrs_ids",
# ],
# )
# # copy locally using:
# # gcloud compute scp dp-m:~/schemas . --tunnel-through-iap --recurse
#
# logger.info("Validating pipeline IO formats")
#
# validate_exome_globals_input(pipeline)
# validate_genome_globals_input(pipeline)
# validate_exome_variant_input(pipeline)
# validate_genome_variant_input(pipeline)
# validate_step1_output(pipeline)
# validate_step2_output(pipeline)
# validate_step3_output(pipeline)
# validate_step4_output(pipeline)
write_schemas(
[pipeline],
os.path.expanduser("~/schemas"),
task_names=[
"prepare_gnomad_v4_variants",
"annotate_gnomad_v4_variants",
"annotate_gnomad_v4_transcript_consequences",
"annotate_gnomad_v4_caids",
"annotate_vrs_ids",
],
)
# copy locally using:
# gcloud compute scp dp-m:~/schemas . --tunnel-through-iap --recurse

logger.info("Validating pipeline IO formats")

validate_exome_globals_input(pipeline)
validate_genome_globals_input(pipeline)
validate_exome_variant_input(pipeline)
validate_genome_variant_input(pipeline)
validate_step1_output(pipeline)
validate_step2_output(pipeline)
validate_step3_output(pipeline)
validate_step4_output(pipeline)

0 comments on commit 2c91e86

Please sign in to comment.