From 2c91e86c0aa5a0e279e3a43391babce68dfcba42 Mon Sep 17 00:00:00 2001 From: Phil Darnowsky Date: Wed, 5 Jun 2024 14:20:45 -0400 Subject: [PATCH] fixup: uncomment commented-out pipeline code --- .../data_types/variant/annotate_variants.py | 1 - .../pipelines/export_to_elasticsearch.py | 6 +- .../pipelines/gnomad_v4_variants.py | 150 +++++++++--------- 3 files changed, 75 insertions(+), 82 deletions(-) diff --git a/data-pipeline/src/data_pipeline/data_types/variant/annotate_variants.py b/data-pipeline/src/data_pipeline/data_types/variant/annotate_variants.py index 04e1ebd1a..658c5d222 100644 --- a/data-pipeline/src/data_pipeline/data_types/variant/annotate_variants.py +++ b/data-pipeline/src/data_pipeline/data_types/variant/annotate_variants.py @@ -34,5 +34,4 @@ def annotate_vrs_ids(variants_path, exome_variants_path, genome_variants_path): vrs = exome_vrs.union(genome_vrs) vrs = vrs.group_by(vrs.locus, vrs.alleles).aggregate(vrs=hl.agg.collect(vrs.vrs)) ds = ds.join(vrs) - ds.describe() return ds diff --git a/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py b/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py index 4627a581d..709a788a0 100644 --- a/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py +++ b/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py @@ -120,11 +120,7 @@ def add_liftover_document_id(ds): ############################################################################################################## "gnomad_v4_variants": { "get_table": lambda: subset_table( - add_variant_document_id( - hl.read_table( - "gs://gnomad-browser-data-pipeline/phil-scratch/output/gnomad_v4/gnomad_v4_variants_annotated_4.ht" - ) - ) + add_variant_document_id(hl.read_table(gnomad_v4_variants_pipeline.get_output("variants").get_output_path())) ), "args": { "index": "gnomad_v4_variants", diff --git a/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py b/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py index 8a0945195..adae4fbc3 100644 --- a/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py +++ b/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py @@ -11,7 +11,7 @@ ) -# from data_pipeline.pipelines.genes import pipeline as genes_pipeline +from data_pipeline.pipelines.genes import pipeline as genes_pipeline from data_pipeline.datasets.gnomad_v4.gnomad_v4_validation import ( validate_exome_globals_input, @@ -25,9 +25,9 @@ ) from data_pipeline.data_types.variant import ( - # annotate_variants, - # annotate_transcript_consequences, - # annotate_caids, + annotate_variants, + annotate_transcript_consequences, + annotate_caids, annotate_vrs_ids, ) @@ -39,59 +39,57 @@ config = PipelineConfig( name=pipeline_name, - # input_root="gs://gnomad-v4-data-pipeline/input", - # output_root="gs://gnomad-v4-data-pipeline/output", - input_root="gs://gnomad-browser-data-pipeline/phil-scratch/input", - output_root="gs://gnomad-browser-data-pipeline/phil-scratch/output", + input_root="gs://gnomad-v4-data-pipeline/input", + output_root="gs://gnomad-v4-data-pipeline/output", ) pipeline = Pipeline(config=config) -# pipeline.add_task( -# name="prepare_gnomad_v4_variants", -# task_function=prepare_gnomad_v4_variants, -# output_path=f"{output_sub_dir}/gnomad_v4_variants_base.ht", -# inputs={ -# "exome_variants_path": "gs://gcp-public-data--gnomad/release/4.1/ht/exomes/gnomad.exomes.v4.1.sites.ht", -# "genome_variants_path": "gs://gcp-public-data--gnomad/release/4.1/ht/genomes/gnomad.genomes.v4.1.sites.ht", -# "variants_joint_frequency_path": "gs://gcp-public-data--gnomad/release/4.1/ht/joint/gnomad.joint.v4.1.sites.ht", -# }, -# ) -# -# pipeline.add_task( -# name="annotate_gnomad_v4_variants", -# task_function=annotate_variants, -# output_path=f"{output_sub_dir}/gnomad_v4_variants_annotated_1.ht", -# inputs=( -# { -# "variants_path": pipeline.get_task("prepare_gnomad_v4_variants"), -# "exome_coverage_path": "gs://gcp-public-data--gnomad/release/4.0/coverage/exomes/gnomad.exomes.v4.0.coverage.ht", -# "genome_coverage_path": "gs://gcp-public-data--gnomad/release/3.0.1/coverage/genomes/gnomad.genomes.r3.0.1.coverage.ht", -# } -# ), -# ) -# -# pipeline.add_task( -# name="annotate_gnomad_v4_transcript_consequences", -# task_function=annotate_transcript_consequences, -# output_path=f"{output_sub_dir}/gnomad_v4_variants_annotated_2.ht", -# inputs={ -# "variants_path": pipeline.get_task("annotate_gnomad_v4_variants"), -# "transcripts_path": genes_pipeline.get_output("base_transcripts_grch38"), -# "mane_transcripts_path": genes_pipeline.get_output("mane_select_transcripts"), -# }, -# ) -# -# pipeline.add_task( -# name="annotate_gnomad_v4_caids", -# task_function=annotate_caids, -# output_path=f"{output_sub_dir}/gnomad_v4_variants_annotated_3.ht", -# inputs={ -# "variants_path": pipeline.get_task("annotate_gnomad_v4_transcript_consequences"), -# "caids_path": "gs://gnomad-browser-data-pipeline/caids/gnomad_v4_caids.ht", -# }, -# ) +pipeline.add_task( + name="prepare_gnomad_v4_variants", + task_function=prepare_gnomad_v4_variants, + output_path=f"{output_sub_dir}/gnomad_v4_variants_base.ht", + inputs={ + "exome_variants_path": "gs://gcp-public-data--gnomad/release/4.1/ht/exomes/gnomad.exomes.v4.1.sites.ht", + "genome_variants_path": "gs://gcp-public-data--gnomad/release/4.1/ht/genomes/gnomad.genomes.v4.1.sites.ht", + "variants_joint_frequency_path": "gs://gcp-public-data--gnomad/release/4.1/ht/joint/gnomad.joint.v4.1.sites.ht", + }, +) + +pipeline.add_task( + name="annotate_gnomad_v4_variants", + task_function=annotate_variants, + output_path=f"{output_sub_dir}/gnomad_v4_variants_annotated_1.ht", + inputs=( + { + "variants_path": pipeline.get_task("prepare_gnomad_v4_variants"), + "exome_coverage_path": "gs://gcp-public-data--gnomad/release/4.0/coverage/exomes/gnomad.exomes.v4.0.coverage.ht", + "genome_coverage_path": "gs://gcp-public-data--gnomad/release/3.0.1/coverage/genomes/gnomad.genomes.r3.0.1.coverage.ht", + } + ), +) + +pipeline.add_task( + name="annotate_gnomad_v4_transcript_consequences", + task_function=annotate_transcript_consequences, + output_path=f"{output_sub_dir}/gnomad_v4_variants_annotated_2.ht", + inputs={ + "variants_path": pipeline.get_task("annotate_gnomad_v4_variants"), + "transcripts_path": genes_pipeline.get_output("base_transcripts_grch38"), + "mane_transcripts_path": genes_pipeline.get_output("mane_select_transcripts"), + }, +) + +pipeline.add_task( + name="annotate_gnomad_v4_caids", + task_function=annotate_caids, + output_path=f"{output_sub_dir}/gnomad_v4_variants_annotated_3.ht", + inputs={ + "variants_path": pipeline.get_task("annotate_gnomad_v4_transcript_consequences"), + "caids_path": "gs://gnomad-browser-data-pipeline/caids/gnomad_v4_caids.ht", + }, +) pipeline.add_task( name="annotate_vrs_ids", @@ -118,27 +116,27 @@ if RUN: run_pipeline(pipeline) -# write_schemas( -# [pipeline], -# os.path.expanduser("~/schemas"), -# task_names=[ -# "prepare_gnomad_v4_variants", -# "annotate_gnomad_v4_variants", -# "annotate_gnomad_v4_transcript_consequences", -# "annotate_gnomad_v4_caids", -# "annotate_vrs_ids", -# ], -# ) -# # copy locally using: -# # gcloud compute scp dp-m:~/schemas . --tunnel-through-iap --recurse -# -# logger.info("Validating pipeline IO formats") -# -# validate_exome_globals_input(pipeline) -# validate_genome_globals_input(pipeline) -# validate_exome_variant_input(pipeline) -# validate_genome_variant_input(pipeline) -# validate_step1_output(pipeline) -# validate_step2_output(pipeline) -# validate_step3_output(pipeline) -# validate_step4_output(pipeline) + write_schemas( + [pipeline], + os.path.expanduser("~/schemas"), + task_names=[ + "prepare_gnomad_v4_variants", + "annotate_gnomad_v4_variants", + "annotate_gnomad_v4_transcript_consequences", + "annotate_gnomad_v4_caids", + "annotate_vrs_ids", + ], + ) + # copy locally using: + # gcloud compute scp dp-m:~/schemas . --tunnel-through-iap --recurse + + logger.info("Validating pipeline IO formats") + + validate_exome_globals_input(pipeline) + validate_genome_globals_input(pipeline) + validate_exome_variant_input(pipeline) + validate_genome_variant_input(pipeline) + validate_step1_output(pipeline) + validate_step2_output(pipeline) + validate_step3_output(pipeline) + validate_step4_output(pipeline)