diff --git a/NEWS b/NEWS index d1730ee19..7d03c1eff 100644 --- a/NEWS +++ b/NEWS @@ -128,6 +128,11 @@ Changes affecting specific commands: - New `-H, --print-header` option to print the header with `-f` + - Work around a bug in the LOFTEE VEP plugin used to annotate gnomAD VCFs. There the + LoF_info subfield contains commas which, in general, makes it impossible to parse the + VEP subfields. The +split-vep plugin can now work with such files, replacing the offending + commas with slash (/) characters. See also https://github.com/Ensembl/ensembl-vep/issues/1351 + * bcftools stats - The per-sample stats (PSC) would not be computed when `-i/-e` filtering options and diff --git a/plugins/split-vep.c b/plugins/split-vep.c index a0517a5c8..387ebc8ee 100644 --- a/plugins/split-vep.c +++ b/plugins/split-vep.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2019-2022 Genome Research Ltd. + Copyright (c) 2019-2023 Genome Research Ltd. Author: Petr Danecek @@ -1095,6 +1095,55 @@ static void restrict_csqs_to_genes(args_t *args) } args->ncols_csq = nhit; } + +// Split the VEP annotation by transcript and by field, then check if the number of subfields looks alright. +// Unfortunately, we cannot enforce the number of subfields to match the header definition because that can +// be variable: `bcftools csq` outputs different number of fields for different consequence types. +// So we need to distinguish between this reasonable case and incorrectly formated consequences such +// as those reported for LoF_info subfield here https://github.com/Ensembl/ensembl-vep/issues/1351. +static void split_csq_fields(args_t *args, bcf1_t *rec, int csq_str_len) +{ + int i; + args->cols_tr = cols_split(args->csq_str, args->cols_tr, ','); + if ( args->cols_tr->n > args->mcols_csq ) + { + args->cols_csq = (cols_t**)realloc(args->cols_csq,args->cols_tr->n*sizeof(*args->cols_csq)); + for (i=args->mcols_csq; icols_tr->n; i++) args->cols_csq[i] = NULL; + args->mcols_csq = args->cols_tr->n; + } + args->ncols_csq = args->cols_tr->n; + int nfield_diff = 0, need_fix = 0; + for (i=0; icols_tr->n; i++) + { + args->cols_csq[i] = cols_split(args->cols_tr->off[i], args->cols_csq[i], '|'); + if ( args->csq_idx >= args->cols_csq[i]->n ) need_fix = 1; + if ( nfield_diff < abs(args->cols_csq[i]->n - args->nfield) ) nfield_diff = args->cols_csq[i]->n - args->nfield; + } + if ( !csq_str_len ) return; // called 2nd time, don't attempt to fix + if ( !need_fix ) return; + + static int warned = 0; + if ( !warned ) + fprintf(stderr,"Warning: The number of INFO/%s subfields at %s:%"PRIhts_pos" does not match the header definition,\n" + " expected %d subfields, found as %s as %d. (This warning is printed only once.)\n", + args->vep_tag,bcf_seqname(args->hdr,rec),rec->pos+1,args->nfield,nfield_diff>0?"much":"few",args->nfield+nfield_diff); + warned = 1; + + // One known failure mode is LoF_info subfield which can contain commas. Work around this by relying on + // the number of pipe delimiters matching the header definition, replacing offending commas with slash + // characters. This assumes that there is never a comma in the first subfield, otherwise it would be + // impossible to distinguish between A|A,A,B,B|B and A|A,A,A,B|B + int npipe = 0; + while ( csq_str_len > 0 ) + { + csq_str_len--; + if ( args->csq_str[csq_str_len]=='|' ) { npipe++; continue; } + if ( args->csq_str[csq_str_len]!=',' ) continue; + if ( npipe && !(npipe % (args->nfield-1)) ) { npipe = 0; continue; } // wholesome number of pipes encountered, this is a valid comma + args->csq_str[csq_str_len] = '/'; + } + split_csq_fields(args,rec,0); // try once more +} static void process_record(args_t *args, bcf1_t *rec) { int i,len = bcf_get_info_string(args->hdr,rec,args->vep_tag,&args->csq_str,&args->ncsq_str); @@ -1108,17 +1157,7 @@ static void process_record(args_t *args, bcf1_t *rec) return; } - // split by transcript and by field - args->cols_tr = cols_split(args->csq_str, args->cols_tr, ','); - if ( args->cols_tr->n > args->mcols_csq ) - { - args->cols_csq = (cols_t**)realloc(args->cols_csq,args->cols_tr->n*sizeof(*args->cols_csq)); - for (i=args->mcols_csq; icols_tr->n; i++) args->cols_csq[i] = NULL; - args->mcols_csq = args->cols_tr->n; - } - args->ncols_csq = args->cols_tr->n; - for (i=0; icols_tr->n; i++) - args->cols_csq[i] = cols_split(args->cols_tr->off[i], args->cols_csq[i], '|'); + split_csq_fields(args,rec,len); // restrict to -g genes if ( args->genes ) restrict_csqs_to_genes(args); diff --git a/test/split-vep.broken-LoF.out b/test/split-vep.broken-LoF.out new file mode 100644 index 000000000..04bb3f481 --- /dev/null +++ b/test/split-vep.broken-LoF.out @@ -0,0 +1,18 @@ +chr21:5032064 missense_variant . +chr21:5032064 missense_variant . +chr21:5032064 missense_variant . +chr21:5032064 3_prime_UTR_variant&NMD_transcript_variant . +chr21:5032064 missense_variant . +chr21:5032064 missense_variant . +chr21:5032064 missense_variant . +chr21:5032064 missense_variant . +chr21:5032064 missense_variant . +chr21:5032064 frameshift_variant PERCENTILE:0.773118279569892/GERP_DIST:-366.377766615897/BP_DIST:218/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/ANN_ORF:-698.745/MAX_ORF:-698.745 +chr21:5032064 frameshift_variant PERCENTILE:0.635578583765112/GERP_DIST:-366.377766615897/BP_DIST:218/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/ANN_ORF:-698.745/MAX_ORF:-698.745 +chr21:5032064 frameshift_variant PERCENTILE:0.659498207885305/GERP_DIST:-372.525567065179/BP_DIST:197/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/ANN_ORF:-698.745/MAX_ORF:-698.745 +chr21:5032064 3_prime_UTR_variant&NMD_transcript_variant . +chr21:5032064 frameshift_variant PERCENTILE:0.790979097909791/GERP_DIST:-372.525567065179/BP_DIST:197/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/ANN_ORF:-698.745/MAX_ORF:-698.745 +chr21:5032064 frameshift_variant PERCENTILE:0.790979097909791/GERP_DIST:-372.525567065179/BP_DIST:197/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/PHYLOCSF_TOO_SHORT +chr21:5032064 frameshift_variant PERCENTILE:0.463571889103804/GERP_DIST:-1141.14512844086/BP_DIST:840/DIST_FROM_LAST_EXON:152/50_BP_RULE:PASS/PHYLOCSF_TOO_SHORT +chr21:5032064 frameshift_variant PERCENTILE:0.662062615101289/GERP_DIST:-354.294564935565/BP_DIST:374/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/PHYLOCSF_TOO_SHORT +chr21:5032064 frameshift_variant PERCENTILE:0.785792349726776/GERP_DIST:-391.862466733158/BP_DIST:203/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/PHYLOCSF_TOO_SHORT diff --git a/test/split-vep.broken-LoF.vcf b/test/split-vep.broken-LoF.vcf new file mode 100644 index 000000000..8ab605a95 --- /dev/null +++ b/test/split-vep.broken-LoF.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.2 +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr21 5032064 . G A . . vep=A|missense_variant|MODERATE|FP565260.3|ENSG00000277117|Transcript|ENST00000612610|protein_coding|5/7||ENST00000612610.4:c.709G>A|ENSP00000483732.1:p.Gly237Arg|896|709|237|G/R|Gga/Aga|1||1|SNV||Clone_based_ensembl_gene|||1|A2||ENSP00000483732|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|||||||||,A|missense_variant|MODERATE|FP565260.3|ENSG00000277117|Transcript|ENST00000620481|protein_coding|4/6||ENST00000620481.4:c.358G>A|ENSP00000484302.1:p.Gly120Arg|545|358|120|G/R|Gga/Aga|1||1|SNV||Clone_based_ensembl_gene|||5|||ENSP00000484302|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|||||||||,A|missense_variant|MODERATE|FP565260.3|ENSG00000277117|Transcript|ENST00000623795|protein_coding|4/6||ENST00000623795.1:c.358G>A|ENSP00000485649.1:p.Gly120Arg|505|358|120|G/R|Gga/Aga|1||1|SNV||Clone_based_ensembl_gene|||2|||ENSP00000485649|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|||||||||,A|3_prime_UTR_variant&NMD_transcript_variant|MODIFIER|FP565260.3|ENSG00000277117|Transcript|ENST00000623903|nonsense_mediated_decay|5/7||ENST00000623903.3:c.*323G>A||706|||||1||1|SNV||Clone_based_ensembl_gene|||2|||ENSP00000485557||||||||||||||||,A|missense_variant|MODERATE|FP565260.3|ENSG00000277117|Transcript|ENST00000623960|protein_coding|5/7||ENST00000623960.4:c.709G>A|ENSP00000485129.1:p.Gly237Arg|858|709|237|G/R|Gga/Aga|1||1|SNV||Clone_based_ensembl_gene||YES|1|P2|CCDS86973.1|ENSP00000485129|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|||||||||,A|missense_variant|MODERATE|LOC102723996|102723996|Transcript|NM_001363770.2|protein_coding|5/7||NM_001363770.2:c.709G>A|NP_001350699.1:p.Gly237Arg|858|709|237|G/R|Gga/Aga|1||1|SNV||EntrezGene||YES||||NP_001350699.1||||||||||||||||,A|missense_variant|MODERATE|LOC102723996|102723996|Transcript|XM_006723899.2|protein_coding|5/6||XM_006723899.2:c.709G>A|XP_006723962.1:p.Gly237Arg|1345|709|237|G/R|Gga/Aga|1||1|SNV||EntrezGene||||||XP_006723962.1||||||||||||||||,A|missense_variant|MODERATE|LOC102723996|102723996|Transcript|XM_011546078.2|protein_coding|5/7||XM_011546078.2:c.709G>A|XP_011544380.1:p.Gly237Arg|1345|709|237|G/R|Gga/Aga|1||1|SNV||EntrezGene||||||XP_011544380.1||||||||||||||||,A|missense_variant|MODERATE|LOC102723996|102723996|Transcript|XM_011546079.1|protein_coding|5/7||XM_011546079.1:c.709G>A|XP_011544381.1:p.Gly237Arg|1345|709|237|G/R|Gga/Aga|1||1|SNV||EntrezGene||||||XP_011544381.1|||||||||||||||| +chr21 5032064 . G GGA . . vep=GA|frameshift_variant|HIGH|FP565260.3|ENSG00000277117|Transcript|ENST00000612610|protein_coding|5/7||ENST00000612610.4:c.718_719dup|ENSP00000483732.1:p.Asp240GlufsTer35|896-897|709-710|237|G/GX|gga/gGAga|1||1|insertion||Clone_based_ensembl_gene|||1|A2||ENSP00000483732|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|10|||||HC||PHYLOCSF_WEAK|PERCENTILE:0.773118279569892,GERP_DIST:-366.377766615897,BP_DIST:218,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,ANN_ORF:-698.745,MAX_ORF:-698.745,GA|frameshift_variant|HIGH|FP565260.3|ENSG00000277117|Transcript|ENST00000620481|protein_coding|4/6||ENST00000620481.4:c.367_368dup|ENSP00000484302.1:p.Asp123GlufsTer35|545-546|358-359|120|G/GX|gga/gGAga|1||1|insertion||Clone_based_ensembl_gene|||5|||ENSP00000484302|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|10|||||HC||PHYLOCSF_WEAK|PERCENTILE:0.635578583765112,GERP_DIST:-366.377766615897,BP_DIST:218,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,ANN_ORF:-698.745,MAX_ORF:-698.745,GA|frameshift_variant|HIGH|FP565260.3|ENSG00000277117|Transcript|ENST00000623795|protein_coding|4/6||ENST00000623795.1:c.367_368dup|ENSP00000485649.1:p.Asp123GlufsTer35|505-506|358-359|120|G/GX|gga/gGAga|1||1|insertion||Clone_based_ensembl_gene|||2|||ENSP00000485649|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|10|||||HC||PHYLOCSF_WEAK|PERCENTILE:0.659498207885305,GERP_DIST:-372.525567065179,BP_DIST:197,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,ANN_ORF:-698.745,MAX_ORF:-698.745,GA|3_prime_UTR_variant&NMD_transcript_variant|MODIFIER|FP565260.3|ENSG00000277117|Transcript|ENST00000623903|nonsense_mediated_decay|5/7||ENST00000623903.3:c.*332_*333dup||706-707|||||1||1|insertion||Clone_based_ensembl_gene|||2|||ENSP00000485557||||||||||||||||,GA|frameshift_variant|HIGH|FP565260.3|ENSG00000277117|Transcript|ENST00000623960|protein_coding|5/7||ENST00000623960.4:c.718_719dup|ENSP00000485129.1:p.Asp240GlufsTer35|858-859|709-710|237|G/GX|gga/gGAga|1||1|insertion||Clone_based_ensembl_gene||YES|1|P2|CCDS86973.1|ENSP00000485129|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|10|||||HC||PHYLOCSF_WEAK|PERCENTILE:0.790979097909791,GERP_DIST:-372.525567065179,BP_DIST:197,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,ANN_ORF:-698.745,MAX_ORF:-698.745,GA|frameshift_variant|HIGH|LOC102723996|102723996|Transcript|NM_001363770.2|protein_coding|5/7||NM_001363770.2:c.718_719dup|NP_001350699.1:p.Asp240GlufsTer35|858-859|709-710|237|G/GX|gga/gGAga|1||1|insertion||EntrezGene||YES||||NP_001350699.1||||||||10|||||HC|||PERCENTILE:0.790979097909791,GERP_DIST:-372.525567065179,BP_DIST:197,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,PHYLOCSF_TOO_SHORT,GA|frameshift_variant|HIGH|LOC102723996|102723996|Transcript|XM_006723899.2|protein_coding|5/6||XM_006723899.2:c.718_719dup|XP_006723962.1:p.Asp240GlufsTer35|1345-1346|709-710|237|G/GX|gga/gGAga|1||1|insertion||EntrezGene||||||XP_006723962.1||||||||10|||||HC|||PERCENTILE:0.463571889103804,GERP_DIST:-1141.14512844086,BP_DIST:840,DIST_FROM_LAST_EXON:152,50_BP_RULE:PASS,PHYLOCSF_TOO_SHORT,GA|frameshift_variant|HIGH|LOC102723996|102723996|Transcript|XM_011546078.2|protein_coding|5/7||XM_011546078.2:c.718_719dup|XP_011544380.1:p.Asp240GlufsTer35|1345-1346|709-710|237|G/GX|gga/gGAga|1||1|insertion||EntrezGene||||||XP_011544380.1||||||||10|||||HC|||PERCENTILE:0.662062615101289,GERP_DIST:-354.294564935565,BP_DIST:374,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,PHYLOCSF_TOO_SHORT,GA|frameshift_variant|HIGH|LOC102723996|102723996|Transcript|XM_011546079.1|protein_coding|5/7||XM_011546079.1:c.718_719dup|XP_011544381.1:p.Asp240GlufsTer35|1345-1346|709-710|237|G/GX|gga/gGAga|1||1|insertion||EntrezGene||||||XP_011544381.1||||||||10|||||HC|||PERCENTILE:0.785792349726776,GERP_DIST:-391.862466733158,BP_DIST:203,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,PHYLOCSF_TOO_SHORT diff --git a/test/test.pl b/test/test.pl index 4ef961c1a..32e4563b5 100755 --- a/test/test.pl +++ b/test/test.pl @@ -659,6 +659,7 @@ run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.1.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n']); run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.2.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n' -g {PATH}/split-vep.gene-list.txt]); run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.3.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n' -g +{PATH}/split-vep.gene-list.txt]); +run_test(\&test_vcf_plugin,$opts,in=>'split-vep.broken-LoF',out=>'split-vep.broken-LoF.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Consequence %LoF_info\\n' -a vep]); run_test(\&test_vcf_plugin,$opts,in=>'parental-origin',out=>'parental-origin.1.out',cmd=>'+parental-origin',args=>qq[-r 20:100 -p proband,father,mother -t del | grep -v ^#]); run_test(\&test_vcf_plugin,$opts,in=>'parental-origin',out=>'parental-origin.2.out',cmd=>'+parental-origin',args=>qq[-r 20:101 -p proband,father,mother -t del | grep -v ^#]); run_test(\&test_vcf_plugin,$opts,in=>'parental-origin',out=>'parental-origin.3.out',cmd=>'+parental-origin',args=>qq[-r 20:102 -p proband,father,mother -t del | grep -v ^#]);