Skip to content

Commit

Permalink
Work around a bug in the LOFTEE VEP plugin used to annotate gnomAD VCFs
Browse files Browse the repository at this point in the history
The LoF_info subfield contains commas which, in general, makes it impossible to parse the
VEP subfields in automated way. The +split-vep plugin can now work with such files, replacing the offending
commas with slash (/) characters.

Note that this makes two assumptions:
1) the number of subfields delimited by the pipe characters (|) are consistent with the header definition
2) the first subfield never contains a comma, otherwise it woud be impossible to distinguish between
   A|A,A,B,B|B and A|A,A,A,B|B

See also Ensembl/ensembl-vep#1351
  • Loading branch information
pd3 committed Feb 10, 2023
1 parent f4fac3c commit f0ad6aa
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 12 deletions.
5 changes: 5 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,11 @@ Changes affecting specific commands:

- New `-H, --print-header` option to print the header with `-f`

- Work around a bug in the LOFTEE VEP plugin used to annotate gnomAD VCFs. There the
LoF_info subfield contains commas which, in general, makes it impossible to parse the
VEP subfields. The +split-vep plugin can now work with such files, replacing the offending
commas with slash (/) characters. See also https://github.com/Ensembl/ensembl-vep/issues/1351

* bcftools stats

- The per-sample stats (PSC) would not be computed when `-i/-e` filtering options and
Expand Down
63 changes: 51 additions & 12 deletions plugins/split-vep.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* The MIT License
Copyright (c) 2019-2022 Genome Research Ltd.
Copyright (c) 2019-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
Expand Down Expand Up @@ -1095,6 +1095,55 @@ static void restrict_csqs_to_genes(args_t *args)
}
args->ncols_csq = nhit;
}

// Split the VEP annotation by transcript and by field, then check if the number of subfields looks alright.
// Unfortunately, we cannot enforce the number of subfields to match the header definition because that can
// be variable: `bcftools csq` outputs different number of fields for different consequence types.
// So we need to distinguish between this reasonable case and incorrectly formated consequences such
// as those reported for LoF_info subfield here https://github.com/Ensembl/ensembl-vep/issues/1351.
static void split_csq_fields(args_t *args, bcf1_t *rec, int csq_str_len)
{
int i;
args->cols_tr = cols_split(args->csq_str, args->cols_tr, ',');
if ( args->cols_tr->n > args->mcols_csq )
{
args->cols_csq = (cols_t**)realloc(args->cols_csq,args->cols_tr->n*sizeof(*args->cols_csq));
for (i=args->mcols_csq; i<args->cols_tr->n; i++) args->cols_csq[i] = NULL;
args->mcols_csq = args->cols_tr->n;
}
args->ncols_csq = args->cols_tr->n;
int nfield_diff = 0, need_fix = 0;
for (i=0; i<args->cols_tr->n; i++)
{
args->cols_csq[i] = cols_split(args->cols_tr->off[i], args->cols_csq[i], '|');
if ( args->csq_idx >= args->cols_csq[i]->n ) need_fix = 1;
if ( nfield_diff < abs(args->cols_csq[i]->n - args->nfield) ) nfield_diff = args->cols_csq[i]->n - args->nfield;
}
if ( !csq_str_len ) return; // called 2nd time, don't attempt to fix
if ( !need_fix ) return;

static int warned = 0;
if ( !warned )
fprintf(stderr,"Warning: The number of INFO/%s subfields at %s:%"PRIhts_pos" does not match the header definition,\n"
" expected %d subfields, found as %s as %d. (This warning is printed only once.)\n",
args->vep_tag,bcf_seqname(args->hdr,rec),rec->pos+1,args->nfield,nfield_diff>0?"much":"few",args->nfield+nfield_diff);
warned = 1;

// One known failure mode is LoF_info subfield which can contain commas. Work around this by relying on
// the number of pipe delimiters matching the header definition, replacing offending commas with slash
// characters. This assumes that there is never a comma in the first subfield, otherwise it would be
// impossible to distinguish between A|A,A,B,B|B and A|A,A,A,B|B
int npipe = 0;
while ( csq_str_len > 0 )
{
csq_str_len--;
if ( args->csq_str[csq_str_len]=='|' ) { npipe++; continue; }
if ( args->csq_str[csq_str_len]!=',' ) continue;
if ( npipe && !(npipe % (args->nfield-1)) ) { npipe = 0; continue; } // wholesome number of pipes encountered, this is a valid comma
args->csq_str[csq_str_len] = '/';
}
split_csq_fields(args,rec,0); // try once more
}
static void process_record(args_t *args, bcf1_t *rec)
{
int i,len = bcf_get_info_string(args->hdr,rec,args->vep_tag,&args->csq_str,&args->ncsq_str);
Expand All @@ -1108,17 +1157,7 @@ static void process_record(args_t *args, bcf1_t *rec)
return;
}

// split by transcript and by field
args->cols_tr = cols_split(args->csq_str, args->cols_tr, ',');
if ( args->cols_tr->n > args->mcols_csq )
{
args->cols_csq = (cols_t**)realloc(args->cols_csq,args->cols_tr->n*sizeof(*args->cols_csq));
for (i=args->mcols_csq; i<args->cols_tr->n; i++) args->cols_csq[i] = NULL;
args->mcols_csq = args->cols_tr->n;
}
args->ncols_csq = args->cols_tr->n;
for (i=0; i<args->cols_tr->n; i++)
args->cols_csq[i] = cols_split(args->cols_tr->off[i], args->cols_csq[i], '|');
split_csq_fields(args,rec,len);

// restrict to -g genes
if ( args->genes ) restrict_csqs_to_genes(args);
Expand Down
18 changes: 18 additions & 0 deletions test/split-vep.broken-LoF.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
chr21:5032064 missense_variant .
chr21:5032064 missense_variant .
chr21:5032064 missense_variant .
chr21:5032064 3_prime_UTR_variant&NMD_transcript_variant .
chr21:5032064 missense_variant .
chr21:5032064 missense_variant .
chr21:5032064 missense_variant .
chr21:5032064 missense_variant .
chr21:5032064 missense_variant .
chr21:5032064 frameshift_variant PERCENTILE:0.773118279569892/GERP_DIST:-366.377766615897/BP_DIST:218/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/ANN_ORF:-698.745/MAX_ORF:-698.745
chr21:5032064 frameshift_variant PERCENTILE:0.635578583765112/GERP_DIST:-366.377766615897/BP_DIST:218/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/ANN_ORF:-698.745/MAX_ORF:-698.745
chr21:5032064 frameshift_variant PERCENTILE:0.659498207885305/GERP_DIST:-372.525567065179/BP_DIST:197/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/ANN_ORF:-698.745/MAX_ORF:-698.745
chr21:5032064 3_prime_UTR_variant&NMD_transcript_variant .
chr21:5032064 frameshift_variant PERCENTILE:0.790979097909791/GERP_DIST:-372.525567065179/BP_DIST:197/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/ANN_ORF:-698.745/MAX_ORF:-698.745
chr21:5032064 frameshift_variant PERCENTILE:0.790979097909791/GERP_DIST:-372.525567065179/BP_DIST:197/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/PHYLOCSF_TOO_SHORT
chr21:5032064 frameshift_variant PERCENTILE:0.463571889103804/GERP_DIST:-1141.14512844086/BP_DIST:840/DIST_FROM_LAST_EXON:152/50_BP_RULE:PASS/PHYLOCSF_TOO_SHORT
chr21:5032064 frameshift_variant PERCENTILE:0.662062615101289/GERP_DIST:-354.294564935565/BP_DIST:374/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/PHYLOCSF_TOO_SHORT
chr21:5032064 frameshift_variant PERCENTILE:0.785792349726776/GERP_DIST:-391.862466733158/BP_DIST:203/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/PHYLOCSF_TOO_SHORT
6 changes: 6 additions & 0 deletions test/split-vep.broken-LoF.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
##fileformat=VCFv4.2
##INFO=<ID=vep,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|ALLELE_NUM|DISTANCE|STRAND|VARIANT_CLASS|MINIMISED|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|GENE_PHENO|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|LoF|LoF_filter|LoF_flags|LoF_info">
##contig=<ID=chr21,length=46709983,assembly=gnomAD_GRCh38>
#CHROM POS ID REF ALT QUAL FILTER INFO
chr21 5032064 . G A . . vep=A|missense_variant|MODERATE|FP565260.3|ENSG00000277117|Transcript|ENST00000612610|protein_coding|5/7||ENST00000612610.4:c.709G>A|ENSP00000483732.1:p.Gly237Arg|896|709|237|G/R|Gga/Aga|1||1|SNV||Clone_based_ensembl_gene|||1|A2||ENSP00000483732|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|||||||||,A|missense_variant|MODERATE|FP565260.3|ENSG00000277117|Transcript|ENST00000620481|protein_coding|4/6||ENST00000620481.4:c.358G>A|ENSP00000484302.1:p.Gly120Arg|545|358|120|G/R|Gga/Aga|1||1|SNV||Clone_based_ensembl_gene|||5|||ENSP00000484302|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|||||||||,A|missense_variant|MODERATE|FP565260.3|ENSG00000277117|Transcript|ENST00000623795|protein_coding|4/6||ENST00000623795.1:c.358G>A|ENSP00000485649.1:p.Gly120Arg|505|358|120|G/R|Gga/Aga|1||1|SNV||Clone_based_ensembl_gene|||2|||ENSP00000485649|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|||||||||,A|3_prime_UTR_variant&NMD_transcript_variant|MODIFIER|FP565260.3|ENSG00000277117|Transcript|ENST00000623903|nonsense_mediated_decay|5/7||ENST00000623903.3:c.*323G>A||706|||||1||1|SNV||Clone_based_ensembl_gene|||2|||ENSP00000485557||||||||||||||||,A|missense_variant|MODERATE|FP565260.3|ENSG00000277117|Transcript|ENST00000623960|protein_coding|5/7||ENST00000623960.4:c.709G>A|ENSP00000485129.1:p.Gly237Arg|858|709|237|G/R|Gga/Aga|1||1|SNV||Clone_based_ensembl_gene||YES|1|P2|CCDS86973.1|ENSP00000485129|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|||||||||,A|missense_variant|MODERATE|LOC102723996|102723996|Transcript|NM_001363770.2|protein_coding|5/7||NM_001363770.2:c.709G>A|NP_001350699.1:p.Gly237Arg|858|709|237|G/R|Gga/Aga|1||1|SNV||EntrezGene||YES||||NP_001350699.1||||||||||||||||,A|missense_variant|MODERATE|LOC102723996|102723996|Transcript|XM_006723899.2|protein_coding|5/6||XM_006723899.2:c.709G>A|XP_006723962.1:p.Gly237Arg|1345|709|237|G/R|Gga/Aga|1||1|SNV||EntrezGene||||||XP_006723962.1||||||||||||||||,A|missense_variant|MODERATE|LOC102723996|102723996|Transcript|XM_011546078.2|protein_coding|5/7||XM_011546078.2:c.709G>A|XP_011544380.1:p.Gly237Arg|1345|709|237|G/R|Gga/Aga|1||1|SNV||EntrezGene||||||XP_011544380.1||||||||||||||||,A|missense_variant|MODERATE|LOC102723996|102723996|Transcript|XM_011546079.1|protein_coding|5/7||XM_011546079.1:c.709G>A|XP_011544381.1:p.Gly237Arg|1345|709|237|G/R|Gga/Aga|1||1|SNV||EntrezGene||||||XP_011544381.1||||||||||||||||
chr21 5032064 . G GGA . . vep=GA|frameshift_variant|HIGH|FP565260.3|ENSG00000277117|Transcript|ENST00000612610|protein_coding|5/7||ENST00000612610.4:c.718_719dup|ENSP00000483732.1:p.Asp240GlufsTer35|896-897|709-710|237|G/GX|gga/gGAga|1||1|insertion||Clone_based_ensembl_gene|||1|A2||ENSP00000483732|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|10|||||HC||PHYLOCSF_WEAK|PERCENTILE:0.773118279569892,GERP_DIST:-366.377766615897,BP_DIST:218,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,ANN_ORF:-698.745,MAX_ORF:-698.745,GA|frameshift_variant|HIGH|FP565260.3|ENSG00000277117|Transcript|ENST00000620481|protein_coding|4/6||ENST00000620481.4:c.367_368dup|ENSP00000484302.1:p.Asp123GlufsTer35|545-546|358-359|120|G/GX|gga/gGAga|1||1|insertion||Clone_based_ensembl_gene|||5|||ENSP00000484302|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|10|||||HC||PHYLOCSF_WEAK|PERCENTILE:0.635578583765112,GERP_DIST:-366.377766615897,BP_DIST:218,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,ANN_ORF:-698.745,MAX_ORF:-698.745,GA|frameshift_variant|HIGH|FP565260.3|ENSG00000277117|Transcript|ENST00000623795|protein_coding|4/6||ENST00000623795.1:c.367_368dup|ENSP00000485649.1:p.Asp123GlufsTer35|505-506|358-359|120|G/GX|gga/gGAga|1||1|insertion||Clone_based_ensembl_gene|||2|||ENSP00000485649|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|10|||||HC||PHYLOCSF_WEAK|PERCENTILE:0.659498207885305,GERP_DIST:-372.525567065179,BP_DIST:197,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,ANN_ORF:-698.745,MAX_ORF:-698.745,GA|3_prime_UTR_variant&NMD_transcript_variant|MODIFIER|FP565260.3|ENSG00000277117|Transcript|ENST00000623903|nonsense_mediated_decay|5/7||ENST00000623903.3:c.*332_*333dup||706-707|||||1||1|insertion||Clone_based_ensembl_gene|||2|||ENSP00000485557||||||||||||||||,GA|frameshift_variant|HIGH|FP565260.3|ENSG00000277117|Transcript|ENST00000623960|protein_coding|5/7||ENST00000623960.4:c.718_719dup|ENSP00000485129.1:p.Asp240GlufsTer35|858-859|709-710|237|G/GX|gga/gGAga|1||1|insertion||Clone_based_ensembl_gene||YES|1|P2|CCDS86973.1|ENSP00000485129|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|10|||||HC||PHYLOCSF_WEAK|PERCENTILE:0.790979097909791,GERP_DIST:-372.525567065179,BP_DIST:197,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,ANN_ORF:-698.745,MAX_ORF:-698.745,GA|frameshift_variant|HIGH|LOC102723996|102723996|Transcript|NM_001363770.2|protein_coding|5/7||NM_001363770.2:c.718_719dup|NP_001350699.1:p.Asp240GlufsTer35|858-859|709-710|237|G/GX|gga/gGAga|1||1|insertion||EntrezGene||YES||||NP_001350699.1||||||||10|||||HC|||PERCENTILE:0.790979097909791,GERP_DIST:-372.525567065179,BP_DIST:197,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,PHYLOCSF_TOO_SHORT,GA|frameshift_variant|HIGH|LOC102723996|102723996|Transcript|XM_006723899.2|protein_coding|5/6||XM_006723899.2:c.718_719dup|XP_006723962.1:p.Asp240GlufsTer35|1345-1346|709-710|237|G/GX|gga/gGAga|1||1|insertion||EntrezGene||||||XP_006723962.1||||||||10|||||HC|||PERCENTILE:0.463571889103804,GERP_DIST:-1141.14512844086,BP_DIST:840,DIST_FROM_LAST_EXON:152,50_BP_RULE:PASS,PHYLOCSF_TOO_SHORT,GA|frameshift_variant|HIGH|LOC102723996|102723996|Transcript|XM_011546078.2|protein_coding|5/7||XM_011546078.2:c.718_719dup|XP_011544380.1:p.Asp240GlufsTer35|1345-1346|709-710|237|G/GX|gga/gGAga|1||1|insertion||EntrezGene||||||XP_011544380.1||||||||10|||||HC|||PERCENTILE:0.662062615101289,GERP_DIST:-354.294564935565,BP_DIST:374,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,PHYLOCSF_TOO_SHORT,GA|frameshift_variant|HIGH|LOC102723996|102723996|Transcript|XM_011546079.1|protein_coding|5/7||XM_011546079.1:c.718_719dup|XP_011544381.1:p.Asp240GlufsTer35|1345-1346|709-710|237|G/GX|gga/gGAga|1||1|insertion||EntrezGene||||||XP_011544381.1||||||||10|||||HC|||PERCENTILE:0.785792349726776,GERP_DIST:-391.862466733158,BP_DIST:203,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,PHYLOCSF_TOO_SHORT
1 change: 1 addition & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,7 @@
run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.1.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n']);
run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.2.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n' -g {PATH}/split-vep.gene-list.txt]);
run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.3.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n' -g +{PATH}/split-vep.gene-list.txt]);
run_test(\&test_vcf_plugin,$opts,in=>'split-vep.broken-LoF',out=>'split-vep.broken-LoF.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Consequence %LoF_info\\n' -a vep]);
run_test(\&test_vcf_plugin,$opts,in=>'parental-origin',out=>'parental-origin.1.out',cmd=>'+parental-origin',args=>qq[-r 20:100 -p proband,father,mother -t del | grep -v ^#]);
run_test(\&test_vcf_plugin,$opts,in=>'parental-origin',out=>'parental-origin.2.out',cmd=>'+parental-origin',args=>qq[-r 20:101 -p proband,father,mother -t del | grep -v ^#]);
run_test(\&test_vcf_plugin,$opts,in=>'parental-origin',out=>'parental-origin.3.out',cmd=>'+parental-origin',args=>qq[-r 20:102 -p proband,father,mother -t del | grep -v ^#]);
Expand Down

0 comments on commit f0ad6aa

Please sign in to comment.