-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathatom.h
506 lines (451 loc) · 17.1 KB
/
atom.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
/*
Pheniqs : PHilology ENcoder wIth Quality Statistics
Copyright (C) 2018 Lior Galanti
NYU Center for Genetics and System Biology
Author: Lior Galanti <lior.galanti@nyu.edu>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef PHENIQS_ATOM_H
#define PHENIQS_ATOM_H
#include "include.h"
#include "json.h"
enum class HtsSortOrder : int8_t;
enum class HtsGrouping : int8_t;
enum class Platform : uint8_t;
enum class Algorithm : uint8_t;
enum class HtsTagCode : uint16_t;
/* SAM format flags
0x1 PAIRED read is paired in sequencing, no matter whether it is mapped in a pair
0x2 PROPER_PAIR read is mapped in a proper pair
0x4 UNMAP read itself is unmapped; conflictive with PROPER_PAIR
0x8 MUNMAP mate is unmapped
0x10 REVERSE read is mapped to the reverse strand
0x20 MREVERSE mate is mapped to the reverse strand
0x40 READ1 the first segment in the template
0x80 READ2 the last segment in the template
0x100 SECONDARY not primary alignment
0x200 QCFAIL QC failure
0x400 DUP optical or PCR duplicate
0x800 SUPPLEMENTARY supplementary alignment
*/
/* @HD The header line
VN Format version
SO Sorting order of alignments
unknown
unsorted
queryname
coordinate
GO Grouping of alignments
Indicating that similar alignment records are grouped together but the file is not necessarily sorted overall.
none
query grouped by QNAME
reference grouped by RNAME/POS
SS Sub-sorting order of alignments. Valid values are of the form sort-order:sub-sort,
where sort- order is the same value stored in the SO tag and sub-sort is an implementation-dependent
colon-separated string further describing the sort order.
For example, if an algorithm relies on a coordinate sort that, at each coordinate,
is further sorted by query name the header could contain @HD SO:coordinate SS:coordinate:queryname.
If the primary sort is not one of the predefined primary sort orders, then unsorted should be used
and the subsort is effectively the major sort.
For example, if sorted by an auxillary tag MI then by coordinate then the header could contain
@HD SO:unsorted SS:unsorted:MI:coordinate.
Regular expression: (coordinate|queryname|unsorted)(:[A-Za-z0-9_-]+)+
*/
class HeadHDAtom {
friend class HtsHead;
friend ostream& operator<<(ostream& o, const HeadHDAtom& hd);
public:
kstring_t VN;
kstring_t SO;
kstring_t GO;
kstring_t SS;
HeadHDAtom();
HeadHDAtom(const Value& ontology);
HeadHDAtom(const HeadHDAtom& other);
~HeadHDAtom();
void set_alignment_sort_order(const HtsSortOrder& order);
void set_alignment_grouping(const HtsGrouping& grouping);
void set_version(const htsFormat& format);
HeadHDAtom& operator=(const HeadHDAtom& other);
private:
void encode(kstring_t& buffer) const;
char* decode(char* position, const char* end);
};
ostream& operator<<(ostream& o, const HeadHDAtom& hd);
/* @SQ Reference Sequence
index corresponds to tid in bam1_core_t
SN Reference sequence identifier.
The value of this field is used in the alignment records in RNAME and RNEXT fields.
LN Reference sequence length. 32 bit signed.
AH Indicates that this sequence is an alternate locus.
The value is the locus in the primary assembly for which this sequence is an alternative,
in the format ‘chr:start-end’, ‘chr’ (if known), or ‘*’ (if unknown), where ‘chr’ is a sequence in the primary assembly.
Must not be present on sequences in the primary assembly.
AN Alternative reference sequence names.
AS Genome assembly identifier.
DS Description. UTF-8 encoding may be used.
M5 MD5 checksum of the sequence in the uppercase, excluding spaces but including pads (as ‘*’s).
SP Species.
TP Molecule topology. Valid values: linear (default) and circular.
UR URI of the sequence. This value may start with one of the standard protocols, e.g http: or ftp:.
If it does not start with one of these protocols, it is assumed to be a file-system path.
*/
class HeadSQAtom {
friend class HtsHead;
friend ostream& operator<<(ostream& o, const HeadSQAtom& program);
public:
int32_t index;
kstring_t SN;
int32_t LN;
kstring_t AH;
kstring_t AN;
kstring_t AS;
kstring_t DS;
kstring_t M5;
kstring_t SP;
kstring_t TP;
kstring_t UR;
HeadSQAtom();
HeadSQAtom(const Value& ontology);
HeadSQAtom(const HeadSQAtom& other);
~HeadSQAtom();
operator string() const;
HeadSQAtom& operator=(const HeadSQAtom& other);
private:
void encode(kstring_t& buffer) const;
char* decode(char* position, const char* end);
void update(const HeadSQAtom& other);
};
ostream& operator<<(ostream& o, const HeadSQAtom& sq);
/* @RG Read Group
ID Read group identifier
Each @RG line must have a unique ID.
The value of ID is used in the RG tags of alignment records.
Must be unique among all read groups in header section.
Read group IDs may be modified when merging SAM files in order to handle collisions.
LB Library
SM Sample. Use pool name where a pool is being sequenced.
PU Platform unit Unique identifier. e.g. flowcell-barcode.lane for Illumina.
CN Name of sequencing center producing the read
DS Description
DT Date the run was produced ISO8601 date or datetime.
PI Predicted median insert size.
PL Platform or technology used to produce the reads.
UNKNOWN, CAPILLARY, DNBSEQ, ELEMENT, HELICOS, ILLUMINA, IONTORRENT, LS454, ONT, PACBIO, SINGULAR, SOLID, ULTIMA
PM Platform model
PG Programs used for processing the read group
FO Flow order.
The array of nucleotide bases that correspond to the nucleotides used for each flow of each read.
Multi-base flows are encoded in IUPAC format, and non-nucleotide flows by various other characters.
Format: /\*|[ACMGRSVTWYHKDBN]+/
KS The array of nucleotide bases that correspond to the key sequence of each read.
From GATK: https://software.broadinstitute.org/gatk/guide/article?id=6472
ID Read group identifier, RG-ID, identifies which read group each read belongs to using a reference in the RG auxiliary tag.
RG-ID must be unique in a SAM file.
The default convention for illumina is the same as PU.
When used in BQSR, RG-ID is the lowest denominator that differentiates factors contributing to technical batch effects:
therefore, a read group is effectively treated as a separate run of the instrument in data processing steps
such as base quality score recalibration, since they are assumed to share the same error model.
PL Platform used to produce the read. identifies the sequencing technology used to generate the sequencing data.
PU Platform Unit, RG-PU, is unique read group identifier in the platform vendor dialect.
The default convention for illumina seuqnce reads is <flowcell id>:<flowcell lane number>:<sample barcode>
making it a globally unique identifier across all sequencing data in the world.
PU is not required by GATK but takes precedence over ID for base recalibration if present.
SM The name of the sample sequenced in this read group.
GATK tools treat all read groups with the same SM value as containing sequencing data for the same sample,
and this is also the name that will be used for the sample column in the VCF file.
When sequencing pools of samples, use a pool name instead of an individual sample name.
LB DNA preparation library identifier.
MarkDuplicates uses the LB field to determine which read groups might contain molecular duplicates,
in case the same DNA library was sequenced on multiple lanes.
also informative: http://gatkforums.broadinstitute.org/gatk/discussion/6472/read-groups
*/
class HeadRGAtom {
friend class HtsHead;
friend ostream& operator<<(ostream& o, const HeadRGAtom& read_group);
public:
int32_t index;
kstring_t ID;
kstring_t BC;
kstring_t CN;
kstring_t DS;
kstring_t DT;
kstring_t FO;
kstring_t KS;
kstring_t LB;
kstring_t PG;
kstring_t PI;
kstring_t PL;
kstring_t PM;
kstring_t PU;
kstring_t SM;
HeadRGAtom();
HeadRGAtom(const Value& ontology);
HeadRGAtom(const HeadRGAtom& other);
~HeadRGAtom();
operator string() const;
void set_platform(const Platform& value);
HeadRGAtom& operator=(const HeadRGAtom& other);
private:
void encode(kstring_t& buffer) const;
char* decode(char* position, const char* end);
void update(const HeadRGAtom& other);
};
ostream& operator<<(ostream& o, const HeadRGAtom& rg);
template<> bool decode_value_by_key< vector< HeadRGAtom > >(const Value::Ch* key, vector< HeadRGAtom >& value, const Value& container);
bool encode_value(const HeadRGAtom& value, Value& container, Document& document);
bool encode_key_value(const string& key, const HeadRGAtom& value, Value& container, Document& document);
bool encode_key_value(const string& key, const list< HeadRGAtom >& value, Value& container, Document& document);
/* @PG Program
ID Program record identifier.
Each @PG line must have a unique ID.
The value of ID is used in the alignment PG tag and PP tags of other @PG lines.
PG IDs may be modified when merging SAM files in order to handle collisions.
PN Program name
CL Command line
PP Previous @PG:ID. Must be valid reference to another @PG:ID
DS Description.
VN Program version
*/
class HeadPGAtom {
friend class HtsHead;
friend ostream& operator<<(ostream& o, const HeadPGAtom& program);
public:
int32_t index;
kstring_t ID;
kstring_t PN;
kstring_t CL;
kstring_t PP;
kstring_t DS;
kstring_t VN;
HeadPGAtom();
HeadPGAtom(const Value& ontology);
HeadPGAtom(const HeadPGAtom& other);
~HeadPGAtom();
operator string() const;
HeadPGAtom& operator=(const HeadPGAtom& other);
private:
void encode(kstring_t& buffer) const;
char* decode(char* position, const char* end);
void update(const HeadPGAtom& other);
};
ostream& operator<<(ostream& o, const HeadPGAtom& pg);
template<> HeadPGAtom decode_value_by_key< HeadPGAtom >(const Value::Ch* key, const Value& container);
/* @CO free text comment */
class HeadCOAtom {
friend class HtsHead;
friend ostream& operator<<(ostream& o, const HeadCOAtom& co);
public:
kstring_t CO;
HeadCOAtom();
HeadCOAtom(const Value& ontology);
HeadCOAtom(const HeadCOAtom& other);
~HeadCOAtom();
HeadCOAtom& operator=(const HeadCOAtom& other);
private:
void encode(kstring_t& buffer) const;
char* decode(char* position, const char* end);
};
ostream& operator<<(ostream& o, const HeadCOAtom& co);
enum class HtsSortOrder : int8_t {
UNKNOWN = -1,
UNSORTED = 0,
QUERYNAME = 1,
COORDINATE = 2,
};
string to_string(const HtsSortOrder& value);
bool from_string(const char* value, HtsSortOrder& result);
void to_kstring(const HtsSortOrder& value, kstring_t& result);
bool from_string(const string& value, HtsSortOrder& result);
ostream& operator<<(ostream& o, const HtsSortOrder& value);
void encode_key_value(const string& key, const HtsSortOrder& value, Value& container, Document& document);
enum class HtsGrouping : int8_t {
UNKNOWN = -1,
NONE = 0,
QUERY = 1,
REFERENCE = 2,
};
string to_string(const HtsGrouping& value);
bool from_string(const char* value, HtsGrouping& result);
bool from_string(const string& value, HtsGrouping& result);
void to_kstring(const HtsGrouping& value, kstring_t& result);
ostream& operator<<(ostream& o, const HtsGrouping& value);
void encode_key_value(const string& key, const HtsGrouping& value, Value& container, Document& document);
enum class Platform : uint8_t {
UNKNOWN,
CAPILLARY,
DNBSEQ,
ELEMENT,
HELICOS,
ILLUMINA,
IONTORRENT,
LS454,
ONT,
PACBIO,
SINGULAR,
SOLID,
ULTIMA
};
string to_string(const Platform& value);
bool from_string(const char* value, Platform& result);
void to_kstring(const Platform& value, kstring_t& result);
bool from_string(const string& value, Platform& result);
ostream& operator<<(ostream& o, const Platform& value);
void encode_key_value(const string& key, const Platform& value, Value& container, Document& document);
enum class Algorithm : uint8_t {
UNKNOWN,
MDD,
PAMLD,
NAIVE,
PASSTHROUGH,
BENCHMARK,
};
string to_string(const Algorithm& value);
bool from_string(const char* value, Algorithm& result);
void to_kstring(const Algorithm& value, kstring_t& result);
bool from_string(const string& value, Algorithm& result);
ostream& operator<<(ostream& o, const Algorithm& value);
bool encode_key_value(const string& key, const Algorithm& value, Value& container, Document& document);
/* defined in htslib/hts.h */
ostream& operator<<(ostream& o, const htsFormat& hts_format);
string to_string(const htsFormatCategory& value);
bool from_string(const char* value, htsFormatCategory& result);
ostream& operator<<(ostream& o, const htsFormatCategory& hts_format_category);
string to_string(const htsExactFormat& value);
bool from_string(const char* value, htsExactFormat& result);
ostream& operator<<(ostream& o, const htsExactFormat& hts_exact_format);
string to_string(const htsCompression& value);
bool from_string(const char* value, htsCompression& result);
ostream& operator<<(ostream& o, const htsCompression& hts_compression);
/* 16bit enumeration of 2 letter hts tag name */
enum class HtsTagCode : uint16_t {
AH = 0x4148,
AM = 0x414d,
AN = 0x414e,
AS = 0x4153,
BC = 0x4243,
BQ = 0x4251,
BX = 0x4258,
BZ = 0x425a,
CB = 0x4342,
CC = 0x4343,
CG = 0x4347,
CL = 0x434c,
CM = 0x434d,
CN = 0x434e,
CO = 0x434f,
CP = 0x4350,
CQ = 0x4351,
CR = 0x4352,
CS = 0x4353,
CT = 0x4354,
CY = 0x4359,
DQ = 0x4451,
DS = 0x4453,
DT = 0x4454,
E2 = 0x4532,
EE = 0x4545,
FI = 0x4649,
FO = 0x464f,
FS = 0x4653,
FZ = 0x465a,
GC = 0x4743,
GO = 0x474f,
GQ = 0x4751,
GS = 0x4753,
H0 = 0x4830,
H1 = 0x4831,
H2 = 0x4832,
HD = 0x4844,
HI = 0x4849,
ID = 0x4944,
IH = 0x4948,
KS = 0x4b53,
LB = 0x4c42,
LN = 0x4c4e,
M5 = 0x4d35,
MC = 0x4d43,
MD = 0x4d44,
MF = 0x4d46,
MI = 0x4d49,
MQ = 0x4d51,
NH = 0x4e48,
NM = 0x4e4d,
OC = 0x4f43,
OP = 0x4f50,
OQ = 0x4f51,
OX = 0x4f58,
PG = 0x5047,
PI = 0x5049,
PL = 0x504c,
PM = 0x504d,
PN = 0x504e,
PP = 0x5050,
PQ = 0x5051,
PT = 0x5054,
PU = 0x5055,
PX = 0x5058,
Q2 = 0x5132,
QT = 0x5154,
QX = 0x5158,
R2 = 0x5232,
RG = 0x5247,
RT = 0x5254,
RX = 0x5258,
S2 = 0x5332,
SA = 0x5341,
SM = 0x534d,
SN = 0x534e,
SO = 0x534f,
SP = 0x5350,
SQ = 0x5351,
SS = 0x5353,
TC = 0x5443,
TP = 0x5450,
U2 = 0x5532,
UQ = 0x5551,
UR = 0x5552,
VN = 0x564e,
XB = 0x5842,
XC = 0x5843,
XD = 0x5844,
XL = 0x584c,
XM = 0x584d,
XO = 0x584f,
XP = 0x5850,
XQ = 0x5851,
XR = 0x5852,
XZ = 0x585a,
YD = 0x5944,
};
/* HTS header */
class HtsHead {
friend ostream& operator<<(ostream& o, const HtsHead& head);
public:
void decode(const sam_hdr_t* hdr);
void encode(sam_hdr_t* hdr) const;
void add_reference(const HeadSQAtom& sq);
void add_read_group(const HeadRGAtom& rg);
void add_program(const HeadPGAtom& pg);
void add_comment(const HeadCOAtom& co);
void set_format_version(const htsFormat& format) {
hd.set_version(format);
};
private:
HeadHDAtom hd;
unordered_map< string, HeadSQAtom > SQ_by_id;
unordered_map< string, HeadRGAtom > RG_by_id;
unordered_map< string, HeadPGAtom > PG_by_id;
list< HeadCOAtom > comment_by_index;
};
ostream& operator<<(ostream& o, const HtsHead& head);
#endif /* PHENIQS_ATOM_H */