-
Notifications
You must be signed in to change notification settings - Fork 13
/
alleles.go
155 lines (141 loc) · 4.6 KB
/
alleles.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/*
* alleles.go
* allhic
*
* Created by Haibao Tang on 06/28/19
* Copyright © 2019 Haibao Tang. All rights reserved.
*/
package allhic
import (
"bufio"
"io"
"sort"
"strconv"
"strings"
)
// Alleler is responsible for building the allele table
type Alleler struct {
PafFile string // ex. "genome.paf"
ReFile string // ex. "genome.counts_GATC.txt"
Paf PAFFile // The PAF data
ReCounts RECountsFile // The RE data
}
// Tag represents the additional info in the 12+ columns in the PAF
// file. The type of the tag is dynamically determined
//
// See also:
// https://github.com/lh3/minimap2/blob/master/minimap2.1
//
// The following tags are supported
// Tag Type Description
// _
// tp A Type of aln: P/primary, S/secondary and I,i/inversion
// cm i Number of minimizers on the chain
// s1 i Chaining score
// s2 i Chaining score of the best secondary chain
// NM i Total number of mismatches and gaps in the alignment
// MD Z To generate the ref sequence in the alignment
// AS i DP alignment score
// ms i DP score of the max scoring segment in the alignment
// nn i Number of ambiguous bases in the alignment
// ts A Transcript strand (splice mode only)
// cg Z CIGAR string (only in PAF)
// cs Z Difference string
// dv f Approximate per-base sequence divergence
// de f Gap-compressed per-base sequence divergence
// rl i Length of query regions harboring repetitive seeds
type Tag = interface{}
// PAFRecord holds one line in the PAF file
// The file spec:
// https://github.com/lh3/miniasm/blob/master/PAF.md
type PAFRecord struct {
Query string // Query sequence name
QueryLength int // Query sequence length
QueryStart int // Query start (0-based)
QueryEnd int // Query end (0-based)
RelativeStrand byte // `+' if query and target on the same strand; `-' if opposite
Target string // Target sequence name
TargetLength int // Target sequence length
TargetStart int // Target start on original strand (0-based)
TargetEnd int // Target end on original strand (0-based)
NumMatches int // Number of matching bases in the mapping
AlignmentLength int // Number bases, including gaps, in the mapping
MappingQuality uint8 // Mapping quality (0-255 with 255 for missing)
Tags map[string]Tag // Tags, e.g. tp, cm etc.
}
// PAFFile parses the PAF file into a set of records
type PAFFile struct {
PafFile string // File path of the paf
Records []PAFRecord // List of PAF records
}
// ParseRecords collects all records in memory
func (r *PAFFile) ParseRecords() {
r.Records = []PAFRecord{}
fh := mustOpen(r.PafFile)
log.Noticef("Parse paffile `%s`", r.PafFile)
reader := bufio.NewReader(fh)
var rec PAFRecord
for {
row, err := reader.ReadString('\n')
row = strings.TrimSpace(row)
if row == "" && err == io.EOF {
break
}
words := strings.Split(row, "\t")
if len(words) < 12 || len(words[4]) < 1 {
continue
}
// Parse the first 12 columns
rec.Query = words[0]
rec.QueryLength, _ = strconv.Atoi(words[1])
rec.QueryStart, _ = strconv.Atoi(words[2])
rec.QueryEnd, _ = strconv.Atoi(words[3])
rec.RelativeStrand = words[4][0]
rec.Target = words[5]
rec.TargetLength, _ = strconv.Atoi(words[6])
rec.TargetStart, _ = strconv.Atoi(words[7])
rec.TargetEnd, _ = strconv.Atoi(words[8])
rec.NumMatches, _ = strconv.Atoi(words[9])
rec.AlignmentLength, _ = strconv.Atoi(words[10])
mappingQuality, _ := strconv.Atoi(words[11])
rec.MappingQuality = uint8(mappingQuality)
rec.Tags = map[string]Tag{}
var tag Tag
// Parse columns 12+
for i := 12; i < len(words); i++ {
tokens := strings.Split(words[i], ":")
if len(tokens) < 3 {
continue
}
tagName := tokens[0]
value := tokens[2]
switch tokens[1] {
case "i":
tag, _ = strconv.Atoi(value)
case "f":
tag, _ = strconv.ParseFloat(value, 32)
default:
tag = value
}
rec.Tags[tagName] = tag
}
r.Records = append(r.Records, rec)
}
}
// extractAllelicPairs collects Extract allelic pairs
func (r *Alleler) extractAllelicPairs() {
// Sort the contigs by sizes, starting from shortest
sort.Slice(r.ReCounts.Records, func(i, j int) bool {
return r.ReCounts.Records[i].Length < r.ReCounts.Records[j].Length
})
// Find significant matches of small-big allelic contig pairs
}
// Run kicks off the Alleler
func (r *Alleler) Run() {
r.Paf = PAFFile{PafFile: r.PafFile}
r.Paf.ParseRecords()
r.ReCounts = RECountsFile{Filename: r.ReFile}
r.ReCounts.ParseRecords()
r.extractAllelicPairs()
log.Notice("Success")
}