-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathagp.go
133 lines (122 loc) · 3.2 KB
/
agp.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/**
* Filename: /Users/bao/code/allhic/agp.go
* Path: /Users/bao/code/allhic
* Created Date: Monday, February 26th 2018, 8:30:12 pm
* Author: bao
*
* Copyright (c) 2018 Haibao Tang
*/
package allhic
import (
"bufio"
"bytes"
"strconv"
"strings"
"github.com/shenwei356/bio/seq"
"github.com/shenwei356/bio/seqio/fastx"
"github.com/shenwei356/xopen"
)
const (
// LineWidth specifies how many bases to show per line
LineWidth = 60
// LargeSequence will notify the writer to send a notification
LargeSequence = 1000000
)
// AGPLine is a line in the AGP file
type AGPLine struct {
object string
objectBeg int
objectEnd int
partNumber int
componentType byte
isGap bool
strand byte
// As a gap
gapLength int
gapType string
linkage string
linkageEvidence string
// As a sequence chunk
componentID string
componentBeg int
componentEnd int
}
// AGP is a collection of AGPLines
type AGP struct {
lines []AGPLine
}
// Add adds an AGPLine to the collection
func (r *AGP) Add(row string) {
words := strings.Fields(row)
var line AGPLine
line.object = words[0]
line.objectBeg, _ = strconv.Atoi(words[1])
line.objectEnd, _ = strconv.Atoi(words[2])
line.partNumber, _ = strconv.Atoi(words[3])
line.componentType = words[4][0]
line.isGap = line.componentType == 'N' || line.componentType == 'U'
if line.isGap {
line.gapLength, _ = strconv.Atoi(words[5])
line.gapType = words[6]
line.linkage = words[7]
line.linkageEvidence = words[8]
} else {
line.componentID = words[5]
line.componentBeg, _ = strconv.Atoi(words[6])
line.componentEnd, _ = strconv.Atoi(words[7])
line.strand = words[8][0]
}
r.lines = append(r.lines, line)
}
// buildFasta builds target FASTA based on info from agpfile
func buildFasta(agpfile string, seqs map[string]*seq.Seq) {
log.Noticef("Parse agpfile `%s`", agpfile)
file := mustOpen(agpfile)
agp := new(AGP)
scanner := bufio.NewScanner(file)
for scanner.Scan() {
agp.Add(scanner.Text())
}
var buf bytes.Buffer
outFile := RemoveExt(agpfile) + ".fasta"
outfh, _ := xopen.Wopen(outFile)
prevObject := ""
for _, line := range agp.lines {
if line.object != prevObject {
if prevObject != "" {
writeRecord(prevObject, buf, outfh)
}
prevObject = line.object
buf.Reset()
}
if line.isGap {
buf.Write(bytes.Repeat([]byte("N"), line.gapLength))
} else {
if s, ok := seqs[line.componentID]; ok {
// fmt.Printf("name: %s seq: %s\n", line.componentID, s.SubSeq(1, 10))
s = s.SubSeq(line.componentBeg, line.componentEnd)
if line.strand == '-' {
s.RevComInplace()
}
buf.Write(s.Seq)
} else {
log.Errorf("Cannot locate %s", line.componentID)
}
}
}
// Last one
writeRecord(prevObject, buf, outfh)
buf.Reset()
log.Noticef("Assembly FASTA file `%s` built", outFile)
}
// writeRecord writes the FASTA record to the file
func writeRecord(object string, buf bytes.Buffer, outfh *xopen.Writer) {
record, _ := fastx.NewRecordWithoutValidation(seq.DNA, []byte{}, []byte(object),
[]byte{}, buf.Bytes())
size := record.Seq.Length()
if size > LargeSequence {
log.Noticef("Write sequence %s (size = %d bp)", record.Name, size)
}
record.FormatToWriter(outfh, LineWidth)
outfh.Flush()
}