-
Notifications
You must be signed in to change notification settings - Fork 0
/
sgRNA_count.py
58 lines (44 loc) · 1.4 KB
/
sgRNA_count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# usage: python sgRNA_count.py key sam.file
import sys, fileinput, re
# import pysnooper
# @pysnooper.snoop()
# spacer sequence between U6 promoter and sgRNA
# CGAAACACC or CGAAACACCG
key = sys.argv[1]
# dict to list
def dict2list(dic):
keys = dic.keys()
values = dic.values()
return [(key, value) for key, value in zip(keys, values)]
sam = sys.argv[2]
# count of reads without key+sgRNA+scaffold+terminator pattern
no_sgrna = 0
# dict contains all possible transcripts
output = {}
for line in fileinput.input(sam):
if line[0] != "@":
line_split = line.split("\t")
# ref_pos = line_split[3]
# cigar = line_split[5]
fastq_seq = line_split[9]
# sgRNA extraction (18 ~ 22 bp)
# U6 terminator: TTTTTT
sgrna = re.findall(r"%s([ATGC]{0,}?)TTTTTT" %(key), fastq_seq)
if len(sgrna) == 0:
no_sgrna += 1
continue
else:
sgrna = sgrna[0]
# add sgRNA to dict
if sgrna in output:
output[sgrna] +=1
else:
output[sgrna] = 1
else:
continue
# sort dict by count
output_tuple = sorted(dict2list(output), key = lambda x:x[1], reverse=True)
# count of reads without key+sgRNA+scaffold+terminator pattern
print('# Number of reads could not be successfully transcripted: %d' %(no_sgrna))
for i in output_tuple:
print(i[0] + "\t" + str(i[1]))