-
Notifications
You must be signed in to change notification settings - Fork 0
/
yaseminsucu_csc_219_project_3_covid19tree.py
505 lines (382 loc) · 15.3 KB
/
yaseminsucu_csc_219_project_3_covid19tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
# -*- coding: utf-8 -*-
"""YaseminSucu_CSC 219_Project 3_Covid19Tree.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1PfzQSCf8M7mkw2t50oZHYP1J5hF6IVaU
#<h1>Project 3 - Covid19Tree </h1>
##CSC 219 Project 3 (total 250 points)
**Due date: on Canvas**
"""
from google.colab import drive
drive.mount('/content/drive')
"""#Code for Loading Data"""
# NOTE: the notebook cell here ran the shell magic `!pip install ete3`, which is
# IPython/Colab-only syntax and a SyntaxError in a plain .py file. It is kept as
# a comment; install the package beforehand with `pip install ete3`.
# !pip install ete3

# Module-level lists holding the loaded sequences and their names.
seqsList = []
namesList = []
# input: path to a FASTA file
# output: list of sequences and list of sequence names
# loadSeq reads a FASTA file (e.g. covidstrainpractice.fasta) and builds the sequence list and the names list
def loadSeq(filename):
    """Read a FASTA file and return its sequences and derived strain names.

    Every line containing '>' is treated as a header that starts a new record.
    The record name is the last space-separated token before the first '/',
    followed by the next three '/'-separated fields, all joined with '_' and
    cut at the first comma. For example the header text
    ``... SARS-CoV-2/human/USA/CA-CZB046/2020, complete genome`` yields
    ``SARS-CoV-2_human_USA_CA-CZB046``. All other lines are concatenated
    (line endings removed) into the sequence of the most recent header; lines
    that appear before the first header are ignored.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        A ``(sequences, names)`` tuple of two freshly created lists in which
        ``sequences[i]`` belongs to ``names[i]``.
    """
    seqs = []
    names = []
    # Sequence lines of the record being read; None until a header is seen so
    # that stray leading lines cannot create a sequence without a name.
    chunks = None

    # The context manager closes the file even if parsing raises.
    with open(filename) as fasta:
        for line in fasta:
            line = line.rstrip('\r\n')
            if '>' in line:
                # A new header ends the previous record (even an empty one),
                # which keeps the two result lists the same length.
                if chunks is not None:
                    seqs.append(''.join(chunks))
                chunks = []

                fields = line.split('/')
                prefix = fields[0]
                name = prefix[prefix.rfind(' ') + 1:] + '_' + '_'.join(fields[1:4])
                # Drop trailing descriptions such as ", complete genome".
                names.append(name.partition(',')[0])
            elif chunks is not None:
                chunks.append(line)

    # Flush the final record, which has no following header to trigger it.
    if chunks is not None:
        seqs.append(''.join(chunks))
    return seqs, names
# compareSeq returns a dictionary mapping each difference count to the list of (row, col) sequence-index pairs that have that count
def compareSeq(seqList):
    """Compare every pair of sequences and group the pairs by difference count.

    For each ``row`` from 1 upward and each ``col`` below ``row``, the
    difference is the number of mismatching positions over the shared prefix
    plus the difference in length. Each difference is printed right-aligned in
    a 6-character field, one output line per row.

    Args:
        seqList: Sequence of strings to compare pairwise.

    Returns:
        A dict mapping each difference count to the list of ``(row, col)``
        index pairs that produced it, in the order they were compared.
    """
    diffDict = {}
    for row in range(1, len(seqList)):
        rowSeq = seqList[row]
        for col in range(row):
            colSeq = seqList[col]
            # The unmatched tail of the longer sequence counts entirely as
            # differences; zip() then compares only the shared prefix.
            diff = abs(len(colSeq) - len(rowSeq))
            diff += sum(a != b for a, b in zip(rowSeq, colSeq))
            diffDict.setdefault(diff, []).append((row, col))
            print(str(diff).rjust(6, ' '), end=" ")
        print("\r")
    return diffDict
# Start processing
# You can download covidstrainpractice.fasta from the course Canvas site
path = '/content/drive/MyDrive/Colab Notebooks/CSC219/PROJECT 1 /datafiles/' # Google Drive (or local) directory that contains the .fasta file
# Rebinds the module-level lists to the values returned by loadSeq
seqsList, namesList = loadSeq(path + 'covidstrainpractice.fasta')
print(namesList)
print(seqsList)
# compareSeq prints the pairwise difference matrix and returns {difference: [(row, col), ...]}
newDict = compareSeq(seqsList)
print(newDict)
"""#Code for Tree"""
class Node:
    """A binary-tree node holding one value and links to two children."""

    def __init__(self, value):
        """Create a leaf node (no children) that stores *value*."""
        self.value = value
        self.left = None
        self.right = None

    def __repr__(self):
        # Readable form for debugging, e.g. when printing a list of nodes.
        return "{}({!r})".format(type(self).__name__, self.value)
class Tree:
    """Binary search tree ordered by the first element of each stored value.

    Values must be indexable (for example ``(difference, pairs)`` tuples).
    Only ``value[0]`` is compared when inserting; a value whose key is already
    in the tree is ignored.
    """

    def __init__(self):
        self.root = None
        self.output = []  # every Node created by insert(), in creation order
        self.value = []   # not read anywhere in this class; kept for compatibility

    def insert(self, value):
        """Insert *value* keyed on ``value[0]``; duplicate keys are ignored."""
        if self.root is None:
            self.root = Node(value)
            self.output.append(self.root)
        else:
            self.__insert(value, self.root)

    def __insert(self, value, current):
        """Recursively descend from *current* and attach *value* as a new leaf."""
        if value[0] < current.value[0]:
            if current.left is not None:
                self.__insert(value, current.left)
            else:
                current.left = Node(value)
                self.output.append(current.left)
        elif value[0] > current.value[0]:
            if current.right is not None:
                self.__insert(value, current.right)
            else:
                current.right = Node(value)
                self.output.append(current.right)
        # Equal keys: nothing to do, the existing node is kept.

    def get_tree(self, type):
        """Print and return the tree's values in the requested traversal order.

        Args:
            type: 'inorder', 'preorder' or 'postorder' (case-insensitive).
                The name shadows the builtin but is kept for callers.

        Returns:
            The list of node values in traversal order, or the string
            "Tree is empty" when the tree has no root.

        Raises:
            ValueError: If *type* is not one of the supported traversal names.
        """
        if self.root is None:
            return "Tree is empty"

        traversals = {
            'inorder': self.inorder_traversal,
            'postorder': self.postorder_traversal,
            'preorder': self.preorder_traversal,
        }
        order = type.lower()
        if order not in traversals:
            # Previously an unknown name silently returned an empty list.
            raise ValueError(
                "unknown traversal type {!r}; expected one of {}".format(
                    type, sorted(traversals)))

        output_arr = []
        print(traversals[order](self.root, output_arr))
        return output_arr

    # Inorder traversal
    # Left -> Root -> Right
    def inorder_traversal(self, root, output):
        """Append the values of the subtree at *root* to *output*, left-root-right."""
        if root:
            self.inorder_traversal(root.left, output)
            output.append(root.value)
            self.inorder_traversal(root.right, output)
        return output

    def inorder_traversal_phylo(self, root, output):
        """Alias of inorder_traversal, kept for existing callers."""
        return self.inorder_traversal(root, output)

    # Preorder traversal
    # Root -> Left -> Right
    def preorder_traversal(self, root, output):
        """Append the values of the subtree at *root* to *output*, root-left-right."""
        if root:
            output.append(root.value)
            self.preorder_traversal(root.left, output)
            self.preorder_traversal(root.right, output)
        return output

    # Postorder traversal
    # Left -> Right -> Root
    def postorder_traversal(self, root, output):
        """Append the values of the subtree at *root* to *output*, left-right-root."""
        if root:
            self.postorder_traversal(root.left, output)
            self.postorder_traversal(root.right, output)
            output.append(root.value)
        return output

    def buildList(self, start, phyloList, namesList):
        """Wrap *start* in nested two-item lists, one level per later entry.

        Entries of *phyloList* from position 2 onward each add one level of the
        form ``[name, previous_nesting]``.

        Args:
            start: The innermost item, normally a list of the first two names.
            phyloList: Sequence indices into *namesList*. Each entry may be an
                int or a tuple/list whose first element is the index.
            namesList: Names looked up by index.

        Returns:
            The nested list; *start* itself when *phyloList* has at most two
            entries.
        """
        nested = start
        for entry in phyloList[2:]:
            index = entry[0] if isinstance(entry, (tuple, list)) else entry
            nested = [namesList[index], nested]
        return nested

    def stringNames(self, nl):
        """Render a nested ``[name, rest]`` list as text such as ``(A,(B,C))``."""
        if isinstance(nl[1], list):
            inner = self.stringNames(nl[1])
        else:
            inner = nl[1]
        # The closing parenthesis keeps the expression balanced.
        return '(' + nl[0] + ',' + inner + ')'

    def drawTree(self, start, phyloList, namesList):
        """Build the nested name list and print it as an ete3 text tree.

        The nested list from buildList is converted with ``str()``, its
        brackets are replaced by parentheses and '/' by '_', and the result
        plus ';' is handed to ``ete3.Tree``.
        """
        # Imported lazily and aliased so it does not shadow this class.
        from ete3 import Tree as EteTree
        result = self.buildList(start, phyloList, namesList)
        tuplestring = str(result)
        tuplestring = tuplestring.replace('[', '(').replace(']', ')')
        tuplestring = tuplestring.replace('/', '_')
        TextTree = EteTree(tuplestring + ';')
        print("Mytree", TextTree)
# 1. To build a tree, use your dictionary, and sort the keys
def loadSeq(filename):
    """Read a FASTA file and return its sequences and strain names.

    Every line containing '>' is a header that starts a new record. The header
    is cut at the first comma, then the name is chosen as follows:

    * if the header contains '/' and its last '/' comes before its last space,
      the name is the space-delimited token that contains the first '/';
    * otherwise the name is everything after the last space.

    All other lines are concatenated, without separators, into the sequence of
    the most recent header; lines before the first header are ignored.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        A ``(seqsList, namesList)`` tuple in which ``seqsList[i]`` belongs to
        ``namesList[i]``. Both lists are empty for an empty file.
    """
    seqsList = []
    namesList = []
    # Lines of the record being read; None until the first header is seen.
    tempList = None

    # The context manager closes the file even if parsing raises.
    with open(filename, "r") as f:
        for line in f:
            line = line.rstrip('\r\n')
            if '>' in line:
                # Close the previous record before starting a new one so both
                # lists always stay the same length.
                if tempList is not None:
                    seqsList.append(''.join(tempList))
                tempList = []

                # partition() leaves the header intact when there is no comma.
                header = line.partition(",")[0]
                firstSlash = header.find("/")
                if firstSlash != -1 and header.rfind("/") < header.rfind(" "):
                    # Text follows the slashed name: take the token around the
                    # first '/', from the preceding space to the next space.
                    nameStart = header.rfind(" ", 0, firstSlash) + 1
                    name = header[nameStart:].split(" ", 1)[0]
                else:
                    name = header[header.rfind(" ") + 1:]
                namesList.append(name)
            elif tempList is not None:
                tempList.append(line)

    # Flush the final record, which has no following header to trigger it.
    if tempList is not None:
        seqsList.append(''.join(tempList))
    return seqsList, namesList
# Reload the data with the loadSeq defined directly above (it replaces the earlier definition of the same name)
path = '/content/drive/MyDrive/Colab Notebooks/CSC219/PROJECT 1 /datafiles/'
seqsList, namesList = loadSeq(path + 'covidstrainpractice.fasta')
print(namesList)
def compareSeq(seqsList):
    """Print a labelled pairwise difference matrix and group pairs by count.

    A header row of column indices is printed first. Then, for each ``row``
    from 1 upward, the row index is printed followed by the difference against
    every ``col`` below it. The difference is the number of mismatching
    positions over the shared prefix plus the difference in length.

    Args:
        seqsList: Sequence of strings to compare pairwise.

    Returns:
        A dict mapping each difference count to the list of ``(row, col)``
        index pairs that produced it, in the order they were compared.
    """
    difDic = {}
    length = len(seqsList)

    # Header row: one right-aligned column index per sequence.
    print('\t', end="")
    for i in range(length):
        # rjust needs a single fill character; an empty string raises TypeError.
        print(str(i).rjust(6, ' '), end="")
    print('\r')

    for row in range(1, length):
        print(row, ('\t'), end="")
        rowSeq = seqsList[row]
        for col in range(row):
            colSeq = seqsList[col]
            # The unmatched tail of the longer sequence counts entirely as
            # differences; zip() then compares only the shared prefix.
            diff = abs(len(colSeq) - len(rowSeq))
            diff += sum(a != b for a, b in zip(rowSeq, colSeq))
            print(str(diff).rjust(6, ' '), end="")
            difDic.setdefault(diff, []).append((row, col))
        print("\r")
    return difDic
"""#Assignment for Project 3
##1. **[40 points]** Using insert(), build a tree called "newTree" of values by iterating through your dictionary.keys() list. (Feel free to call your newTree another name. When I reference "newTree" below, just insert your own tree name.)
Each node of your tree will contain the value. The value is a tuple.
Each tuple has a key (Sequence difference) and tupleList
The insert() will sort the nodes, based on the difference key,
```
For example these are values each node will contain
Value : (6, [(3, 1), (3, 2)]); 6 is sequence difference key , [(3,1),(3,2)] is the tupleList
Value : (10, [(2, 1)])
```
"""
# Step 1: build the tree from the {difference: [(row, col), ...]} dictionary.
newDict = compareSeq(seqsList)
print(newDict)
newTree = Tree()
# Each dictionary item is already the (difference, pairs) tuple the tree stores.
for tuples in newDict.items():
    print(tuples)
    newTree.insert(tuples)
"""##2.**[45 points]** Use get_tree() with the 'inorder' type to generate a list of sorted tree. Save it into a list.
When printed the output should look like this. Note that it is already sorted because we used inorder traversal.
```
outputlist :
[(3, [(8, 6)]), (4, [(4, 3)]), (6, [(3, 1), (3, 2)]), (8, [(4, 1), (4, 2)]), ...
```
"""
# Step 2: get_tree prints the in-order traversal and returns it as a list
# (or the string "Tree is empty" when nothing was inserted).
myList = newTree.get_tree('inorder')
"""##3.**[45 points]** Once your output list is sorted, then iterate through it and create a list that will contain only unique values of the sequence indices.
When printed the list would look like this below. I called mine a "phylolist".
```
phylolist:
[8, 6, 3, 4, 1, 2, 0, 5, 7, 9]
```
"""
# Step 3: walk the sorted (difference, pairs) values and collect each pair once.
# (The original printed phyloList before defining it, which raised NameError.)
# NOTE(review): the assignment text above asks for unique sequence *indices*
# (a flat list of ints). This list holds (row, col) pairs instead, and the
# code further down relies on that by indexing phyloList[i][0] -- confirm the
# intended structure before changing it.
phyloList = []
seen_indices = set()
for value in myList:
    indices = value[1]  # the list of (row, col) pairs for this difference
    for index in indices:
        if index not in seen_indices:
            phyloList.append(index)
            seen_indices.add(index)
print(phyloList)
"""##4. **[45 points]** Add the following method in your Tree class to build a list of lists that contain the pairs:
```
def buildList(self, start, phyloList,namesList):
for i in range(2, len(phyloList)):
temp = []
temp.append(namesList[phyloList[i]])
temp.append(start)
start = temp
return start
```
Use the buildList method to start the tree with the first 2 values from names list by extracting the first 2 values from phylolist to get the corresponding names.
```
start = [namesList[phyloList[0]],namesList[phyloList[1]]]
nestedList = newTree.buildList(start,phyloList,namesList)
```
Once printed your nestedList should look like this
"""
# Step 4: seed the nesting with the names for the first two phyloList entries,
# then let buildList wrap one more name around it per remaining entry.
start = [namesList[phyloList[0][0]], namesList[phyloList[1][0]]] # Access the first element of each tuple
nestedList = newTree.buildList(start, phyloList, namesList)
print(nestedList)
# The list literal below is pasted sample output; as a bare expression statement it has no effect when run.
['SARS-CoV-2_human_USA_CA-CZB046', ['SARS-CoV-2_human_USA_CA-CZB-1237', ['SARS-CoV-2_human_USA_CA-CZB-1045', ['BetaCoV_Wuhan_IPBCAMS-WH-04_2019', ['SARS-CoV-2_human_France_40002VJ', ['SARS-CoV-2_human_CHN_SARS-CoV-2-MZ01', ['SARS-CoV-2_human_USA_CA-CZB-1024', ['SARS-CoV-2_human_USA_CA-CDC-0139', ['SARS-CoV-2_human_USA_CA-CZB-1239', 'SARS-CoV-2_human_USA_CA-CZB-1048']]]]]]]]]
"""##*5*.**[45 points]** Create a drawTree method that will draw the tree using ete3 library
```
def drawTree(self, start, phyloList,namesList):
from ete3 import Tree
*call the buildlist method and save it in result
*turn the result into a string
*replace brackets with parentheses
*once you have this format (A(B(C(D,E)))) in your string then enter into the Tree method. I called mine "tuplestring", see below.
*calling the Tree method from ete3
TextTree = Tree(tuplestring + ';')
print("Mytree", TextTree)
```
When the drawTree method is called by the following code it should have a printout like the below
newTree.drawTree(start,phyloList,namesList)
"""
# Step 5: draw the tree with ete3, seeded with the first two names.
start = [namesList[phyloList[0][0]], namesList[phyloList[1][0]]] # Access the first element of each tuple
newTree.drawTree(start, phyloList, namesList)
# Sample output pasted from a notebook run, kept as comments because the raw
# text is not valid Python and stopped the file from parsing:
# Mytree
# /-'SARS-CoV-2_human_USA_CA-CZB046'
# --|
# | /-'SARS-CoV-2_human_USA_CA-CZB-1237'
# \-|
# | /-'SARS-CoV-2_human_USA_CA-CZB-1045'
# \-|
# | /-'BetaCoV_Wuhan_IPBCAMS-WH-04_2019'
# \-|
# | /-'SARS-CoV-2_human_France_40002VJ'
# \-|
# | /-'SARS-CoV-2_human_CHN_SARS-CoV-2-MZ01'
# \-|
# | /-'SARS-CoV-2_human_USA_CA-CZB-1024'
# \-|
# | /-'SARS-CoV-2_human_USA_CA-CDC-0139'
# \-|
# | /-'SARS-CoV-2_human_USA_CA-CZB-1239'
# \-|
# \-'SARS-CoV-2_human_USA_CA-CZB-1048'
"""#Submit your ipynb file and your learning journey by the deadline on Canvas.
##1. Please submit in 1 Jupyter Notebook file, .ipynb file, in Canvas.
Name your jupyter notebook as “csc_219_proj3_firstname.ipynb”.
Please use your first name in place of "firstname".**[220 points]**
##2. Submit your learning journey
Then submit the link and your reflection report (500 words) to Canvas by the deadline. Your reflection report is about your learning journey – what you learned and what challenges you faced in each step listed above. **[30 points]**
"""