# yaseminsucu_csc_219_project_3_covid19tree.py

# -*- coding: utf-8 -*-
"""YaseminSucu_CSC 219_Project 3_Covid19Tree.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1PfzQSCf8M7mkw2t50oZHYP1J5hF6IVaU

#<h1>Project 3 - Covid19Tree  </h1>

##CSC 219 Project 3 (total 250 points)
**Due date: on Canvas**
"""

# Colab-only setup: the data file read further down is addressed through a
# path under /content/drive, so Google Drive is mounted there first.
from google.colab import drive
drive.mount('/content/drive')

"""#Code for Loading Data"""

!pip install ete3

# Module-level result holders. They are rebound by the "Start processing"
# section; loadSeq itself no longer appends to them.
seqsList = []
namesList = []


# input: path of a FASTA file
# output: (list of sequences, list of sequence names), in file order
def loadSeq(filename):
    """Read a FASTA file and return ``(sequences, names)``.

    A line containing ``>`` is treated as a header; every other line is a
    sequence fragment, and consecutive fragments are concatenated (newlines
    removed) into one sequence string.

    The name taken from a header is built as follows: the header is split on
    ``/``; the last space-delimited word of the first piece is joined with
    ``_`` to the next (up to) three pieces; the result is cut at the first
    comma, if there is one.

    Each call builds fresh lists, so calling the function repeatedly does not
    accumulate entries from earlier calls.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        A tuple ``(sequences, names)`` of two lists of strings.
    """
    sequences = []
    names = []
    fragments = []  # pieces of the sequence currently being assembled

    # The context manager closes the file even if reading fails.
    with open(filename) as f:
        lines = f.readlines()

    for line in lines:
        if '>' in line:
            # A new header ends the previous record, if it had any fragments.
            if fragments:
                sequences.append(''.join(fragments))
                fragments = []

            pieces = line.split('/')
            prefix = pieces[0]
            name = prefix[prefix.rfind(' ') + 1:] + '_' + '_'.join(pieces[1:4])
            if ',' in name:
                name = name[:name.index(',')]
            names.append(name)
        else:
            fragments.append(line.strip('\n'))

    # The last record has no following header to flush it; emit it here when
    # the file ended on a sequence line.
    if lines and '>' not in lines[-1]:
        sequences.append(''.join(fragments))

    return sequences, names

# compareSeq returns a dictionary that groups sequence pairs by how much they differ
def compareSeq(seqList):
    """Compare every pair of sequences and group the pairs by difference count.

    The difference between two sequences is the number of positions that
    disagree over their common prefix length, plus the difference in their
    lengths. Each difference is also printed, one row of the lower triangle
    per line.

    Args:
        seqList: Indexable collection of sequences.

    Returns:
        Dict mapping a difference count to the list of ``(row, col)`` index
        pairs (``col < row``) having that count, in the order visited.
    """
    pairsByDiff = {}

    for row in range(1, len(seqList)):
        rowSeq = seqList[row]
        for col in range(row):
            colSeq = seqList[col]

            # zip stops at the shorter sequence, i.e. the common prefix.
            mismatches = sum(1 for a, b in zip(rowSeq, colSeq) if a != b)
            diff = abs(len(colSeq) - len(rowSeq)) + mismatches

            pairsByDiff.setdefault(diff, []).append((row, col))

            print(str(diff).rjust(6, ' '), end=" ")
        print("\r")

    return pairsByDiff

# Start processing
# You can download covidstrainpractice.fasta from the course Canvas site
# The directory below is under the Drive mount point; the file name is appended to it.
path = '/content/drive/MyDrive/Colab Notebooks/CSC219/PROJECT 1 /datafiles/'  # GoogleDrive or your directory path to .fasta file
# Rebinds the module-level seqsList / namesList to the loader's results.
seqsList, namesList = loadSeq(path + 'covidstrainpractice.fasta')
print(namesList)
print(seqsList)

# Build the {difference: [(row, col), ...]} dictionary; compareSeq also prints
# each pairwise difference while it runs.
newDict = compareSeq(seqsList)
print(newDict)

"""#Code for Tree"""

class Node:
    """One node of the binary search tree: a value and two child links."""

    def __init__(self, value):
        self.value = value
        self.left = None
        self.right = None


class Tree:
    """Binary search tree whose nodes are ordered by ``value[0]``.

    Values are expected to be indexable (the script inserts
    ``(difference, pair_list)`` tuples); only element 0 is compared. A value
    whose key is already present is ignored.
    """

    def __init__(self):
        self.root = None
        self.output = []  # every Node created by insert(), in insertion order
        self.value = []   # not used by any method of this class; kept for compatibility

    def insert(self, value):
        """Insert ``value`` into the tree, ordered by ``value[0]``."""
        if self.root is None:
            self.root = Node(value)
            self.output.append(self.root)
        else:
            self.__insert(value, self.root)

    def __insert(self, value, current):
        """Walk down from ``current`` and attach ``value`` at the first free slot.

        Smaller keys go left, larger keys go right; an equal key means the
        value is already represented and nothing is inserted.
        """
        while True:
            if value[0] < current.value[0]:
                if current.left is None:
                    current.left = Node(value)
                    self.output.append(current.left)
                    return
                current = current.left
            elif value[0] > current.value[0]:
                if current.right is None:
                    current.right = Node(value)
                    self.output.append(current.right)
                    return
                current = current.right
            else:
                return  # duplicate key: keep the existing node

    def get_tree(self, type):
        """Print and return the tree's values in the requested traversal order.

        Args:
            type: ``'inorder'``, ``'postorder'`` or ``'preorder'``
                (case-insensitive). The parameter name shadows the builtin
                but is kept so keyword callers keep working.

        Returns:
            The list of node values in traversal order; an empty list for an
            unrecognised ``type``; the string ``"Tree is empty"`` when the
            tree has no root.
        """
        if self.root is None:
            return "Tree is empty"

        traversals = {
            'inorder': self.inorder_traversal,
            'postorder': self.postorder_traversal,
            'preorder': self.preorder_traversal,
        }
        output_arr = []
        walk = traversals.get(type.lower())
        if walk is not None:
            print(walk(self.root, output_arr))
        return output_arr

    # Inorder traversal
    # Left -> Root -> Right
    def inorder_traversal(self, root, output):
        """Append the values under ``root`` to ``output`` in inorder; return ``output``."""
        if root:
            # traverse to the left
            self.inorder_traversal(root.left, output)
            # record the value of root
            output.append(root.value)
            # traverse to the right
            self.inorder_traversal(root.right, output)
        return output

    def inorder_traversal_phylo(self, root, output):
        """Same result as :meth:`inorder_traversal`; kept as a separate entry point."""
        return self.inorder_traversal(root, output)

    # Preorder traversal
    # Root -> Left -> Right
    def preorder_traversal(self, root, output):
        """Append the values under ``root`` to ``output`` in preorder; return ``output``."""
        if root:
            # record the value of root
            output.append(root.value)
            # traverse to the left
            self.preorder_traversal(root.left, output)
            # traverse to the right
            self.preorder_traversal(root.right, output)
        return output

    # Postorder traversal
    # Left -> Right -> Root
    def postorder_traversal(self, root, output):
        """Append the values under ``root`` to ``output`` in postorder; return ``output``."""
        if root:
            # traverse to the left
            self.postorder_traversal(root.left, output)
            # traverse to the right
            self.postorder_traversal(root.right, output)
            # record the value of root
            output.append(root.value)
        return output

    def buildList(self, start, phyloList, namesList):
        """Wrap ``start`` in one ``[name, previous]`` layer per remaining entry.

        Entries 0 and 1 of ``phyloList`` are assumed to be represented by
        ``start`` already, so wrapping begins at entry 2.

        Args:
            start: Innermost element, normally a two-name list.
            phyloList: Sequence of indices into ``namesList``. An entry may be
                a plain index or a tuple/list whose first element is the index.
            namesList: Names addressed by those indices.

        Returns:
            The nested list; ``start`` itself when ``phyloList`` has fewer
            than three entries.
        """
        nested = start
        for entry in phyloList[2:]:
            index = entry[0] if isinstance(entry, (tuple, list)) else entry
            nested = [namesList[index], nested]
        return nested

    def stringNames(self, nl):
        """Render a nested ``[name, rest]`` list as ``(name,(name,...))``.

        ``rest`` is either another such list or the final name. Every opened
        parenthesis is closed, so the result is balanced.
        """
        if isinstance(nl[1], list):
            return '(' + nl[0] + ',' + self.stringNames(nl[1]) + ')'
        return '(' + nl[0] + ',' + nl[1] + ')'

    def drawTree(self, start, phyloList, namesList):
        """Build the nested name list and print it as an ete3 tree."""
        # Imported under an alias so the class name ``Tree`` is not shadowed.
        from ete3 import Tree as EteTree

        result = self.buildList(start, phyloList, namesList)
        # str() of a nested list keeps the quotes around each name; they end
        # up in the tree string handed to ete3.
        tuplestring = str(result)
        tuplestring = tuplestring.replace('[', '(').replace(']', ')')  # Replace brackets with parentheses
        tuplestring = tuplestring.replace('/', '_')  # Replace forward slashes with underscores

        TextTree = EteTree(tuplestring + ';')
        print("Mytree", TextTree)


# 1. To build a tree, use your dictionary, and sort the keys

def loadSeq(filename):
    """Read a FASTA file and return ``(seqsList, namesList)``.

    This later definition replaces the earlier ``loadSeq`` in the module
    namespace. It differs in the name it extracts: the text before the first
    comma of a header is taken, and the name is the space-delimited token
    that carries the ``/``-separated identifier (slashes are kept).

    Lines containing ``>`` are headers; all other lines are sequence
    fragments, concatenated without separators into one string per record.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        A tuple of two lists: the sequences and their names, in file order.
    """
    seqsList = []
    namesList = []
    tempList = []  # fragments of the record currently being read

    with open(filename, "r") as f:
        for line in f:
            if '>' in line:
                # A header closes the previous record.
                if tempList:
                    seqsList.append(''.join(tempList))
                    tempList = []

                # Drop the newline and anything after the first comma.
                header = line.rstrip('\n').split(',', 1)[0]
                lastSpace = header.rfind(' ')

                if '/' in header and header.rfind('/') < lastSpace:
                    # More words follow the identifier: start at the token
                    # holding the first '/', stop at the next space.
                    begin = header.rfind(' ', 0, header.find('/')) + 1
                    name = header[begin:].split(' ', 1)[0]
                else:
                    # The identifier is the last word of the header.
                    name = header[lastSpace + 1:]
                namesList.append(name)
            else:
                tempList.append(line.strip('\n'))

    # Flush the final record, which has no following header.
    if tempList:
        seqsList.append(''.join(tempList))

    return seqsList, namesList


def compareSeq(seqsList):
    """Print a labelled lower-triangular difference matrix and group pairs by difference.

    The difference of two sequences is the number of disagreeing positions
    over their common prefix length plus the difference of their lengths.

    Args:
        seqsList: Indexable collection of sequences.

    Returns:
        Dict mapping a difference count to the list of ``(row, col)`` pairs
        (``col < row``) with that count.
    """
    difDic = {}
    length = len(seqsList)

    # Column header row.
    print('\t', end="")
    for i in range(length):
        print(str(i).rjust(6, ' '), end="")
    print('\r')

    for row in range(1, length):
        print(row, ('\t'), end="")
        rowSeq = seqsList[row]

        for col in range(row):
            colSeq = seqsList[col]
            diff = abs(len(colSeq) - len(rowSeq))

            # Compare only over the shorter of the two sequences.
            end = min(len(colSeq), len(rowSeq))
            for i in range(end):
                if rowSeq[i] != colSeq[i]:
                    diff += 1
            print(str(diff).rjust(6, ' '), end="")

            if diff in difDic:
                difDic[diff].append((row, col))
            else:
                difDic[diff] = [(row, col)]
        print("\r")
    return difDic

"""#Assignment for Project 3

##1. **[40 points]** Using insert(), build a tree called "newTree" of values by iterating through your dictionary.keys() list. (Feel free to call your newTree another name. When I reference "newTree" below, just insert your own tree name.)

Each node of your tree will contain the value. The value is a tuple.

Each tuple has a key (Sequence difference) and tupleList

The insert() will sort the nodes, based on the difference key,

```
For example these are values each node will contain
Value : (6, [(3, 1), (3, 2)]); 6 is sequence difference key , [(3,1),(3,2)] is the tupleList
Value : (10, [(2, 1)])
```
"""

# Build the tree from the difference dictionary: each node value is a
# (difference, [(row, col), ...]) tuple.

newDict = compareSeq(seqsList)
print(newDict)

newTree = Tree()

# insert() orders nodes by the first element of the tuple, i.e. the difference.
for k, v in newDict.items():
    tuples = (k, v)
    print(tuples)
    newTree.insert(tuples)

"""##2.**[45 points]** Use get_tree() with the 'inorder' type to generate a sorted list of the tree's values. Save it in a list.

When printed the output should look like this. Note that it is already sorted because we used inorder traversal.

```
outputlist :

[(3, [(8, 6)]), (4, [(4, 3)]), (6, [(3, 1), (3, 2)]), (8, [(4, 1), (4, 2)]), ...

```
"""

# get_tree prints the traversal and returns it; inorder yields the values
# sorted by difference.
myList = newTree.get_tree('inorder')

"""##3.**[45 points]** Once your output list is sorted, then iterate through it and create a list that will contain only unique values of the sequence indices.

When printed the list would look like this below. I called mine a "phylolist".

```
phylolist:

[8, 6, 3, 4, 1, 2, 0, 5, 7, 9]
```

"""

# Walk the sorted tree output and collect each index pair the first time it
# is seen. (The original printed phyloList before it was defined, which
# raised NameError; that premature print is removed.)
#
# NOTE(review): the entries stored here are whole (row, col) tuples, not the
# flat sequence indices shown in the assignment text. The code further down
# indexes phyloList[i][0], so the shape is left unchanged here — confirm
# whether flat, de-duplicated indices were intended.
phyloList = []
seen_indices = set()

for value in myList:
    indices = value[1]  # Get the tupleList containing the indices
    for index in indices:
        if index not in seen_indices:
            phyloList.append(index)
            seen_indices.add(index)

print(phyloList)

"""##4. **[45 points]** Add the following method in your Tree class to build a list of lists that contain the pairs:

```
    def buildList(self, start, phyloList,namesList):
        for i in range(2, len(phyloList)):
            temp = []
            temp.append(namesList[phyloList[i]])
            temp.append(start)
            start = temp  
        return start

```

Use the buildList method to start the tree with the first 2 values from names list by extracting the first 2 values from phylolist to get the corresponding names.

```
start = [namesList[phyloList[0]],namesList[phyloList[1]]]
nestedList = newTree.buildList(start,phyloList,namesList)
```

Once printed your nestedList should look like this
"""

# Seed the nested list with the names for the first two phyloList entries.
# Each entry is a (row, col) tuple here, so [0] selects its row index.
start = [namesList[phyloList[0][0]], namesList[phyloList[1][0]]]  # Access the first element of each tuple
# buildList wraps the seed with one [name, previous] layer per remaining entry.
nestedList = newTree.buildList(start, phyloList, namesList)
print(nestedList)

# Output:
# ['SARS-CoV-2_human_USA_CA-CZB046', ['SARS-CoV-2_human_USA_CA-CZB-1237', ['SARS-CoV-2_human_USA_CA-CZB-1045', ['BetaCoV_Wuhan_IPBCAMS-WH-04_2019', ['SARS-CoV-2_human_France_40002VJ', ['SARS-CoV-2_human_CHN_SARS-CoV-2-MZ01', ['SARS-CoV-2_human_USA_CA-CZB-1024', ['SARS-CoV-2_human_USA_CA-CDC-0139', ['SARS-CoV-2_human_USA_CA-CZB-1239', 'SARS-CoV-2_human_USA_CA-CZB-1048']]]]]]]]]

"""##*5*.**[45 points]** Create a drawTree method that will draw the tree using ete3 library

```

 def drawTree(self, start, phyloList,namesList):
        from ete3 import Tree

        *call the buildlist method and save it in result
        *turn the result into a string
        *replace brackets with parentheses
        
        *once you have this format (A(B(C(D,E)))) in your string then enter into the Tree method. I called mine "tuplestring", see below.

        *calling the Tree method from ete3
        TextTree = Tree(tuplestring + ';')
        print("Mytree", TextTree)
```

When the drawTree method is called by the following code it should have a printout like the below

newTree.drawTree(start,phyloList,namesList)

"""

# Same seed as for buildList: the names for the first two phyloList entries.
start = [namesList[phyloList[0][0]], namesList[phyloList[1][0]]]  # Access the first element of each tuple

# drawTree rebuilds the nested list itself and prints the resulting ete3 tree.
newTree.drawTree(start, phyloList, namesList)

# Output:
# Mytree
#    /-'SARS-CoV-2_human_USA_CA-CZB046'
# --|
#   |   /-'SARS-CoV-2_human_USA_CA-CZB-1237'
#    \-|
#      |   /-'SARS-CoV-2_human_USA_CA-CZB-1045'
#       \-|
#         |   /-'BetaCoV_Wuhan_IPBCAMS-WH-04_2019'
#          \-|
#            |   /-'SARS-CoV-2_human_France_40002VJ'
#             \-|
#               |   /-'SARS-CoV-2_human_CHN_SARS-CoV-2-MZ01'
#                \-|
#                  |   /-'SARS-CoV-2_human_USA_CA-CZB-1024'
#                   \-|
#                     |   /-'SARS-CoV-2_human_USA_CA-CDC-0139'
#                      \-|
#                        |   /-'SARS-CoV-2_human_USA_CA-CZB-1239'
#                         \-|
#                            \-'SARS-CoV-2_human_USA_CA-CZB-1048'

"""#Submit your ipynb file and your learning journey by the deadline on Canvas.

##1. Please submit in 1 Jupyter Notebook file, .ipynb file, in Canvas.
Name your jupyter notebook as “csc_219_proj3_firstname.ipynb”.

Please use your first name in place of "firstname".**[220 points]**


##2. Submit your learning journey
Then submit the link and your reflection report (500 words) to Canvas by the deadline. Your reflection report is about your learning journey – what you learned and what challenges you faced in each of the steps listed above. **[30 points]**

"""