Skip to content

Commit

Permalink
low coverage edges subcommand that takes a TSV of nodes and their cor…
Browse files Browse the repository at this point in the history
…risponding chromosomes and a GFA with edge counts like output of MBG and reports the edges that are connecting two different chromosomes
  • Loading branch information
fawaz-dabbaghieh committed May 3, 2022
1 parent 2286385 commit ab46b45
Show file tree
Hide file tree
Showing 6 changed files with 267 additions and 124 deletions.
66 changes: 47 additions & 19 deletions GFASubgraph/Graph.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from GFASubgraph.graph_io import read_gfa, write_gfa
from GFASubgraph.connected_components import all_components
from GFASubgraph.bfs import bfs
from GFASubgraph.Node import Node
import sys
import logging
import os
Expand All @@ -11,55 +12,73 @@ class Graph:
Graph object containing the important information about the graph
"""

__slots__ = ['nodes', 'b_chains', 'child_parent']
__slots__ = ['nodes', 'edge_counts']

def __init__(self, graph_file=None):
def __init__(self, graph_file=None, edge_count=False):
if graph_file is not None:
if not os.path.exists(graph_file):
print("graph file {} does not exist".format(graph_file))
print("Error! Check log file.")
logging.error("graph file {} does not exist".format(graph_file))
sys.exit()
# loading nodes from file
self.nodes = read_gfa(gfa_file_path=graph_file)
if edge_count:
self.edge_counts = self.get_edges_counts(graph_file)
else:
self.nodes = dict()
# elif graph_file.endswith(".vg"):
# self.nodes = read_vg(vg_file_path=graph_file, k=k, modified=modified, coverage=coverage)

self.b_chains = set() # list of BubbleChain objects
# self.bubbles = set()
# self.k = 1
self.child_parent = dict()
self.edge_counts = dict()

def __len__(self):
"""
overloading the length function
"""

return len(self.nodes)

def __str__(self):
"""
overloading the string function for printing
"""

return "The graph has {} Nodes and {} chains".format(
len(self.nodes), len(self.b_chains))
return "The graph has {} Nodes".format(len(self.nodes))

def __contains__(self, key):
"""
overloading the in operator to check if node exists in graph
"""

return key in self.nodes

def __getitem__(self, key):
"""
overloading the bracket operator
"""
try:
return self.nodes[key]
except KeyError:
return None

def __setitem__(self, key, value):
"""
overloading setting an item in nodes
"""
if isinstance(value, Node):
self.nodes[key] = value
else:
raise ValueError("the object given to set should be a Node object")

def __delitem__(self, key):
"""
overloading deleting item
"""
del self.nodes[key]

def reset_visited(self):
"""
resets all nodes.visited to false
"""

for n in self.nodes.values():
n.visited = False

# todo make remove start and remove end separate so I can use the same functions
# for removing one edge
def remove_node(self, n_id):
"""
remove a node and its corresponding edges
Expand All @@ -86,9 +105,7 @@ def remove_lonely_nodes(self):
"""
remove singular nodes with no neighbors
"""

nodes_to_remove = [n.id for n in self.nodes.values() if len(n.neighbors()) == 0]

for i in nodes_to_remove:
self.remove_node(i)

Expand All @@ -109,7 +126,6 @@ def write_graph(self, set_of_nodes=None,
write_gfa(self, set_of_nodes=set_of_nodes, output_file=output_file,
append=append)


def bfs(self, start, size):
"""
Returns a neighborhood of size given around start node
Expand All @@ -134,3 +150,15 @@ def output_components(self, output_dir):
logging.info("Writing Component {}...".format(output_file))

self.write_graph(set_of_nodes=cc, output_file=output_file, append=False)

def remove_edge(self, edge):
n1, side1, n2, side2, overlap = edge
if side1 == 0:
self.nodes[n1].remove_from_start(n2, side2, overlap)
else:
self.nodes[n1].remove_from_end(n2, side2, overlap)

if side2 == 0:
self.nodes[n2].remove_from_start(n1, side1, overlap)
else:
self.nodes[n2].remove_from_end(n1, side1, overlap)
45 changes: 35 additions & 10 deletions GFASubgraph/Node.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,34 @@
import sys
import logging


class Node:
__slots__ = ['id', 'seq', 'seq_len', 'start', 'end', 'visited', 'optional']
__slots__ = ['id', 'seq', 'seq_len', 'start', 'end', 'visited', 'optional', 'chromosome']

def __init__(self, identifier, kc=0, km=0):
def __init__(self, identifier):
self.id = identifier
self.seq = ""
self.seq_len = 0
self.start = []
self.end = []
self.seq_len = 0
self.start = set()
self.end = set()
self.visited = False
self.optional = ""
self.chromosome = None

def __sizeof__(self):
size = self.id.__sizeof__() + self.seq_len.__sizeof__() + self.visited.__sizeof__()

if len(self.start) == 0:
size += self.start.__sizeof__()
else:
for i in range(len(self.start)):
size += sys.getsizeof(self.start[i])
for i in self.start:
size += sys.getsizeof(i)

if len(self.end) == 0:
size += self.end.__sizeof__()
else:
for i in range(len(self.end)):
size += sys.getsizeof(self.end[i])
for i in self.end:
size += sys.getsizeof(i)

return size

Expand All @@ -42,7 +44,9 @@ def in_direction(self, node, direction):
"""
returns true if node is a neighbor in that direction
"""

# todo if I make start and end into dicts
# I can then easily check if the node in that direction by check if (node, 0) or (node, 1) in self.start
# same goes for self.end
if direction == 0:
if node in [x[0] for x in self.start]:
return True
Expand All @@ -63,3 +67,24 @@ def children(self, direction):
return [x[0] for x in self.end]
else:
raise Exception("Trying to access a wrong direction in node {}".format(self.id))

# todo add functions to add edges to start and end that graph_io can then use
# should maybe have an option whether the edge is (neighbor, direction, overlap) or
# (neighbor, direction, overlap, count)
def remove_from_start(self, neighbor, side, overlap):
"""
remove the neighbor edge from the start going to side in neighbor
"""
try:
self.start.remove((neighbor, side, overlap))
except KeyError:
logging.warning(f"Could not remove edge {(neighbor, side, overlap)} from {self.id}'s start as it does not exist")

def remove_from_end(self, neighbor, side, overlap):
"""
remove the neighbor edge from the end going to side in neighbor
"""
try:
self.end.remove((neighbor, side, overlap))
except KeyError:
logging.warning(f"Could not remove edge {(neighbor, side, overlap)} from {self.id}'s end as it does not exist")
99 changes: 86 additions & 13 deletions GFASubgraph/graph_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,12 @@ def write_gfa(graph, set_of_nodes=None,

# else:
if nodes[n1].optional: # if there are extra tags, write them as is
line = str("\t".join(["S", str(n1), nodes[n1].seq + nodes[n1].optional]))
line = str("\t".join(["S", str(n1), nodes[n1].seq, nodes[n1].optional]))
# line = str("\t".join(("S", str(n1), nodes[n1].seq, nodes[n1].optional)))
else:
line = str("\t".join(["S", str(n1), nodes[n1].seq]))

f.write(line + "\n")
f.write(line)

# writing edges
edges = []
Expand Down Expand Up @@ -107,10 +107,17 @@ def read_gfa(gfa_file_path):
for e in edges:
line = e.split()

k = str(line[1])
# I take the overlap in 5 and see if there are any more tags and make a dict out of them
k = line[1]
if k not in nodes: # if the edge is there but not the node
continue

overlap = int(line[5][:-1])

neighbor = str(line[3])
neighbor = line[3]
if neighbor not in nodes:
continue

if line[2] == "-":
from_start = True
else:
Expand All @@ -121,32 +128,98 @@ def read_gfa(gfa_file_path):
else:
to_end = False

if from_start and to_end : # from start to end L x - y -
if from_start and to_end: # from start to end L x - y -
if (neighbor, 1, overlap) not in nodes[k].start:
nodes[k].start.append((neighbor, 1, overlap))
nodes[k].start.add((neighbor, 1, overlap))
if (k, 0, overlap) not in nodes[neighbor].end:
nodes[neighbor].end.append((k, 0, overlap))
nodes[neighbor].end.add((k, 0, overlap))

elif from_start and not to_end: # from start to start L x - y +

if (neighbor, 0, overlap) not in nodes[k].start:
nodes[k].start.append((neighbor, 0, overlap))
nodes[k].start.add((neighbor, 0, overlap))

if (k, 0, overlap) not in nodes[neighbor].start:
nodes[neighbor].start.append((k, 0, overlap))
nodes[neighbor].start.add((k, 0, overlap))

elif not from_start and not to_end: # from end to start L x + y +
if (neighbor, 0, overlap) not in nodes[k].end:
nodes[k].end.append((neighbor, 0, overlap))
nodes[k].end.add((neighbor, 0, overlap))

if (k, 1, overlap) not in nodes[neighbor].start:
nodes[neighbor].start.append((k, 1, overlap))
nodes[neighbor].start.add((k, 1, overlap))

elif not from_start and to_end: # from end to end L x + y -
if (neighbor, 1, overlap) not in nodes[k].end:
nodes[k].end.append((neighbor, 1, overlap))
nodes[k].end.add((neighbor, 1, overlap))

if (k, 1, overlap) not in nodes[neighbor].end:
nodes[neighbor].end.append((k, 1, overlap))
nodes[neighbor].end.add((k, 1, overlap))

return nodes


def get_edges_counts(graph_file):
edge_counts = dict()
with open(graph_file, "r") as infile:
for l in infile:
count = -1
if l.startswith("L"):

line = l.split()
if len(line) < 7:
logging.warning(f"the edge {' '.join(line)} does not have counts")
count = 0
else:
if not line[6].startswith("ec"):
logging.warning(f'the edge {" ".join(line)} does not have the ec tag')
count = 0

# I take the overlap in 5 and see if there are any more tags and make a dict out of them
k = str(line[1])
overlap = int(line[5][:-1])
if count == -1:
count = int(line[6].split(":")[-1])

neighbor = str(line[3])
if line[2] == "-":
from_start = True
else:
from_start = False

if line[4] == "-":
to_end = True
else:
to_end = False

if from_start and to_end: # from start to end L x - y -
edge_counts[(k, 0, neighbor, 1, overlap)] = count
# if (neighbor, 1, overlap) not in nodes[k].start:
# nodes[k].start.add((neighbor, 1, overlap))
# if (k, 0, overlap) not in nodes[neighbor].end:
# nodes[neighbor].end.add((k, 0, overlap))

elif from_start and not to_end: # from start to start L x - y +
edge_counts[(k, 0, neighbor, 0, overlap)] = count
# if (neighbor, 0, overlap) not in nodes[k].start:
# nodes[k].start.add((neighbor, 0, overlap))
#
# if (k, 0, overlap) not in nodes[neighbor].start:
# nodes[neighbor].start.add((k, 0, overlap))

elif not from_start and not to_end: # from end to start L x + y +
edge_counts[(k, 1, neighbor, 0, overlap)] = count
# if (neighbor, 0, overlap) not in nodes[k].end:
# nodes[k].end.add((neighbor, 0, overlap))
#
# if (k, 1, overlap) not in nodes[neighbor].start:
# nodes[neighbor].start.add((k, 1, overlap))

elif not from_start and to_end: # from end to end L x + y -
edge_counts[(k, 1, neighbor, 1, overlap)] = count
# if (neighbor, 1, overlap) not in nodes[k].end:
# nodes[k].end.add((neighbor, 1, overlap))
#
# if (k, 1, overlap) not in nodes[neighbor].end:
# nodes[neighbor].end.add((k, 1, overlap))
return edge_counts
Loading

0 comments on commit ab46b45

Please sign in to comment.