Skip to content

Commit afe2a94

Browse files
author
Hamid Gasmi
committed
#199 is in progress
1 parent 1f424f5 commit afe2a94

File tree

3 files changed

+375
-0
lines changed

3 files changed

+375
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,369 @@
1+
import sys
2+
from collections import namedtuple
3+
4+
Edge = namedtuple('Edge', ['sink', 'weight'])
5+
6+
Latest_Visited_Node = namedtuple('Latest_Visited_Node', ['node', 'adjacent_index', 'path_index'])
7+
8+
class Knuth_Morris_Pratt:
9+
10+
def build_text(self, text, pattern):
11+
12+
text_list = [ c for c in pattern ]
13+
text_list.append("$")
14+
for c in text:
15+
text_list.append(c)
16+
17+
return text_list
18+
19+
def are_sharing_kmer(self, text, pattern, k):
20+
21+
text_list = [ pattern[i] for i in range(len(pattern) - k, len(pattern)) ]
22+
text_list.append("$")
23+
for c in text:
24+
text_list.append(c)
25+
26+
sharing_kmer = False
27+
border = 0
28+
prefix = [0 for _ in range(len(text_list))]
29+
for i in range(1, len(text_list)):
30+
while border > 0 and text_list[i] != text_list[border]:
31+
border = prefix[border - 1]
32+
33+
if text_list[i] == text_list[border]:
34+
border += 1
35+
else:
36+
border = 0
37+
38+
prefix[i] = border
39+
40+
if prefix[i] == k:
41+
sharing_kmer = True
42+
break
43+
44+
#print(k, text, pattern, text_list, sharing_kmer, prefix)
45+
return sharing_kmer
46+
47+
def compute_prefix(self, text_list):
48+
49+
border = 0
50+
prefix = [0 for _ in range(len(text_list))]
51+
for i in range(1, len(text_list)):
52+
while border > 0 and text_list[i] != text_list[border]:
53+
border = prefix[border - 1]
54+
55+
if text_list[i] == text_list[border]:
56+
border += 1
57+
else:
58+
border = 0
59+
60+
prefix[i] = border
61+
62+
return prefix
63+
64+
def max_overlap(self, read_1, read_2):
65+
66+
text_list = self.build_text(read_1, read_2)
67+
68+
prefix = self.compute_prefix(text_list)
69+
70+
return(prefix[len(prefix) - 1])
71+
72+
# Class to store Trie(Patterns)
73+
# It handles all cases particularly the case where a pattern Pi is a subtext of a pattern Pj for i != j
74+
class Trie_Patterns:
75+
def __init__(self, patterns, start, end):
76+
self.build_trie(patterns, start, end)
77+
78+
# The trie will be a dictionary of dictionaries where:
79+
# ... The key of the external dictionary is the node ID (integer),
80+
# ... The internal dictionary:
81+
# ...... It contains all the trie edges outgoing from the corresponding node
82+
# ...... Its keys are the letters on those edges
83+
# ...... Its values are the node IDs to which these edges lead
84+
# Time Complexity: O(|patterns|)
85+
# Space Complexity: O(|patterns|)
86+
def build_trie(self, patterns, start, end):
87+
88+
self.trie = dict()
89+
self.trie[0] = dict()
90+
self.node_patterns_mapping = dict()
91+
self.max_node_no = 0
92+
for i in range(len(patterns)):
93+
self.insert(patterns[i] + '$', i, start, end + 1) # to handle the case where Pi is a substring of Pj for i != j
94+
95+
def insert(self, pattern, pattern_no, start, end):
96+
97+
(index, node) = self.search_text(pattern, start, end)
98+
#print(pattern, pattern_no, index, node)
99+
#if index == end + 1:
100+
# the text is already in the Trie
101+
#if not node in self.node_patterns_mapping:
102+
# self.node_patterns_mapping[node] = []
103+
# self.node_patterns_mapping[node].append(pattern_no)
104+
# return
105+
106+
for i in range(index, end + 1):
107+
c = pattern[i]
108+
self.max_node_no += 1
109+
self.trie[node][c] = self.max_node_no
110+
self.trie[self.max_node_no] = dict()
111+
node = self.max_node_no
112+
113+
if not node in self.node_patterns_mapping:
114+
self.node_patterns_mapping[node] = []
115+
self.node_patterns_mapping[node].append(pattern_no)
116+
117+
def search_text(self, pattern, start, end):
118+
if len(self.trie) == 0:
119+
return (0, -1)
120+
121+
node = 0
122+
i = start
123+
while i <= end:
124+
125+
c = pattern[i]
126+
127+
if pattern[i] in self.trie[node]:
128+
node = self.trie[node][c]
129+
i += 1
130+
continue
131+
132+
else:
133+
break
134+
135+
return (i, node)
136+
137+
# Prints the trie in the form of a dictionary of dictionaries
138+
# E.g. For the following patterns: ["AC", "T"] {0:{'A':1,'T':2},1:{'C':3}}
139+
def print_tree(self):
140+
for node in self.trie:
141+
for c in self.trie[node]:
142+
print("{}->{}:{}".format(node, self.trie[node][c], c))
143+
#print(self.node_patterns_mapping)
144+
145+
def search_in_pattern(self, text, start, end):
146+
if len(self.trie) == 0:
147+
return False
148+
149+
node = 0
150+
index = start
151+
while index <= end:
152+
153+
c = text[index]
154+
if not c in self.trie[node]:
155+
return False
156+
157+
node = self.trie[node][c]
158+
if '$' in self.trie[node]:
159+
return True
160+
else:
161+
index += 1
162+
163+
return False
164+
165+
# Time Complexity: O(|text| * |longest pattern|)
166+
def multi_pattern_matching(self, text, start, end):
167+
168+
if len(self.trie) == 0:
169+
return []
170+
171+
(i, node) = self.search_text(text[start : end + 1] + '$', start, end + 1)
172+
173+
#print("Yes", text[start:end + 1], start, end, node, self.node_patterns_mapping[node])
174+
175+
return self.node_patterns_mapping[node]
176+
177+
class Overlap_Graph:
178+
179+
def __init__(self, reads):
180+
181+
self._read_size = len(reads[0])
182+
183+
self._build_nodes(reads)
184+
self._build_adjacency_list_trie_kmp(reads)
185+
186+
def _build_nodes(self, reads):
187+
self.nodes = []
188+
self.adjacency_list = []
189+
190+
node_kmer_no_dict = dict()
191+
for read in reads:
192+
if read in node_kmer_no_dict:
193+
continue
194+
195+
node_kmer_no_dict[read] = len(self.nodes)
196+
self.nodes.append(read)
197+
self.adjacency_list.append([])
198+
199+
def _build_adjacency_list_naive(self, reads):
200+
201+
# 2. Build adjacency list: 2 reads are joined by a directed edge of weight = to the length of the max overlap of these 2 reads
202+
kmp = Knuth_Morris_Pratt()
203+
204+
for u in range(len(self.nodes)):
205+
206+
max_overlap = -1
207+
max_overlap_node_index = []
208+
209+
for v in range(len(self.nodes)):
210+
if u == v:
211+
continue
212+
213+
overlap = kmp.max_overlap(self.nodes[v], self.nodes[u])
214+
if overlap > max_overlap:
215+
max_overlap_node_index = [v]
216+
max_overlap = overlap
217+
218+
elif overlap == max_overlap:
219+
max_overlap_node_index.append(v)
220+
221+
for v in max_overlap_node_index:
222+
self.adjacency_list[v].append(Edge(u, max_overlap))
223+
224+
def _build_adjacency_list_kmp(self, reads):
225+
226+
# 2. Build adjacency list: 2 reads are joined by a directed edge of weight = to the length of the max overlap of these 2 reads
227+
kmp = Knuth_Morris_Pratt()
228+
229+
sharing_kmer_size = 12 if len(self.nodes[0]) > 12 else 1
230+
231+
for u in range(len(self.nodes)):
232+
sharing_kmer_reads = []
233+
234+
for v in range(len(self.nodes)):
235+
if u == v:
236+
continue
237+
238+
if kmp.are_sharing_kmer(self.nodes[u], self.nodes[v], sharing_kmer_size):
239+
sharing_kmer_reads.append(v)
240+
241+
max_overlap = -1
242+
max_overlap_node_index = []
243+
244+
for v in sharing_kmer_reads:
245+
246+
overlap = kmp.max_overlap(self.nodes[v], self.nodes[u])
247+
if overlap > max_overlap:
248+
max_overlap_node_index = [v]
249+
max_overlap = overlap
250+
251+
elif overlap == max_overlap:
252+
max_overlap_node_index.append(v)
253+
254+
for v in max_overlap_node_index:
255+
self.adjacency_list[v].append(Edge(u, max_overlap))
256+
257+
def _build_adjacency_list_trie_kmp(self, reads):
258+
259+
# Build adjacency list: 2 reads are joined by a directed edge of weight = to the length of the max overlap of these 2 reads
260+
sharing_kmer_size = 12 if self._read_size > 12 else 0
261+
262+
kmp = Knuth_Morris_Pratt()
263+
264+
trie = Trie_Patterns(self.nodes, self._read_size - sharing_kmer_size, self._read_size - 1)
265+
266+
for u in range(len(self.nodes)):
267+
268+
sharing_kmer_reads = trie.multi_pattern_matching(self.nodes[u], 0, sharing_kmer_size - 1)
269+
#print("1st step: ", self.nodes[u], sharing_kmer_reads)
270+
max_overlap = -1
271+
max_overlap_node_index = []
272+
273+
for v in sharing_kmer_reads:
274+
275+
if u == v:
276+
continue
277+
278+
overlap = kmp.max_overlap(self.nodes[v], self.nodes[u])
279+
if overlap > max_overlap:
280+
max_overlap_node_index = [v]
281+
max_overlap = overlap
282+
283+
elif overlap == max_overlap:
284+
max_overlap_node_index.append(v)
285+
286+
for v in max_overlap_node_index:
287+
self.adjacency_list[v].append(Edge(u, max_overlap))
288+
289+
def str_adjacency_list(self):
290+
291+
result_list = []
292+
for node in range(len(self.nodes)):
293+
if len(self.adjacency_list[node]) == 0:
294+
continue
295+
296+
node_adjacents = [ self.nodes[node] ]
297+
node_adjacents.append('->')
298+
for a in range(len(self.adjacency_list[node])):
299+
node_adjacents.append('(' + self.nodes[self.adjacency_list[node][a].sink] + ', ' + str(self.adjacency_list[node][a].weight) + ')')
300+
if a < len(self.adjacency_list[node]) - 1:
301+
node_adjacents.append(',')
302+
result_list.append(''.join(node_adjacents))
303+
304+
return '\n'.join(result_list)
305+
306+
def hamiltonian_path(self):
307+
308+
path = []
309+
visited = [False for _ in range(len(self.nodes))]
310+
311+
nodes_unvisited_edges_stack = [Latest_Visited_Node(0, -1, 0)]
312+
visited[0] = True
313+
path = [Edge(0, 0)]
314+
while len(nodes_unvisited_edges_stack) != 0:
315+
316+
latest_visited_node = nodes_unvisited_edges_stack.pop(len(nodes_unvisited_edges_stack) - 1)
317+
318+
# The path build so far isn't hamiltonian: we need to go back to the latest node with unvisited edges
319+
for i in range(len(path) - 1, latest_visited_node.path_index, -1):
320+
321+
visited[ path[i].sink ] = False
322+
path.pop(i)
323+
324+
node = self.adjacency_list[latest_visited_node.node][latest_visited_node.adjacent_index + 1].sink
325+
edge_node_weight = self.adjacency_list[latest_visited_node.node][latest_visited_node.adjacent_index + 1].weight
326+
if (latest_visited_node.adjacent_index + 1) < (len(self.adjacency_list[latest_visited_node.node]) - 1):
327+
#print("Yes: ", len(self.adjacency_list[latest_visited_node.node]), latest_visited_node)
328+
nodes_unvisited_edges_stack.append(Latest_Visited_Node(latest_visited_node.node, latest_visited_node.adjacent_index + 1, latest_visited_node.path_index))
329+
#print("node, next node: ", latest_visited_node.node, self.nodes[latest_visited_node.node], self.nodes[node], node, nodes_unvisited_edges_stack, path)
330+
331+
while not visited[node]:
332+
path.append(Edge(node, edge_node_weight))
333+
visited[node] = True
334+
text = '(' + str(node) + ' , ' + self.nodes[node] + ')'
335+
if len(self.adjacency_list[node]) > 1:
336+
nodes_unvisited_edges_stack.append(Latest_Visited_Node(node, 0, len(path) - 1))
337+
338+
edge_node_weight = self.adjacency_list[node][0].weight
339+
node = self.adjacency_list[node][0].sink
340+
text += ', (' + str(node) + ' , ' + self.nodes[node] + ')'
341+
#print("node, next node: ", text, nodes_unvisited_edges_stack, path)
342+
343+
if len(path) == len(self.nodes):
344+
path[0] = Edge(path[0].sink, edge_node_weight)
345+
break
346+
347+
node = path[0].sink
348+
genome_list = [self.nodes[node]]
349+
for i in range(1, len(path)):
350+
351+
node = path[i].sink
352+
overlap = path[i].weight
353+
if i == len(path) - 1:
354+
genome_list.append(self.nodes[node][overlap:(len(self.nodes[node]) - path[0].weight)])
355+
else:
356+
genome_list.append(self.nodes[node][overlap:])
357+
358+
return ''.join(genome_list)
359+
360+
if __name__ == '__main__':
361+
362+
reads = sys.stdin.read().strip().splitlines()
363+
364+
overlap_graph = Overlap_Graph(reads)
365+
366+
#print(overlap_graph.str_adjacency_list())
367+
368+
print(overlap_graph.hamiltonian_path())
369+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
ACGTTCGA
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
AAC
2+
ACG
3+
GAA
4+
GTT
5+
TCG

0 commit comments

Comments
 (0)