|
| 1 | +import sys |
| 2 | +from collections import namedtuple |
| 3 | + |
| 4 | +Edge = namedtuple('Edge', ['sink', 'weight']) |
| 5 | + |
| 6 | +Latest_Visited_Node = namedtuple('Latest_Visited_Node', ['node', 'adjacent_index', 'path_index']) |
| 7 | + |
| 8 | +class Knuth_Morris_Pratt: |
| 9 | + |
| 10 | + def build_text(self, text, pattern): |
| 11 | + |
| 12 | + text_list = [ c for c in pattern ] |
| 13 | + text_list.append("$") |
| 14 | + for c in text: |
| 15 | + text_list.append(c) |
| 16 | + |
| 17 | + return text_list |
| 18 | + |
| 19 | + def are_sharing_kmer(self, text, pattern, k): |
| 20 | + |
| 21 | + text_list = [ pattern[i] for i in range(len(pattern) - k, len(pattern)) ] |
| 22 | + text_list.append("$") |
| 23 | + for c in text: |
| 24 | + text_list.append(c) |
| 25 | + |
| 26 | + sharing_kmer = False |
| 27 | + border = 0 |
| 28 | + prefix = [0 for _ in range(len(text_list))] |
| 29 | + for i in range(1, len(text_list)): |
| 30 | + while border > 0 and text_list[i] != text_list[border]: |
| 31 | + border = prefix[border - 1] |
| 32 | + |
| 33 | + if text_list[i] == text_list[border]: |
| 34 | + border += 1 |
| 35 | + else: |
| 36 | + border = 0 |
| 37 | + |
| 38 | + prefix[i] = border |
| 39 | + |
| 40 | + if prefix[i] == k: |
| 41 | + sharing_kmer = True |
| 42 | + break |
| 43 | + |
| 44 | + #print(k, text, pattern, text_list, sharing_kmer, prefix) |
| 45 | + return sharing_kmer |
| 46 | + |
| 47 | + def compute_prefix(self, text_list): |
| 48 | + |
| 49 | + border = 0 |
| 50 | + prefix = [0 for _ in range(len(text_list))] |
| 51 | + for i in range(1, len(text_list)): |
| 52 | + while border > 0 and text_list[i] != text_list[border]: |
| 53 | + border = prefix[border - 1] |
| 54 | + |
| 55 | + if text_list[i] == text_list[border]: |
| 56 | + border += 1 |
| 57 | + else: |
| 58 | + border = 0 |
| 59 | + |
| 60 | + prefix[i] = border |
| 61 | + |
| 62 | + return prefix |
| 63 | + |
| 64 | + def max_overlap(self, read_1, read_2): |
| 65 | + |
| 66 | + text_list = self.build_text(read_1, read_2) |
| 67 | + |
| 68 | + prefix = self.compute_prefix(text_list) |
| 69 | + |
| 70 | + return(prefix[len(prefix) - 1]) |
| 71 | + |
| 72 | +# Class to store Trie(Patterns) |
| 73 | +# It handles all cases particularly the case where a pattern Pi is a subtext of a pattern Pj for i != j |
| 74 | +class Trie_Patterns: |
| 75 | + def __init__(self, patterns, start, end): |
| 76 | + self.build_trie(patterns, start, end) |
| 77 | + |
| 78 | + # The trie will be a dictionary of dictionaries where: |
| 79 | + # ... The key of the external dictionary is the node ID (integer), |
| 80 | + # ... The internal dictionary: |
| 81 | + # ...... It contains all the trie edges outgoing from the corresponding node |
| 82 | + # ...... Its keys are the letters on those edges |
| 83 | + # ...... Its values are the node IDs to which these edges lead |
| 84 | + # Time Complexity: O(|patterns|) |
| 85 | + # Space Complexity: O(|patterns|) |
| 86 | + def build_trie(self, patterns, start, end): |
| 87 | + |
| 88 | + self.trie = dict() |
| 89 | + self.trie[0] = dict() |
| 90 | + self.node_patterns_mapping = dict() |
| 91 | + self.max_node_no = 0 |
| 92 | + for i in range(len(patterns)): |
| 93 | + self.insert(patterns[i] + '$', i, start, end + 1) # to handle the case where Pi is a substring of Pj for i != j |
| 94 | + |
| 95 | + def insert(self, pattern, pattern_no, start, end): |
| 96 | + |
| 97 | + (index, node) = self.search_text(pattern, start, end) |
| 98 | + #print(pattern, pattern_no, index, node) |
| 99 | + #if index == end + 1: |
| 100 | + # the text is already in the Trie |
| 101 | + #if not node in self.node_patterns_mapping: |
| 102 | + # self.node_patterns_mapping[node] = [] |
| 103 | + # self.node_patterns_mapping[node].append(pattern_no) |
| 104 | + # return |
| 105 | + |
| 106 | + for i in range(index, end + 1): |
| 107 | + c = pattern[i] |
| 108 | + self.max_node_no += 1 |
| 109 | + self.trie[node][c] = self.max_node_no |
| 110 | + self.trie[self.max_node_no] = dict() |
| 111 | + node = self.max_node_no |
| 112 | + |
| 113 | + if not node in self.node_patterns_mapping: |
| 114 | + self.node_patterns_mapping[node] = [] |
| 115 | + self.node_patterns_mapping[node].append(pattern_no) |
| 116 | + |
| 117 | + def search_text(self, pattern, start, end): |
| 118 | + if len(self.trie) == 0: |
| 119 | + return (0, -1) |
| 120 | + |
| 121 | + node = 0 |
| 122 | + i = start |
| 123 | + while i <= end: |
| 124 | + |
| 125 | + c = pattern[i] |
| 126 | + |
| 127 | + if pattern[i] in self.trie[node]: |
| 128 | + node = self.trie[node][c] |
| 129 | + i += 1 |
| 130 | + continue |
| 131 | + |
| 132 | + else: |
| 133 | + break |
| 134 | + |
| 135 | + return (i, node) |
| 136 | + |
| 137 | + # Prints the trie in the form of a dictionary of dictionaries |
| 138 | + # E.g. For the following patterns: ["AC", "T"] {0:{'A':1,'T':2},1:{'C':3}} |
| 139 | + def print_tree(self): |
| 140 | + for node in self.trie: |
| 141 | + for c in self.trie[node]: |
| 142 | + print("{}->{}:{}".format(node, self.trie[node][c], c)) |
| 143 | + #print(self.node_patterns_mapping) |
| 144 | + |
| 145 | + def search_in_pattern(self, text, start, end): |
| 146 | + if len(self.trie) == 0: |
| 147 | + return False |
| 148 | + |
| 149 | + node = 0 |
| 150 | + index = start |
| 151 | + while index <= end: |
| 152 | + |
| 153 | + c = text[index] |
| 154 | + if not c in self.trie[node]: |
| 155 | + return False |
| 156 | + |
| 157 | + node = self.trie[node][c] |
| 158 | + if '$' in self.trie[node]: |
| 159 | + return True |
| 160 | + else: |
| 161 | + index += 1 |
| 162 | + |
| 163 | + return False |
| 164 | + |
| 165 | + # Time Complexity: O(|text| * |longest pattern|) |
| 166 | + def multi_pattern_matching(self, text, start, end): |
| 167 | + |
| 168 | + if len(self.trie) == 0: |
| 169 | + return [] |
| 170 | + |
| 171 | + (i, node) = self.search_text(text[start : end + 1] + '$', start, end + 1) |
| 172 | + |
| 173 | + #print("Yes", text[start:end + 1], start, end, node, self.node_patterns_mapping[node]) |
| 174 | + |
| 175 | + return self.node_patterns_mapping[node] |
| 176 | + |
| 177 | +class Overlap_Graph: |
| 178 | + |
| 179 | + def __init__(self, reads): |
| 180 | + |
| 181 | + self._read_size = len(reads[0]) |
| 182 | + |
| 183 | + self._build_nodes(reads) |
| 184 | + self._build_adjacency_list_trie_kmp(reads) |
| 185 | + |
| 186 | + def _build_nodes(self, reads): |
| 187 | + self.nodes = [] |
| 188 | + self.adjacency_list = [] |
| 189 | + |
| 190 | + node_kmer_no_dict = dict() |
| 191 | + for read in reads: |
| 192 | + if read in node_kmer_no_dict: |
| 193 | + continue |
| 194 | + |
| 195 | + node_kmer_no_dict[read] = len(self.nodes) |
| 196 | + self.nodes.append(read) |
| 197 | + self.adjacency_list.append([]) |
| 198 | + |
| 199 | + def _build_adjacency_list_naive(self, reads): |
| 200 | + |
| 201 | + # 2. Build adjacency list: 2 reads are joined by a directed edge of weight = to the length of the max overlap of these 2 reads |
| 202 | + kmp = Knuth_Morris_Pratt() |
| 203 | + |
| 204 | + for u in range(len(self.nodes)): |
| 205 | + |
| 206 | + max_overlap = -1 |
| 207 | + max_overlap_node_index = [] |
| 208 | + |
| 209 | + for v in range(len(self.nodes)): |
| 210 | + if u == v: |
| 211 | + continue |
| 212 | + |
| 213 | + overlap = kmp.max_overlap(self.nodes[v], self.nodes[u]) |
| 214 | + if overlap > max_overlap: |
| 215 | + max_overlap_node_index = [v] |
| 216 | + max_overlap = overlap |
| 217 | + |
| 218 | + elif overlap == max_overlap: |
| 219 | + max_overlap_node_index.append(v) |
| 220 | + |
| 221 | + for v in max_overlap_node_index: |
| 222 | + self.adjacency_list[v].append(Edge(u, max_overlap)) |
| 223 | + |
| 224 | + def _build_adjacency_list_kmp(self, reads): |
| 225 | + |
| 226 | + # 2. Build adjacency list: 2 reads are joined by a directed edge of weight = to the length of the max overlap of these 2 reads |
| 227 | + kmp = Knuth_Morris_Pratt() |
| 228 | + |
| 229 | + sharing_kmer_size = 12 if len(self.nodes[0]) > 12 else 1 |
| 230 | + |
| 231 | + for u in range(len(self.nodes)): |
| 232 | + sharing_kmer_reads = [] |
| 233 | + |
| 234 | + for v in range(len(self.nodes)): |
| 235 | + if u == v: |
| 236 | + continue |
| 237 | + |
| 238 | + if kmp.are_sharing_kmer(self.nodes[u], self.nodes[v], sharing_kmer_size): |
| 239 | + sharing_kmer_reads.append(v) |
| 240 | + |
| 241 | + max_overlap = -1 |
| 242 | + max_overlap_node_index = [] |
| 243 | + |
| 244 | + for v in sharing_kmer_reads: |
| 245 | + |
| 246 | + overlap = kmp.max_overlap(self.nodes[v], self.nodes[u]) |
| 247 | + if overlap > max_overlap: |
| 248 | + max_overlap_node_index = [v] |
| 249 | + max_overlap = overlap |
| 250 | + |
| 251 | + elif overlap == max_overlap: |
| 252 | + max_overlap_node_index.append(v) |
| 253 | + |
| 254 | + for v in max_overlap_node_index: |
| 255 | + self.adjacency_list[v].append(Edge(u, max_overlap)) |
| 256 | + |
| 257 | + def _build_adjacency_list_trie_kmp(self, reads): |
| 258 | + |
| 259 | + # Build adjacency list: 2 reads are joined by a directed edge of weight = to the length of the max overlap of these 2 reads |
| 260 | + sharing_kmer_size = 12 if self._read_size > 12 else 0 |
| 261 | + |
| 262 | + kmp = Knuth_Morris_Pratt() |
| 263 | + |
| 264 | + trie = Trie_Patterns(self.nodes, self._read_size - sharing_kmer_size, self._read_size - 1) |
| 265 | + |
| 266 | + for u in range(len(self.nodes)): |
| 267 | + |
| 268 | + sharing_kmer_reads = trie.multi_pattern_matching(self.nodes[u], 0, sharing_kmer_size - 1) |
| 269 | + #print("1st step: ", self.nodes[u], sharing_kmer_reads) |
| 270 | + max_overlap = -1 |
| 271 | + max_overlap_node_index = [] |
| 272 | + |
| 273 | + for v in sharing_kmer_reads: |
| 274 | + |
| 275 | + if u == v: |
| 276 | + continue |
| 277 | + |
| 278 | + overlap = kmp.max_overlap(self.nodes[v], self.nodes[u]) |
| 279 | + if overlap > max_overlap: |
| 280 | + max_overlap_node_index = [v] |
| 281 | + max_overlap = overlap |
| 282 | + |
| 283 | + elif overlap == max_overlap: |
| 284 | + max_overlap_node_index.append(v) |
| 285 | + |
| 286 | + for v in max_overlap_node_index: |
| 287 | + self.adjacency_list[v].append(Edge(u, max_overlap)) |
| 288 | + |
| 289 | + def str_adjacency_list(self): |
| 290 | + |
| 291 | + result_list = [] |
| 292 | + for node in range(len(self.nodes)): |
| 293 | + if len(self.adjacency_list[node]) == 0: |
| 294 | + continue |
| 295 | + |
| 296 | + node_adjacents = [ self.nodes[node] ] |
| 297 | + node_adjacents.append('->') |
| 298 | + for a in range(len(self.adjacency_list[node])): |
| 299 | + node_adjacents.append('(' + self.nodes[self.adjacency_list[node][a].sink] + ', ' + str(self.adjacency_list[node][a].weight) + ')') |
| 300 | + if a < len(self.adjacency_list[node]) - 1: |
| 301 | + node_adjacents.append(',') |
| 302 | + result_list.append(''.join(node_adjacents)) |
| 303 | + |
| 304 | + return '\n'.join(result_list) |
| 305 | + |
| 306 | + def hamiltonian_path(self): |
| 307 | + |
| 308 | + path = [] |
| 309 | + visited = [False for _ in range(len(self.nodes))] |
| 310 | + |
| 311 | + nodes_unvisited_edges_stack = [Latest_Visited_Node(0, -1, 0)] |
| 312 | + visited[0] = True |
| 313 | + path = [Edge(0, 0)] |
| 314 | + while len(nodes_unvisited_edges_stack) != 0: |
| 315 | + |
| 316 | + latest_visited_node = nodes_unvisited_edges_stack.pop(len(nodes_unvisited_edges_stack) - 1) |
| 317 | + |
| 318 | + # The path build so far isn't hamiltonian: we need to go back to the latest node with unvisited edges |
| 319 | + for i in range(len(path) - 1, latest_visited_node.path_index, -1): |
| 320 | + |
| 321 | + visited[ path[i].sink ] = False |
| 322 | + path.pop(i) |
| 323 | + |
| 324 | + node = self.adjacency_list[latest_visited_node.node][latest_visited_node.adjacent_index + 1].sink |
| 325 | + edge_node_weight = self.adjacency_list[latest_visited_node.node][latest_visited_node.adjacent_index + 1].weight |
| 326 | + if (latest_visited_node.adjacent_index + 1) < (len(self.adjacency_list[latest_visited_node.node]) - 1): |
| 327 | + #print("Yes: ", len(self.adjacency_list[latest_visited_node.node]), latest_visited_node) |
| 328 | + nodes_unvisited_edges_stack.append(Latest_Visited_Node(latest_visited_node.node, latest_visited_node.adjacent_index + 1, latest_visited_node.path_index)) |
| 329 | + #print("node, next node: ", latest_visited_node.node, self.nodes[latest_visited_node.node], self.nodes[node], node, nodes_unvisited_edges_stack, path) |
| 330 | + |
| 331 | + while not visited[node]: |
| 332 | + path.append(Edge(node, edge_node_weight)) |
| 333 | + visited[node] = True |
| 334 | + text = '(' + str(node) + ' , ' + self.nodes[node] + ')' |
| 335 | + if len(self.adjacency_list[node]) > 1: |
| 336 | + nodes_unvisited_edges_stack.append(Latest_Visited_Node(node, 0, len(path) - 1)) |
| 337 | + |
| 338 | + edge_node_weight = self.adjacency_list[node][0].weight |
| 339 | + node = self.adjacency_list[node][0].sink |
| 340 | + text += ', (' + str(node) + ' , ' + self.nodes[node] + ')' |
| 341 | + #print("node, next node: ", text, nodes_unvisited_edges_stack, path) |
| 342 | + |
| 343 | + if len(path) == len(self.nodes): |
| 344 | + path[0] = Edge(path[0].sink, edge_node_weight) |
| 345 | + break |
| 346 | + |
| 347 | + node = path[0].sink |
| 348 | + genome_list = [self.nodes[node]] |
| 349 | + for i in range(1, len(path)): |
| 350 | + |
| 351 | + node = path[i].sink |
| 352 | + overlap = path[i].weight |
| 353 | + if i == len(path) - 1: |
| 354 | + genome_list.append(self.nodes[node][overlap:(len(self.nodes[node]) - path[0].weight)]) |
| 355 | + else: |
| 356 | + genome_list.append(self.nodes[node][overlap:]) |
| 357 | + |
| 358 | + return ''.join(genome_list) |
| 359 | + |
| 360 | +if __name__ == '__main__': |
| 361 | + |
| 362 | + reads = sys.stdin.read().strip().splitlines() |
| 363 | + |
| 364 | + overlap_graph = Overlap_Graph(reads) |
| 365 | + |
| 366 | + #print(overlap_graph.str_adjacency_list()) |
| 367 | + |
| 368 | + print(overlap_graph.hamiltonian_path()) |
| 369 | + |
0 commit comments