-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path29-FindPattern.py
More file actions
362 lines (297 loc) · 11.9 KB
/
29-FindPattern.py
File metadata and controls
362 lines (297 loc) · 11.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
#input
#AATCGGGTTCAATCGGGGT
#ATCG
#GGGT
#output
#1 4 11 15
__author__ = 'masaki'
import sys, itertools
oo = sys.maxint / 2
class Node:
def __init__( self, start, end ):
self.start = start
self.end = end
self.edges = {} # key: single character, value: pointer to node
self.link = 0
def edge_length( self, position ):
return min( self.end, position + 1 ) - self.start
def get_edges( self ):
return self.edges
def set_start( self, start ):
self.start = start
def get_start( self ):
return self.start
def get_end( self ):
return self.end
def set_end( self, end ):
self.end = end
def put_edge( self, key, val ):
self.edges[ key ] = val
def set_link( self, node ):
self.link = node
def get_link( self ):
return self.link
def get_edge( self, key ):
if key in self.edges:
return self.edges[ key ]
else:
return None
def str_val( self ):
string = "start: " + str( self.start ) + " end: " + str( self.end ) + " edges: " + str( self.edges )
return string
def increment_start( self, val ):
self.start += val
class SuffixTree:
def __init__( self, t_len ):
self.cur_node = 0
self.nodes_added = 0
self.nodes = [ Node( 0, oo ) ] * ( 2 * t_len + 2 )
self.text = ""
self.root = self.new_node( -1, -1 )
self.pos = -1
self.need_link = 0
self.r = 0 # the remainder of suffixes to add
self.active_node = self.root
self.active_length = 0
self.active_edge = 0
def add_char( self, c ):
self.text += c
self.pos += 1
self.need_link = -1
self.r += 1
# while there are suffixes to be added
while self.r > 0:
# set the active edge
# if the active edge is 0, set it to the current position
if self.active_length == 0:
self.active_edge = self.pos
# if active_edge is in active_nodes list of edges
if self.get_active_edge() not in self.nodes[self.active_node].get_edges(): # self.active_edge is equivalent to self.get_active_edge()
# the dege doesn't exist, create a new leaf node
leaf = self.new_node(self.pos, oo)
# add the leaf node as a child node and use the active_edge as the key for the edge
self.nodes[self.active_node].put_edge(self.get_active_edge(), leaf)
# check to see if you need to add a suffix link to the active node
self.add_suffix_link(self.active_node)
# else
else:
# the edge does exist
# set nex = the node to walk down
nex = self.nodes[self.active_node].get_edge(self.get_active_edge())
# walk_down nex
if self.walk_down(nex):
continue
# if the current character is the same as the character as pos active_length
# in the node nex
if self.text[self.nodes[nex].get_start() + self.active_length] == c:
# it is, so we increment the active length
self.active_length += 1
# we add a suffix link to the active node
self.add_suffix_link(self.active_node)
# no longer need to continue as the current string is subsumed in the
# the current node, so we break out of the while loop
break
# if the current character is not subsumed within the current node,
# this means that we must split a node
# create a new node called 'split' where:
# start = nex's start
# end = nex's start + the active_length
split = self.new_node(self.nodes[nex].get_start(), self.nodes[nex].get_start() + self.active_length)
# add split as an edge in active_node where the key is the active_edge
self.nodes[self.active_node].put_edge(self.get_active_edge(), split)
# createa new leaf, node( current_position, #)
leaf = self.new_node(self.pos, oo)
# add leaf as an edge to the node split with c as the key
self.nodes[split].put_edge(c, leaf)
# change the start position of nex by incrementing it by active_length
self.nodes[nex].increment_start(self.active_length)
# add nex as an edge to split using the character referred to by
# the new start position of nex
self.nodes[split].put_edge(self.text[self.nodes[nex].get_start()], nex)
# add a suffix link
self.add_suffix_link(split)
# decrement remainder because we've added a suffix
self.r -= 1
# if the active node is root and the active_length is > 0
if self.active_node == self.root and self.active_length > 0:
# decrement the active_length
self.active_length -= 1
# set active_edge to the current position - the number of suffixes to add + 1
self.active_edge = self.pos - self.r + 1
# else
else:
# set the active node to the node pointed to by the active_node's link if it has a link
self.active_node = self.nodes[self.active_node].get_link() if self.nodes[self.active_node].get_link() > 0 else self.root
# if it doesn't have a link, set it to root
def walk_down( self, nex ):
if self.active_length >= self.nodes[ nex ].edge_length( self.pos ):
self.active_edge += self.nodes[ nex ].edge_length( self.pos )
self.active_length -= self.nodes[ nex ].edge_length( self.pos )
self.active_node = nex
return True
return False
def in_depth_traverse( self, node, count, sa ):
if len(node.edges) == 0:
sa.append( node.start - count )
return node.start - count
count += node.end - node.start
for n in sorted(node.edges):
child = node.edges[n]
self.in_depth_traverse(self.nodes[ child ], count, sa)
def get_suffix_array( self, sa ):
self.in_depth_traverse( self.nodes[ self.root ], 0, sa )
return sa
def get_active_edge( self ):
return self.text[ self.active_edge ]
def new_node( self, start, end ):
self.cur_node += 1
self.nodes[ self.cur_node ] = Node( start, end )
self.nodes_added += 1
return self.cur_node
def add_suffix_link( self, node ):
if self.need_link > 0:
self.nodes[ self.need_link ].set_link( node )
self.need_link = node
def edge_string( self, node ):
start = self.nodes[ node ].get_start()
end = min( self.pos + 1, self.nodes[ node ].get_end() )
return text[ start : end ]
def print_tree(self ):
self.print_edges( self.root )
def print_graphviz_tree( self ):
print "digraph {"
print "\trankdir = LR;"
print "\tedge [arrowsize=0.4,fontsize=10]"
print "\tnode1 [label=\"\",style=filled,fillcolor=lightgrey,shape=circle,width=.1,height=.1];"
print "//------leaves------"
self.print_gv_leaves( self.root )
print "//------internal nodes------"
self.print_gv_internal_nodes( self.root )
print "//------edges------"
self.print_gv_edges( self.root )
print "//------suffix links------"
#self.print_gv_suffix_links( self.root )
print "}"
def print_edges( self, x ):
for key, child in self.nodes[ x ].get_edges().iteritems():
print self.edge_string( child )
self.print_edges( child )
"""def inorderTraversal( self, x ):
for key, child in self.nodes[ x ].get_edges().iteritems():
print child
self.inorderTraversal( child )"""
def print_gv_internal_nodes( self, x ):
if x != self.root and len( self.nodes[ x ].get_edges() ) > 0:
print "\tnode" + str( x ) + " [label=\"\",style=filled,fillcolor=lightgrey,shape=circle,width=.07,height=.07]"
for key, child in self.nodes[ x ].get_edges().iteritems():
self.print_gv_internal_nodes( child )
def print_gv_suffix_links( self, x ):
if self.nodes[ x ].get_link() > 0:
print "\tnode" + str( x ) + " -> node" + str( self.nodes[ x ].get_link() ) + " [label=\"\",weight=1,style=dotted]"
for key, child in self.nodes[ x ].get_edges().iteritems():
self.print_gv_suffix_links( child )
def print_gv_edges( self, x ):
for key, child in self.nodes[ x ].get_edges().iteritems():
print "\tnode" + str( x ) + " -> node" + str( child ) + " [label=\"" + self.edge_string( child ) + "\",weight=3]"
self.print_gv_edges( child )
def print_gv_leaves( self, x ):
if len( self.nodes[ x ].get_edges() ) == 0:
print "\tnode" + str( x ) + " [label=\"\",shape=point]"
else:
for key, child in self.nodes[ x ].get_edges().iteritems():
self.print_gv_leaves( child )
def genFirstBWT(sa, seq):
firstBWT = []
for i in sa:
firstBWT.append(seq[i])
return firstBWT
def genBWT(sa, seq):
BWT = []
for i in sa:
BWT.append(seq[i - 1])
return BWT
def genFirstToLast(firstBWT, BWT):
firstToLast = []
bpIndexes = dict()
bpIndexes["$"] = 0
bpIndexes["A"] = firstBWT.index("A")
bpIndexes["C"] = firstBWT.index("C")
bpIndexes["G"] = firstBWT.index("G")
bpIndexes["T"] = firstBWT.index("T")
for i in BWT:
firstToLast.append(bpIndexes[i])
bpIndexes[i] += 1
return firstToLast
def getUpperRange(char, list):
for i in reversed(range(len(list))):
if list[i] == char:
return i
def getLowerRange(char, list):
for i in range(len(list)):
if list[i] == char:
return i
def getIndexesInRange(char, BWT, firstToLast, lowerRange, upperRange):
indexesInBWT = []
indexesInFirstToLast = []
for i in xrange(lowerRange, upperRange + 1):
if BWT[i] == char:
indexesInBWT.append(firstToLast[i])
return indexesInBWT
def getLowestIndex(list):
if len(list) == 0:
return
lowestValue = list[0]
for i in list:
if i < lowestValue:
lowestValue = i
return lowestValue
def getHighestIndex(list):
if len(list) == 0:
return
highestValue = list[0]
for i in list:
if i > highestValue:
highestValue = i
return highestValue
def findSeq(query, sa, firstBWT, BWT, firstToLast):
occurences = []
lowerRange = getLowerRange(query[-1], firstBWT)
upperRange = getUpperRange(query[-1], firstBWT)
for char in reversed(query[:-1]):
indexesOfNextChar = getIndexesInRange(char, BWT, firstToLast, lowerRange, upperRange)
if len(indexesOfNextChar) == 0:
return None
lowerRange = getLowestIndex(indexesOfNextChar)
upperRange = getHighestIndex(indexesOfNextChar)
occurences.extend(sa[indexesOfNextChar[0]:indexesOfNextChar[-1] + 1])
return occurences
with open( sys.argv[ 1 ] ) as fh:
seq = fh.next().strip()
seq += "$"
queries = []
for query in fh:
queries.append(query.strip())
tree = SuffixTree( len( seq ) )
for char in seq:
tree.add_char( char )
if len( sys.argv ) > 2:
if sys.argv[ 2 ] == "gv":
tree.print_graphviz_tree()
#else:
#tree.print_tree()
sa = []
tree.get_suffix_array(sa)
firstBWT = genFirstBWT(sa, seq)
BWT = genBWT(sa, seq)
firstToLast = genFirstToLast(firstBWT, BWT)
occurences = []
for query in queries:
o = findSeq(query, sa, firstBWT, BWT, firstToLast)
if o != None:
occurences.extend(o)
for i in occurences:
print i,
#print sa
#print firstBWT
#print BWT
#print firstToLast