Skip to content

Commit c440157

Browse files
committed
add ahoCorasick, hoping to compile on ubuntu
1 parent 856ec0a commit c440157

File tree

4 files changed

+141
-0
lines changed

4 files changed

+141
-0
lines changed

ahoCorasick/ahoc.c

+86
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
2+
/*
 * One state of the Aho-Corasick automaton. States live in a flat array
 * supplied by the caller of string_match().
 *
 * The field order is kept exactly as it was: reordering would change the
 * in-memory layout of the node array.
 */
typedef struct Node Node;

struct Node {
    Node* parent;          /* trie parent; null only for the root */
    Node* next[26];        /* transition per letter, indexed by (ch - 'A');
                              missing entries are filled lazily by getNext() */
    char letter;           /* zero-based letter on the edge leading here */
    Node* longest_suffix;  /* cached suffix link; null until computed and
                              always null for the root */
    int count;             /* number of segments ending here, offset by
                              -100000 until getScore() has resolved it */
};
9+
10+
Node* getNext(Node* node, char letter);
11+
12+
/*
 * Return the suffix link of `node`: the state for the longest proper suffix
 * of node's string that is also present in the automaton.
 *
 * The link is computed on first use and cached in node->longest_suffix.
 * The root has no suffix link, so null is returned for it.
 */
Node* getLongestSuffix(Node* node) {
    /* Already cached, or we are the root (which never gets a link). */
    if (node->longest_suffix != 0 || node->parent == 0) {
        return node->longest_suffix;
    }

    Node* parent_link = getLongestSuffix(node->parent);

    /*
     * A null parent link means the parent is the root, so our longest
     * proper suffix is the empty string, i.e. the root itself. Otherwise
     * follow our own letter from the parent's suffix state.
     */
    node->longest_suffix = parent_link
        ? getNext(parent_link, node->letter)
        : node->parent;

    return node->longest_suffix;
}
25+
26+
/*
 * Return the state reached from `node` on the zero-based `letter`.
 *
 * Transitions that were not created while building the trie are resolved
 * on demand and stored back into node->next, so each (state, letter) pair
 * is computed at most once.
 */
Node* getNext(Node* node, char letter) {
    Node** slot = &node->next[letter];

    if (*slot != 0) {
        return *slot;
    }

    /*
     * Every node other than the root has at least the root as its longest
     * suffix, so defer to that suffix state. The root has nothing to fall
     * back to: it ignores the letter and stays where it is.
     */
    *slot = node->parent
        ? getNext(getLongestSuffix(node), letter)
        : node;

    return *slot;
}
38+
39+
/*
 * Return the number of segments that end at `node`, counting the segments
 * that end at any state on its suffix-link chain as well.
 *
 * Nodes created by string_match() start with count offset by -100000, so a
 * negative count marks "not resolved yet". The first call removes the
 * offset and folds in the suffix state's score; later calls return the
 * cached total.
 */
int getScore(Node* node) {
    if (node->count >= 0) {
        return node->count;  /* already resolved */
    }

    node->count += 100000;   /* strip the "unresolved" offset */

    if (node->parent) {
        Node* suffix = getLongestSuffix(node);
        node->count += getScore(suffix);
    }

    return node->count;
}
47+
48+
int string_match(char* dna, int dnalen, char* segments, int segment_count, void* data) {
49+
int usedNodes = 0;
50+
Node* nodes = (Node*) data;
51+
52+
Node* root = &nodes[usedNodes++];
53+
54+
for(int i = 0; i < segment_count; i++) {
55+
Node* node = root;
56+
char c;
57+
while((c = *segments)) {
58+
c-=65;
59+
if(node->next[c] == 0) {
60+
Node* newNode = &nodes[usedNodes++];
61+
newNode->parent = node;
62+
newNode->letter = c;
63+
newNode->count = -100000;
64+
node->next[c] = newNode;
65+
}
66+
node = node->next[c];
67+
segments++;
68+
}
69+
node->count++;
70+
segments++;
71+
}
72+
73+
int score = 0;
74+
Node* node = root;
75+
for(int i = 0; i < dnalen; i++) {
76+
char c = dna[i]-65;
77+
node = getNext(node, c);
78+
score += getScore(node);
79+
}
80+
81+
return score;
82+
}
83+
84+
/*
 * Intentionally empty entry point. The file is built as an executable by
 * extract.py (gcc ahoc.c -o ahoc.o), which needs a main to link.
 */
int main(void)
{
    return 0;
}

ahoCorasick/ahoc.py

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import ctypes
2+
import mmap
3+
import array
4+
5+
# Resolve mmap(2) from the C library already loaded into this process so an
# anonymous read/write/execute mapping can be requested directly.
libc = ctypes.cdll.LoadLibrary(None)
mmap_function = libc.mmap
mmap_function.restype = ctypes.c_void_p
mmap_function.argtypes = (ctypes.c_void_p, ctypes.c_size_t,
                          ctypes.c_int, ctypes.c_int,
                          ctypes.c_int, ctypes.c_size_t)

CODE_SIZE = 10000    # bytes reserved for the machine code
DATA_SIZE = 100000   # bytes reserved for the native routine's scratch data

# mmap reports failure as (void *)-1. Because restype is c_void_p, ctypes
# hands the result back as an unsigned Python int (or None for NULL), so it
# never compares equal to -1; compare against the unsigned form instead.
MAP_FAILED = ctypes.c_void_p(-1).value

code_address = mmap_function(None, CODE_SIZE,
                             mmap.PROT_READ | mmap.PROT_WRITE | mmap.PROT_EXEC,
                             mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS,
                             -1, 0)
if code_address is None or code_address == MAP_FAILED:
    raise OSError('mmap failed to allocate memory')
19+
20+
import base64
21+
code=base64.standard_b64decode(b'8w8e+jHtSYnRXkiJ4kiD5PBQVEyNBZYEAABIjQ0fBAAASI09DQQAAP8Vki8AAPSQSI090S8AAEiNBcovAABIOfh0FUiLBW4vAABIhcB0Cf/gDx+AAAAAAMMPH4AAAAAASI09oS8AAEiNNZovAABIKf5IifBIwe4/SMH4A0gBxkjR/nQUSIsFRS8AAEiFwHQI/+BmDx9EAADDDx+AAAAAAPMPHvqAPV0vAAAAdTNVSIM9Ii8AAABIieV0DUiLPT4vAAD/FRAvAADoY////8YFNC8AAAFdw2YuDx+EAAAAAADDZmYuDx+EAAAAAAAPH0AA8w8e+uln////VUiJ5UiD7CBIiX3oSItF6EiLgOAAAABIhcB1YUiLRehIiwBIhcB0VUiLRehIiwBIicfoyf///0iJRfhIg334AHQpSItF6A+2gNgAAAAPvtBIi0X4idZIicfoLAAAAEiLVehIiYLgAAAA6xJIi0XoSIsQSItF6EiJkOAAAABIi0XoSIuA4AAAAMnDVUiJ5VNIg+woSIl92InwiEXUD75V1EiLRdhIY9JIi0TQCEiFwHVUSItF2EiLAEiFwHQ0SItF2EiJx+g3////SIlF6A++VdQPvl3USItF6InWSInH6Kf///9Ii1XYSGPLSIlEygjrFA++VdRIi0XYSGPSSItN2EiJTNAID75V1EiLRdhIY9JIi0TQCEiLXfjJw1VIieVIg+wQSIl9+EiLRfiLgOgAAACFwHlSSItF+IuA6AAAAI2QoIYBAEiLRfiJkOgAAABIi0X4SIsASIXAdCxIi0X4SInH6Jn+//9IicforP///4nCSItF+IuA6AAAAAHCSItF+ImQ6AAAAEiLRfiLgOgAAADJw1VIieVIg+xgSIl9uIl1tEiJVaiJTbBMiUWgx0XIAAAAAEiLRaBIiUXoi0XIjVABiVXISGPQSInQSMHgBEgp0EjB4ARIicJIi0XoSAHQSIlF8MdFzAAAAADp3AAAAEiLRfBIiUXY6ZsAAAAPtkXHg+hBiEXHD75Vx0iLRdhIY9JIi0TQCEiFwHVji0XIjVABiVXISGPQSInQSMHgBEgp0EjB4ARIicJIi0XoSAHQSIlF+EiLRfhIi1XYSIkQSItF+A+2VceIkNgAAABIi0X4x4DoAAAAYHn+/w++VcdIi0XYSGPSSItN+EiJTNAID75Vx0iLRdhIY9JIi0TQCEiJRdhIg0WoAUiLRagPtgCIRceAfccAD4VR////SItF2IuA6AAAAI1QAUiLRdiJkOgAAABIg0WoAYNFzAGLRcw7RbAPjBj////HRdAAAAAASItF8EiJReDHRdQAAAAA6z+LRdRIY9BIi0W4SAHQD7YAg+hBiEXGD75VxkiLReCJ1kiJx+hy/f//SIlF4EiLReBIicfo8/3//wFF0INF1AGLRdQ7RbR8uYtF0MnDVUiJ5bgAAAAAXcPzDx76QVdMjT27KQAAQVZJidZBVUmJ9UFUQYn8VUiNLawpAABTTCn9SIPsCOhv+///SMH9A3QfMdsPH4AAAAAATInyTInuRInnQf8U30iDwwFIOd116kiDxAhbXUFcQV1BXkFfw2ZmLg8fhAAAAAAA8w8e+sM=')
22+
# Copy the decoded machine code into the executable mapping, after checking
# that it fits.
assert len(code) <= CODE_SIZE
ctypes.memmove(code_address, code, len(code))

# Prototype of the native entry point:
#   int string_match(char *dna, int dnalen,
#                    char *segments, int segment_count, void *data)
string_match_type = ctypes.CFUNCTYPE(
    ctypes.c_int,
    ctypes.c_char_p,
    ctypes.c_int,
    ctypes.c_char_p,
    ctypes.c_int,
    ctypes.c_void_p,
)

# NOTE(review): 0x12ac is presumably the address of string_match and 0x1020
# the start of .text in the linked binary the blob was cut from. Confirm
# with objdump/nm and update both whenever the blob is regenerated.
string_match_c = ctypes.cast(code_address + (0x12ac - 0x1020), string_match_type)
27+
28+
# Scratch arena handed to the native routine, which lays its trie nodes out
# in it.
data = ctypes.create_string_buffer(DATA_SIZE)
data_address = ctypes.cast(data, ctypes.POINTER(ctypes.c_void_p))
# ctypes.memset takes (dst, fill_byte, count). The arguments used to be
# swapped, which asked for zero bytes to be written.
ctypes.memset(data_address, 0, DATA_SIZE)
31+
32+
def string_match(dna, segments):
    """Count occurrences of the given segments in ``dna``.

    Overlapping occurrences are counted, and a segment listed several times
    is counted once per listing.

    Args:
        dna: Text to scan; only the letters ``A``-``Z`` are allowed.
        segments: Iterable of strings to search for, same alphabet.

    Returns:
        Total number of occurrences as an ``int``.

    Raises:
        ValueError: If ``dna`` or a segment contains anything other than
            ``A``-``Z``, or if the segments need more trie nodes than the
            shared scratch arena can hold.
    """
    dna_buf = dna.encode('utf-8')
    seg_bufs = [segment.encode('utf-8') for segment in segments]

    # The native code indexes a 26-entry table with (byte - 65) and performs
    # no range check, so any other byte would read/write out of bounds.
    alphabet = b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    for buf in (dna_buf, *seg_bufs):
        if buf.translate(None, alphabet):
            raise ValueError('only the letters A-Z are supported: %r' % (buf,))

    # One node per distinct segment prefix plus the root must fit in the
    # arena; the native code does not check this itself.
    node_size = 240  # sizeof(Node) in ahoc.c as built for x86-64
    capacity = DATA_SIZE // node_size
    prefixes = set()
    for buf in seg_bufs:
        for end in range(1, len(buf) + 1):
            prefixes.add(buf[:end])
            if len(prefixes) + 1 > capacity:
                raise ValueError(
                    'segments need more than %d trie nodes' % capacity)

    # The arena is shared between calls; stale nodes from a previous call
    # would otherwise leak into this one.
    ctypes.memset(data_address, 0, DATA_SIZE)

    seg_buf = b'\0'.join(seg_bufs) + b'\0'
    return string_match_c(dna_buf, len(dna_buf), seg_buf, len(seg_bufs),
                          data_address)
39+
40+
# Smoke test, executed whenever this module is run or imported.
highscore = False  # not read anywhere in the code shown here

# "A" occurs twice, "ACT" twice and "GG" once, so this should print 5.
dna = "ACTTACTGG"
segments = ["A", "ACT", "GG"]
print(string_match(dna, segments))

ahoCorasick/extract.py

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#!/usr/bin/env python3
2+
3+
from subprocess import call
4+
import base64
5+
6+
# Build ahoc.c, cut the raw .text section out of the result and write it to
# text.py as a base64 literal that can be pasted into ahoc.py.
#
# Each tool's exit status is checked: carrying on after a failed step would
# silently encode a stale or missing text.o.
for command in (
    ["gcc", "ahoc.c", "-o", "ahoc.o"],
    ["objcopy", "--only-section=.text", "-O", "binary", "ahoc.o", "text.o"],
):
    status = call(command)
    if status != 0:
        raise SystemExit(f"{command[0]} failed with exit status {status}")

with open("text.o", "rb") as fil, open("text.py", "w") as ut:
    ut.write(f"base64.standard_b64decode({base64.standard_b64encode(fil.read())})")

ahoCorasick/text.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
base64.standard_b64decode(b'8w8e+jHtSYnRXkiJ4kiD5PBQVEyNBZYEAABIjQ0fBAAASI09DQQAAP8Vki8AAPSQSI090S8AAEiNBcovAABIOfh0FUiLBW4vAABIhcB0Cf/gDx+AAAAAAMMPH4AAAAAASI09oS8AAEiNNZovAABIKf5IifBIwe4/SMH4A0gBxkjR/nQUSIsFRS8AAEiFwHQI/+BmDx9EAADDDx+AAAAAAPMPHvqAPV0vAAAAdTNVSIM9Ii8AAABIieV0DUiLPT4vAAD/FRAvAADoY////8YFNC8AAAFdw2YuDx+EAAAAAADDZmYuDx+EAAAAAAAPH0AA8w8e+uln////VUiJ5UiD7CBIiX3oSItF6EiLgOAAAABIhcB1YUiLRehIiwBIhcB0VUiLRehIiwBIicfoyf///0iJRfhIg334AHQpSItF6A+2gNgAAAAPvtBIi0X4idZIicfoLAAAAEiLVehIiYLgAAAA6xJIi0XoSIsQSItF6EiJkOAAAABIi0XoSIuA4AAAAMnDVUiJ5VNIg+woSIl92InwiEXUD75V1EiLRdhIY9JIi0TQCEiFwHVUSItF2EiLAEiFwHQ0SItF2EiJx+g3////SIlF6A++VdQPvl3USItF6InWSInH6Kf///9Ii1XYSGPLSIlEygjrFA++VdRIi0XYSGPSSItN2EiJTNAID75V1EiLRdhIY9JIi0TQCEiLXfjJw1VIieVIg+wQSIl9+EiLRfiLgOgAAACFwHlSSItF+IuA6AAAAI2QoIYBAEiLRfiJkOgAAABIi0X4SIsASIXAdCxIi0X4SInH6Jn+//9IicforP///4nCSItF+IuA6AAAAAHCSItF+ImQ6AAAAEiLRfiLgOgAAADJw1VIieVIg+xgSIl9uIl1tEiJVaiJTbBMiUWgx0XIAAAAAEiLRaBIiUXoi0XIjVABiVXISGPQSInQSMHgBEgp0EjB4ARIicJIi0XoSAHQSIlF8MdFzAAAAADp3AAAAEiLRfBIiUXY6ZsAAAAPtkXHg+hBiEXHD75Vx0iLRdhIY9JIi0TQCEiFwHVji0XIjVABiVXISGPQSInQSMHgBEgp0EjB4ARIicJIi0XoSAHQSIlF+EiLRfhIi1XYSIkQSItF+A+2VceIkNgAAABIi0X4x4DoAAAAYHn+/w++VcdIi0XYSGPSSItN+EiJTNAID75Vx0iLRdhIY9JIi0TQCEiJRdhIg0WoAUiLRagPtgCIRceAfccAD4VR////SItF2IuA6AAAAI1QAUiLRdiJkOgAAABIg0WoAYNFzAGLRcw7RbAPjBj////HRdAAAAAASItF8EiJReDHRdQAAAAA6z+LRdRIY9BIi0W4SAHQD7YAg+hBiEXGD75VxkiLReCJ1kiJx+hy/f//SIlF4EiLReBIicfo8/3//wFF0INF1AGLRdQ7RbR8uYtF0MnDVUiJ5bgAAAAAXcPzDx76QVdMjT27KQAAQVZJidZBVUmJ9UFUQYn8VUiNLawpAABTTCn9SIPsCOhv+///SMH9A3QfMdsPH4AAAAAATInyTInuRInnQf8U30iDwwFIOd116kiDxAhbXUFcQV1BXkFfw2ZmLg8fhAAAAAAA8w8e+sM=')

0 commit comments

Comments
 (0)