Skip to content

Commit e29a9d8

Browse files
committed
fix Aho-Corasick code to be callable several times
1 parent c440157 commit e29a9d8

File tree

4 files changed

+12
-6
lines changed

4 files changed

+12
-6
lines changed

ahoCorasick/ahoc.c

+2
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ int string_match(char* dna, int dnalen, char* segments, int segment_count, void*
8181
return score;
8282
}
8383

84+
//#include <stdio.h>
8485
int main()
8586
{
87+
//printf("%d\n", sizeof(Node));
8688
}

ahoCorasick/ahoc.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
ctypes.c_int, ctypes.c_int,
1010
ctypes.c_int, ctypes.c_size_t)
1111
CODE_SIZE = 10000
12-
DATA_SIZE = 100000
12+
DATA_SIZE = 50000000
1313
code_address = mmap_function(None, CODE_SIZE,
1414
mmap.PROT_READ | mmap.PROT_WRITE | mmap.PROT_EXEC,
1515
mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS,
@@ -18,23 +18,27 @@
1818
raise OSError('mmap failed to allocate memory')
1919

2020
import base64
21-
code=base64.standard_b64decode(b'8w8e+jHtSYnRXkiJ4kiD5PBQVEyNBZYEAABIjQ0fBAAASI09DQQAAP8Vki8AAPSQSI090S8AAEiNBcovAABIOfh0FUiLBW4vAABIhcB0Cf/gDx+AAAAAAMMPH4AAAAAASI09oS8AAEiNNZovAABIKf5IifBIwe4/SMH4A0gBxkjR/nQUSIsFRS8AAEiFwHQI/+BmDx9EAADDDx+AAAAAAPMPHvqAPV0vAAAAdTNVSIM9Ii8AAABIieV0DUiLPT4vAAD/FRAvAADoY////8YFNC8AAAFdw2YuDx+EAAAAAADDZmYuDx+EAAAAAAAPH0AA8w8e+uln////VUiJ5UiD7CBIiX3oSItF6EiLgOAAAABIhcB1YUiLRehIiwBIhcB0VUiLRehIiwBIicfoyf///0iJRfhIg334AHQpSItF6A+2gNgAAAAPvtBIi0X4idZIicfoLAAAAEiLVehIiYLgAAAA6xJIi0XoSIsQSItF6EiJkOAAAABIi0XoSIuA4AAAAMnDVUiJ5VNIg+woSIl92InwiEXUD75V1EiLRdhIY9JIi0TQCEiFwHVUSItF2EiLAEiFwHQ0SItF2EiJx+g3////SIlF6A++VdQPvl3USItF6InWSInH6Kf///9Ii1XYSGPLSIlEygjrFA++VdRIi0XYSGPSSItN2EiJTNAID75V1EiLRdhIY9JIi0TQCEiLXfjJw1VIieVIg+wQSIl9+EiLRfiLgOgAAACFwHlSSItF+IuA6AAAAI2QoIYBAEiLRfiJkOgAAABIi0X4SIsASIXAdCxIi0X4SInH6Jn+//9IicforP///4nCSItF+IuA6AAAAAHCSItF+ImQ6AAAAEiLRfiLgOgAAADJw1VIieVIg+xgSIl9uIl1tEiJVaiJTbBMiUWgx0XIAAAAAEiLRaBIiUXoi0XIjVABiVXISGPQSInQSMHgBEgp0EjB4ARIicJIi0XoSAHQSIlF8MdFzAAAAADp3AAAAEiLRfBIiUXY6ZsAAAAPtkXHg+hBiEXHD75Vx0iLRdhIY9JIi0TQCEiFwHVji0XIjVABiVXISGPQSInQSMHgBEgp0EjB4ARIicJIi0XoSAHQSIlF+EiLRfhIi1XYSIkQSItF+A+2VceIkNgAAABIi0X4x4DoAAAAYHn+/w++VcdIi0XYSGPSSItN+EiJTNAID75Vx0iLRdhIY9JIi0TQCEiJRdhIg0WoAUiLRagPtgCIRceAfccAD4VR////SItF2IuA6AAAAI1QAUiLRdiJkOgAAABIg0WoAYNFzAGLRcw7RbAPjBj////HRdAAAAAASItF8EiJReDHRdQAAAAA6z+LRdRIY9BIi0W4SAHQD7YAg+hBiEXGD75VxkiLReCJ1kiJx+hy/f//SIlF4EiLReBIicfo8/3//wFF0INF1AGLRdQ7RbR8uYtF0MnDVUiJ5bgAAAAAXcPzDx76QVdMjT27KQAAQVZJidZBVUmJ9UFUQYn8VUiNLawpAABTTCn9SIPsCOhv+///SMH9A3QfMdsPH4AAAAAATInyTInuRInnQf8U30iDwwFIOd116kiDxAhbXUFcQV1BXkFfw2ZmLg8fhAAAAAAA8w8e+sM=')
21+
code=base64.standard_b64decode(b'McDDZi4PH4QAAAAAAA8fAPMPHvox7UmJ0V5IieJIg+TwUFRMjQUWCAAASI0NnwcAAEiNPcj/////FYIvAAD0kEiNPcEvAABIjQW6LwAASDn4dBVIiwVeLwAASIXAdAn/4A8fgAAAAADDDx+AAAAAAEiNPZEvAABIjTWKLwAASCn+SInwSMHuP0jB+ANIAcZI0f50FEiLBTUvAABIhcB0CP/gZg8fRAAAww8fgAAAAADzDx76gD1NLwAAAHUzVUiDPRIvAAAASInldA1Iiz0uLwAA/xUALwAA6GP////GBSQvAAABXcNmLg8fhAAAAAAAw2ZmLg8fhAAAAAAADx9AAPMPHvrpZ////w8fgAAAAABBVFVTSA++3kiLRN8ISIXAdA5bXUFcw2YPH4QAAAAAAEiJ/UiLP0iF/3QlTIuF4AAAAEQPvuZNhcB0JUSJ5kyJx+i6////SIlE3QhbXUFcw0iJbN0ISInoW11BXMMPHwDoOwAAAEiJx0iFwHQbD7612AAAAOiH////SImF4AAAAEmJwOu2Dx8ATItFAEyJheAAAADrpmZmLg8fhAAAAAAAQVRTSIPsCEyLp+AAAABNheR0DUiDxAhMieBbQVzDZpBMiydIiftNheR06EmLvCTgAAAASIX/dBgPvrPYAAAA6Bz///9IiYPgAAAASYnE68NJizwkSIX/dC7oov///0iJx0iFwHQqQQ++tCTYAAAA6Oz+//9JiYQk4AAAAEiJx0iF/3W0TIsjTImj4AAAAOuDSYs8JEmJvCTgAAAA6+FmkIuH6AAAAIXAeAbDDx9EAABVBaCGAQBTSIn7SIPsCEiLL4mH6AAAAEiF7XQdSIu/4AAAAEiF/3QY6MP///8Dg+gAAACJg+gAAABIg8QIW13DSIu94AAAAEiF/3QcD76z2AAAAOhY/v//SImD4AAAAEiJx+vEDx9AAEiLfQBIhf90NOja/v//SInHSIXAdCwPvrXYAAAA6Cb+//9IiYXgAAAASInHSIX/dbNIiztIibvgAAAA64NIie/r8kiLfQBIib3gAAAA691mDx+EAAAAAABBV0GJ8UFWQVVBVFVTTInDSIPsGIXJD46aAAAASInWQYnKRTHbQbgBAAAADx9EAAAPtgZIidmEwHUV62IPH0AAD7ZGAUiDxgFIidGEwHRPg+hBSA++6EiLVOkISIXSdeBNY+BIg8YBQYPAAUyJ4kjB4gRMKeJIweIESAHaSIkKiILYAAAAx4LoAAAAYHn+/0iJVOkID7YGSInRhMB1sUGDwwGDgegAAAABSIPGAUU52g+Fev///0WFyQ+OlAIAAEGNQf9JifxFMe1MjXQHAesMSYPEAUEBxU055nRsQQ+2BCSD6EFID77oRA+++EiLROsISIXAdGpIicOLg+gAAACFwHnNTIs7BaCGAQCJg+gAAABNhf90ukiLq+AAAABIhe0PhPAAAACLhegAAACFwHhqA4PoAAAASYPEAYmD6AAAAEEBxU055nWUSIPEGESJ6FtdQVxBXUFeQV/DZg8fRAAASIsLSIXJdChIi5PgAAAASIXSD4TFAAAASItE6ghIhcB0XEiJROsISInD6Wr///+QSIlc6wjpX////0iLVQAFoIYBAImF6AAAAEiF0nSCTIu94AAAAE2F/w+EnQEAAEGLh+gAAACFwA+IRgEAAAOF6AAAAImF6AAAAOlS////TIsCTYXAD4QcAQAASIu64AAAAEiF/w+EiAEAAESJ/kiJVCQI6On7//9Ii1QkCEiJROoI6Wz///9Ji7/gAAAASIX/dEIPvrPYAAAA6ML7//9IiYPgAAAASInF6en+//9Ii7ngAAAASIX/dGMPvrPYAAAA6Jv7//9IiYPgAAAASInC6RT///9Jiz9Ihf8PhFABAADoG/z//0iJx0iFwA+EqgEAAEEPvrfYAAAA6GL7//9JiYfgAAAASInHSIX/dYVIiytIiav
gAAAA6Xr+//9IizlIhf8PhBABAABIiUwkCOjO+///SItMJAhIhcBIiccPhHYBAAAPvrHYAAAA6BH7//9Ii0wkCEiJx0iJgeAAAABIhf8PhVL///9IixNIiZPgAAAA6XL+//9IiVTqCEiJ0Olv/v//TYsHBaCGAQBBiYfoAAAATYXAD4Si/v//SYu/4AAAAEiF/w+EHQEAAOjw+///QQOH6AAAAEGJh+gAAADpev7//0Ux7ent/f//SIu64AAAAEiF/3RpD7612AAAAOh9+v//SImF4AAAAEmJx+k8/v//TInHSIlUJAjoAfv//0iLVCQISIXASInHD4SaAAAAD76y2AAAAOhE+v//SItUJAhIicdIiYLgAAAA6Tr+//9Mif3p2P7//0iJyukq////SIs6SIX/D4S0AAAASIlUJAjoqvr//0iLVCQISIXASInHD4SNAAAAD76y2AAAAOjt+f//SItUJAhIicdIiYLgAAAASIX/D4VM////TIt9AEyJveAAAADpk/3//0mLP0mJv+AAAADpXv7//0iLOkiJuuAAAADpsf3//0iLOUiJueAAAADplv7//0yJx+gw+v//SInHSIXAdC1BD7632AAAAOh7+f//SYmH4AAAAEiJx+m3/v//SIs6SIm64AAAAOuCSYnX64pJiz9Jib/gAAAA6Zf+//9mLg8fhAAAAAAAZpDzDx76QVdMjT0rJgAAQVZJidZBVUmJ9UFUQYn8VUiNLRwmAABTTCn9SIPsCOjf9///SMH9A3QfMdsPH4AAAAAATInyTInuRInnQf8U30iDwwFIOd116kiDxAhbXUFcQV1BXkFfw2ZmLg8fhAAAAAAA8w8e+sM=')
2222
assert len(code) <= CODE_SIZE
2323
ctypes.memmove(code_address, code, len(code))
2424

2525
string_match_type = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_void_p)
26-
string_match_c = ctypes.cast(code_address+0x12ac-0x1020, string_match_type)
26+
string_match_c = ctypes.cast(code_address+0x1340-0x1020, string_match_type)
2727

2828
data = ctypes.create_string_buffer(DATA_SIZE)
2929
data_address = ctypes.cast(data,ctypes.POINTER(ctypes.c_void_p))
30-
ctypes.memset(data_address, DATA_SIZE, 0)
3130

31+
NODE_SIZE=240
3232
def string_match(dna, segments):
3333
dna_buf = dna.encode('utf-8')
3434
seg_buf = ('\0'.join(segments)+'\0').encode('utf-8')
3535

36+
# We probably have a lot of node overlap
37+
ctypes.memset(data_address, 0, min(DATA_SIZE, len(seg_buf)*NODE_SIZE))
38+
3639
dna_len = len(dna_buf)
3740
seg_count = len(segments)
41+
3842
return string_match_c(dna_buf, dna_len, seg_buf, seg_count, data_address)
3943

4044
highscore = False

ahoCorasick/extract.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from subprocess import call
44
import base64
55

6-
call(["gcc", "ahoc.c", "-o", "ahoc.o"])
6+
call(["gcc", "-O2", "ahoc.c", "-o", "ahoc.o"])
77
call(["objcopy", "--only-section=.text", "-O", "binary", "ahoc.o", "text.o"])
88
with open("text.o", "rb") as fil:
99
with open("text.py", "w") as ut:

ahoCorasick/text.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
base64.standard_b64decode(b'8w8e+jHtSYnRXkiJ4kiD5PBQVEyNBZYEAABIjQ0fBAAASI09DQQAAP8Vki8AAPSQSI090S8AAEiNBcovAABIOfh0FUiLBW4vAABIhcB0Cf/gDx+AAAAAAMMPH4AAAAAASI09oS8AAEiNNZovAABIKf5IifBIwe4/SMH4A0gBxkjR/nQUSIsFRS8AAEiFwHQI/+BmDx9EAADDDx+AAAAAAPMPHvqAPV0vAAAAdTNVSIM9Ii8AAABIieV0DUiLPT4vAAD/FRAvAADoY////8YFNC8AAAFdw2YuDx+EAAAAAADDZmYuDx+EAAAAAAAPH0AA8w8e+uln////VUiJ5UiD7CBIiX3oSItF6EiLgOAAAABIhcB1YUiLRehIiwBIhcB0VUiLRehIiwBIicfoyf///0iJRfhIg334AHQpSItF6A+2gNgAAAAPvtBIi0X4idZIicfoLAAAAEiLVehIiYLgAAAA6xJIi0XoSIsQSItF6EiJkOAAAABIi0XoSIuA4AAAAMnDVUiJ5VNIg+woSIl92InwiEXUD75V1EiLRdhIY9JIi0TQCEiFwHVUSItF2EiLAEiFwHQ0SItF2EiJx+g3////SIlF6A++VdQPvl3USItF6InWSInH6Kf///9Ii1XYSGPLSIlEygjrFA++VdRIi0XYSGPSSItN2EiJTNAID75V1EiLRdhIY9JIi0TQCEiLXfjJw1VIieVIg+wQSIl9+EiLRfiLgOgAAACFwHlSSItF+IuA6AAAAI2QoIYBAEiLRfiJkOgAAABIi0X4SIsASIXAdCxIi0X4SInH6Jn+//9IicforP///4nCSItF+IuA6AAAAAHCSItF+ImQ6AAAAEiLRfiLgOgAAADJw1VIieVIg+xgSIl9uIl1tEiJVaiJTbBMiUWgx0XIAAAAAEiLRaBIiUXoi0XIjVABiVXISGPQSInQSMHgBEgp0EjB4ARIicJIi0XoSAHQSIlF8MdFzAAAAADp3AAAAEiLRfBIiUXY6ZsAAAAPtkXHg+hBiEXHD75Vx0iLRdhIY9JIi0TQCEiFwHVji0XIjVABiVXISGPQSInQSMHgBEgp0EjB4ARIicJIi0XoSAHQSIlF+EiLRfhIi1XYSIkQSItF+A+2VceIkNgAAABIi0X4x4DoAAAAYHn+/w++VcdIi0XYSGPSSItN+EiJTNAID75Vx0iLRdhIY9JIi0TQCEiJRdhIg0WoAUiLRagPtgCIRceAfccAD4VR////SItF2IuA6AAAAI1QAUiLRdiJkOgAAABIg0WoAYNFzAGLRcw7RbAPjBj////HRdAAAAAASItF8EiJReDHRdQAAAAA6z+LRdRIY9BIi0W4SAHQD7YAg+hBiEXGD75VxkiLReCJ1kiJx+hy/f//SIlF4EiLReBIicfo8/3//wFF0INF1AGLRdQ7RbR8uYtF0MnDVUiJ5bgAAAAAXcPzDx76QVdMjT27KQAAQVZJidZBVUmJ9UFUQYn8VUiNLawpAABTTCn9SIPsCOhv+///SMH9A3QfMdsPH4AAAAAATInyTInuRInnQf8U30iDwwFIOd116kiDxAhbXUFcQV1BXkFfw2ZmLg8fhAAAAAAA8w8e+sM=')
1+
base64.standard_b64decode(b'McDDZi4PH4QAAAAAAA8fAPMPHvox7UmJ0V5IieJIg+TwUFRMjQUWCAAASI0NnwcAAEiNPcj/////FYIvAAD0kEiNPcEvAABIjQW6LwAASDn4dBVIiwVeLwAASIXAdAn/4A8fgAAAAADDDx+AAAAAAEiNPZEvAABIjTWKLwAASCn+SInwSMHuP0jB+ANIAcZI0f50FEiLBTUvAABIhcB0CP/gZg8fRAAAww8fgAAAAADzDx76gD1NLwAAAHUzVUiDPRIvAAAASInldA1Iiz0uLwAA/xUALwAA6GP////GBSQvAAABXcNmLg8fhAAAAAAAw2ZmLg8fhAAAAAAADx9AAPMPHvrpZ////w8fgAAAAABBVFVTSA++3kiLRN8ISIXAdA5bXUFcw2YPH4QAAAAAAEiJ/UiLP0iF/3QlTIuF4AAAAEQPvuZNhcB0JUSJ5kyJx+i6////SIlE3QhbXUFcw0iJbN0ISInoW11BXMMPHwDoOwAAAEiJx0iFwHQbD7612AAAAOiH////SImF4AAAAEmJwOu2Dx8ATItFAEyJheAAAADrpmZmLg8fhAAAAAAAQVRTSIPsCEyLp+AAAABNheR0DUiDxAhMieBbQVzDZpBMiydIiftNheR06EmLvCTgAAAASIX/dBgPvrPYAAAA6Bz///9IiYPgAAAASYnE68NJizwkSIX/dC7oov///0iJx0iFwHQqQQ++tCTYAAAA6Oz+//9JiYQk4AAAAEiJx0iF/3W0TIsjTImj4AAAAOuDSYs8JEmJvCTgAAAA6+FmkIuH6AAAAIXAeAbDDx9EAABVBaCGAQBTSIn7SIPsCEiLL4mH6AAAAEiF7XQdSIu/4AAAAEiF/3QY6MP///8Dg+gAAACJg+gAAABIg8QIW13DSIu94AAAAEiF/3QcD76z2AAAAOhY/v//SImD4AAAAEiJx+vEDx9AAEiLfQBIhf90NOja/v//SInHSIXAdCwPvrXYAAAA6Cb+//9IiYXgAAAASInHSIX/dbNIiztIibvgAAAA64NIie/r8kiLfQBIib3gAAAA691mDx+EAAAAAABBV0GJ8UFWQVVBVFVTTInDSIPsGIXJD46aAAAASInWQYnKRTHbQbgBAAAADx9EAAAPtgZIidmEwHUV62IPH0AAD7ZGAUiDxgFIidGEwHRPg+hBSA++6EiLVOkISIXSdeBNY+BIg8YBQYPAAUyJ4kjB4gRMKeJIweIESAHaSIkKiILYAAAAx4LoAAAAYHn+/0iJVOkID7YGSInRhMB1sUGDwwGDgegAAAABSIPGAUU52g+Fev///0WFyQ+OlAIAAEGNQf9JifxFMe1MjXQHAesMSYPEAUEBxU055nRsQQ+2BCSD6EFID77oRA+++EiLROsISIXAdGpIicOLg+gAAACFwHnNTIs7BaCGAQCJg+gAAABNhf90ukiLq+AAAABIhe0PhPAAAACLhegAAACFwHhqA4PoAAAASYPEAYmD6AAAAEEBxU055nWUSIPEGESJ6FtdQVxBXUFeQV/DZg8fRAAASIsLSIXJdChIi5PgAAAASIXSD4TFAAAASItE6ghIhcB0XEiJROsISInD6Wr///+QSIlc6wjpX////0iLVQAFoIYBAImF6AAAAEiF0nSCTIu94AAAAE2F/w+EnQEAAEGLh+gAAACFwA+IRgEAAAOF6AAAAImF6AAAAOlS////TIsCTYXAD4QcAQAASIu64AAAAEiF/w+EiAEAAESJ/kiJVCQI6On7//9Ii1QkCEiJROoI6Wz///9Ji7/gAAAASIX/dEIPvrPYAAAA6ML7//9IiYPgAAAASInF6en+//9Ii7ngAAAASIX/dGMPvrPYAAAA6Jv7//9IiYPgAAAASInC6RT///9Jiz9Ihf8PhFABAADoG/z//0iJx0iFwA+EqgEAAEEPvrfYAAAA6GL7//9JiYfgAAAASInHSIX/dYVIiytIiavgAAAA
6Xr+//9IizlIhf8PhBABAABIiUwkCOjO+///SItMJAhIhcBIiccPhHYBAAAPvrHYAAAA6BH7//9Ii0wkCEiJx0iJgeAAAABIhf8PhVL///9IixNIiZPgAAAA6XL+//9IiVTqCEiJ0Olv/v//TYsHBaCGAQBBiYfoAAAATYXAD4Si/v//SYu/4AAAAEiF/w+EHQEAAOjw+///QQOH6AAAAEGJh+gAAADpev7//0Ux7ent/f//SIu64AAAAEiF/3RpD7612AAAAOh9+v//SImF4AAAAEmJx+k8/v//TInHSIlUJAjoAfv//0iLVCQISIXASInHD4SaAAAAD76y2AAAAOhE+v//SItUJAhIicdIiYLgAAAA6Tr+//9Mif3p2P7//0iJyukq////SIs6SIX/D4S0AAAASIlUJAjoqvr//0iLVCQISIXASInHD4SNAAAAD76y2AAAAOjt+f//SItUJAhIicdIiYLgAAAASIX/D4VM////TIt9AEyJveAAAADpk/3//0mLP0mJv+AAAADpXv7//0iLOkiJuuAAAADpsf3//0iLOUiJueAAAADplv7//0yJx+gw+v//SInHSIXAdC1BD7632AAAAOh7+f//SYmH4AAAAEiJx+m3/v//SIs6SIm64AAAAOuCSYnX64pJiz9Jib/gAAAA6Zf+//9mLg8fhAAAAAAAZpDzDx76QVdMjT0rJgAAQVZJidZBVUmJ9UFUQYn8VUiNLRwmAABTTCn9SIPsCOjf9///SMH9A3QfMdsPH4AAAAAATInyTInuRInnQf8U30iDwwFIOd116kiDxAhbXUFcQV1BXkFfw2ZmLg8fhAAAAAAA8w8e+sM=')

0 commit comments

Comments
 (0)