forked from nmeisburger/LSH-Tables
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.cpp
298 lines (268 loc) · 10.4 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
#include "LSH.h"
#include <iostream>
#include <fstream>
#include <zlib.h>
#include <stdio.h>
#include "kseq.h"
#include <string>
#include <map>
#include <algorithm>
#include "MurMurHash3.h"
#include "config.h"
#include <chrono>
using namespace std;
KSEQ_INIT(int, read)
int len = 32;
struct comparison_info
{
/* data */
int dist_from_query;
string dna_sequence;
};
static int min3(int x, int y, int z);
static int leven_distance(std::string str1, std::string str2);
//uhh before I do this, I need to figure out how to store a sequence of strings
static vector<string> brute_topk(int k, std::string query, vector<string> dna_strings);
static uint32_t getSequenceMinHash(string sequence, uint32_t seed, uint32_t subseq_len);
static unsigned int *getDNAMinhashes(string dna_sequence, int numHashes, unsigned int * dHashes);
static void insert_into_lsh(string dna_sequence, uint32_t string_idx, int numHashes, LSH *lsh_table);
static vector<uint32_t> getVectorMinhashes(string dna_sequence, uint32_t seed, int numHashes);
int main() {
int test_edit;
FILE* fp;
kseq_t *seq;
int n = 0, slen = 0, qlen = 0;
fp = fopen("SRR000001.fastq", "r");
seq = kseq_init(fileno(fp));
int count = 0;
std::string seq0;
uint32_t idx = 0;
map<uint32_t, string> idx_to_string_map;
//Intializing LSH table
LSH *lsh2 = new LSH();
cout << "Done allocating memory for LSH table" << "\n";
//vector to store all the dna sequences to be processed
vector<std::string> dna_arr;
unsigned int seq_hashes[4];
unsigned int seq_top10[10];
// Reads FASTQ file sequence by sequence
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
while (kseq_read(seq) >= 0) {
string str_seq = string(seq->seq.s);
//mapping an index to the read string and updating the index -- needed for Leven distance
idx_to_string_map.insert({idx, str_seq});
idx += 1;
//adding the string to a stored vector of dna sequences
dna_arr.push_back(str_seq);
// heartbeat for insertion
if (count % 10000 == 0){
uint32_t min = getSequenceMinHash(str_seq, 0, 10);
uint32_t full_hash;
int len = 32;
MurmurHash3_x86_32(&str_seq, len, 0, &full_hash);
cout << count << " " << full_hash << " " << min << "\n";
}
insert_into_lsh(str_seq, count, 4, lsh2);
count += 1;
}
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
// below are the test values that we used
unsigned int idxs[10] = {308597, 319039, 464183, 394249, 439143, 442117, 18615, 153262, 260764, 82983};
cout << "Experiment begins" << "\n";
// Retrieving top 10 for each test value
for(int i = 0; i < 10; i++) {
uint32_t idx = idxs[i];
cout << "test";
string seq = idx_to_string_map[idx];
seq_hashes[0] = getSequenceMinHash(seq, 0, 6);
seq_hashes[1] = getSequenceMinHash(seq, 1, 6);
seq_hashes[2] = getSequenceMinHash(seq, 2, 6);
seq_hashes[3] = getSequenceMinHash(seq, 3, 6);
lsh2->top_k(1, 10, seq_hashes, seq_top10);
// Uncomment below to print top k for each sequence in test values
//cout << "Top 10 for Index: " << idx << "\n";
for (int j = 0; j < 10; j++) {
//cout << seq_top10[j] << "\n";
}
}
std::cout << "Time to insert = " << std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count() << "[ms]" << std::endl;
//Brute Force Testing
int keys[10] = {308597, 319039, 464183, 394249, 439143, 442117, 18615, 153262, 260764, 82983};
for ( int i = 0; i < 10; i++) {
string seq_arb = idx_to_string_map.at(keys[i]);
cout << keys[i] << ": " << seq_arb << "\n";
vector<string> brute_10 = brute_topk(10, seq_arb, dna_arr);
for (int j = 0; j < 10; j++) {
cout << j << "th closest string: " << brute_10.at(j) << "\n";
}
}
kseq_destroy(seq);
fclose(fp);
return 0;
}
/*
* Requires:
* dna_sequence is a dna sequence in string form
* string_idx is the corresponding integer index of the string
* lsh_table is the lsh data structure where we'll be inserting the string
*
* Effect:
* Gets the hashes of the string and inserts the index of the string with the corresponding hashes into the lsh table
**/
static void insert_into_lsh(string dna_sequence, uint32_t string_idx, int numHashes, LSH *lsh_table) {
unsigned int sequence_hashes[numHashes];
//get the hashes of the string
getDNAMinhashes(dna_sequence, numHashes, sequence_hashes);
// insert the index with the hashes into the lsh table
lsh_table->insert((unsigned int)string_idx, sequence_hashes);
}
/*
* Requires:
* dna sequence to be hashed
* number of times to hash the string
* integer which is the length of ngram used for the MinHashes
*
* Effect:
* returns vector with num_hashes MinHashes for the sequence
* this vector will have length of len(num_hashes)
**/
static unsigned int *getDNAMinhashes(string dna_sequence, int numHashes, unsigned int * dHashes) {
for (int i = 0; i < numHashes; i++)
dHashes[i] = getSequenceMinHash(dna_sequence, i, NGRAM_LEN);
return dHashes;
}
/*
* Requires:
* vector with sequences in it
* integer which represents number of hashes per sequence ( the total amount needed to be hashed into the L tables)
* integer which is the length of ngram used for the MinHashes
*
* Effect:
* returns vector with num_hashes MinHashes for each sequence that was input in sequences
* this vector will have length of len(sequences)*num_hashes
**/
static vector<uint32_t> getMinHashes(vector<string> sequences, int num_hashes) {
vector<uint32_t> seqMinHashes;
for (string seq: sequences) {
for (int i = 0; i < num_hashes; i++)
seqMinHashes.push_back(getSequenceMinHash(seq, i, NGRAM_LEN));
}
return seqMinHashes;
}
/*
* Requires:
* sequence is string dna sequence to take MinHash of
* seed is seed to be used for hashing
* subseq_len is the length of the n-gram used to take MinHash
*
* Effect:
* Returns 32bit unsigned int MinHash of sequence
**/
static uint32_t getSequenceMinHash(string sequence, uint32_t seed, uint32_t subseq_len) {
uint32_t min = UINT_MAX;
uint32_t subseq_hash;
// Iterates through all ngrams in sequence and returns lowest hash value for given seed
for(int i = 0; i < sequence.length()-subseq_len; i++) {
string temp = sequence.substr(i, i+subseq_len);
uint32_t cur;
MurmurHash3_x86_32(&temp, len, seed, &cur);
if (cur < min)
min = cur;
}
return min % (1 << RANGE_POW);
}
// simple comparison operator that will put the smaller things first
bool compareByDist(const comparison_info &a, const comparison_info &b)
{
return a.dist_from_query < b.dist_from_query;
}
/*
* Requires:
* k is the number of strings to return
* query is the string in question in which we want to find the k strings that are closest to it.
* dna_strings is a vector of strings that we will compare to the query
*
* Effect:
* Finds the top k similar dna strings to the query.
* if multiple dna sequences have the same distance from the query, we only take one of them. Need to ammend this tomorrow.
* Returns of vector of those topk dna sequences
*/
static vector<string> brute_topk(int k, std::string query, vector<string> dna_strings)
{
vector<comparison_info> dna_info_vec;
int cnt = 0; // will be used to tell me if we've inserted the k things in the vector already
//iterating over the dna sequences
for (auto p_dna_seq = dna_strings.begin(); p_dna_seq != dna_strings.end(); ++p_dna_seq) {
int dist_val = leven_distance(query, *p_dna_seq);
if (query.compare(*p_dna_seq) != 0) { //if the strings aren't equal
//new struct
auto dna_struct = new comparison_info; //returns a pointer to a comparison_info struct
//distance of query and dna_seq
dna_struct->dist_from_query = dist_val;
dna_struct->dna_sequence = *p_dna_seq;
dna_info_vec.push_back(*dna_struct);
}
}
// now sorting the information in place
std::sort(dna_info_vec.begin(), dna_info_vec.end(), compareByDist);
vector<string>topk_vec;
int cnt2 = 0;
//putting only the top k dna sequences
for(auto it = dna_info_vec.begin(); cnt2 < k; ++it) {
topk_vec.push_back(it->dna_sequence);
cnt2 += 1;
}
return (topk_vec);
}
//Find the minimum value of 3 integers
int
min3(int x, int y, int z)
{
return std::min(std::min(x, y), z);
}
/*
* Requires:
* str1 and str2 are both genomic sequences represented as strings
* m is the index of the character we're looking at in str1
* n is the index of the character we're looking at in str2
*
* Effect:
* Computes the Levenshtein distance between the two strings(the minimum number of changes to turn str1 into str2)
* returns the levenshtein distance as an integer
*/
int leven_distance(std::string str1, std::string str2)
{
//dynamic programming with storing
int m;
int n;
m = (int)strlen(str1.c_str());
n = (int)strlen(str2.c_str());
//Initializing my dp matrix
int dp[m + 1][n + 1]; //an extra box for when looking at the empty string prefix of either str1 or str2 or both
//filling matrix
for (int i = 0; i <= m; i++) {
for (int j = 0; j <= n; j++){
//if i =0, first string is empty, we need to insert all the characters of string 2 to string 1
if (i == 0) {
// printf("i is 0 \n");
dp[i][j] = j;
continue;
}
//if j =0, first string is empty, we need to insert all the characters of string 2 to string 1
if (j == 0) {
dp[i][j] = i;
continue;
}
//if the previous character of str1 and str2 are equal,
if (str1[i-1] == str2[j-1]) {//remember that the dp[i][j] represents the distance between str1 up to index i-1 and str2 up to index j-1
dp[i][j] = dp[i-1][j-1];
continue;
}
// if the previous characters are different, consider all the possibilities
dp[i][j] = 1 + min3(dp[i-1][j-1], //replace
dp[i-1][j], //remove
dp[i][j-1]); //insert
}
}
return (dp[m][n]);
}