-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathStringDictionaryHTFC.h
291 lines (252 loc) · 11.2 KB
/
StringDictionaryHTFC.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
/* StringDictionaryHTFC.h
* Copyright (C) 2014, Francisco Claude & Rodrigo Canovas & Miguel A. Martinez-Prieto
* all rights reserved.
*
* This class implements a Compressed String Dictionary which differentially
* encodes the strings using (Plain) Front-Coding and the resulting
* representation is finally compressed with Hu-Tucker:
*
* ==========================================================================
* "The Art of Computer Programming, volume 3: Sorting and Searching"
* Donald E. Knuth.
* Addison Wesley, 1973.
* ==========================================================================
*
* This technique is inspired in the Hu-Tucker Front-Coding approach from:
*
* ==========================================================================
* "Compressed String Dictionaries"
* Nieves R. Brisaboa, Rodrigo Canovas, Francisco Claude,
* Miguel A. Martinez-Prieto and Gonzalo Navarro.
* 10th Symposium on Experimental Algorithms (SEA'2011), p.136-147, 2011.
* ==========================================================================
*
* The current approach enhances the original one with efficient locate and
* extract operations ** by prefix **.
*
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*
* Contacting the authors:
* Francisco Claude: [email protected]
* Rodrigo Canovas: [email protected]
* Miguel A. Martinez-Prieto: [email protected]
*/
#ifndef _STRINGDICTIONARY_HTFC_H
#define _STRINGDICTIONARY_HTFC_H
#include <iostream>
#include <vector>
using namespace std;
#include <libcdsBasics.h>
using namespace cds_utils;
#include "StringDictionary.h"
#include "utils/LogSequence.h"
#include "HuTucker/HuTucker.h"
#include "utils/Coder/StatCoder.h"
#include "utils/Coder/DecodingTable.h"
#include "utils/Coder/DecodingTableBuilder.h"
#define MEMALLOC 32768
class StringDictionaryHTFC : public StringDictionary
{
public:
/** Generic Constructor. */
StringDictionaryHTFC();
/** Class Constructor.
@param it: iterator scanning the original set of strings.
@param bucketsize: number of strings represented per bucket.
*/
StringDictionaryHTFC(IteratorDictString *it, uint bucketsize);
/** Retrieves the ID corresponding to the given string.
@param str: the string to be searched.
@param strLen: the string length.
@returns the ID (or NORESULT if it is not in the bucket).
*/
uint locate(uchar *str, uint str_length);
/** Obtains the string associated with the given ID.
@param id: the ID to be extracted.
@param strLen: pointer to the extracted string length.
@returns the requested string (or NULL if it is not in the
dictionary).
*/
uchar* extract(size_t id, uint *strLen);
/** Locates all IDs of those elements prefixed by the given
string.
@param str: the prefix to be searched.
@param strLen: the prefix length.
@returns an iterator for direct scanning of all the IDs.
*/
IteratorDictID* locatePrefix(uchar *str, uint strLen);
/** Locates all IDs of those elements containing the given
substring.
@param str: the substring to be searched.
@param strLen: the substring length.
@returns an iterator for direct scanning of all the IDs.
*/
IteratorDictID* locateSubstr(uchar *str, uint strLen);
/** Retrieves the ID with rank k according to its alphabetical order.
@param rank: the alphabetical ranking.
@returns the ID.
*/
uint locateRank(uint rank);
/** Extracts all elements prefixed by the given string.
@param str: the prefix to be searched.
@param strLen: the prefix length.
@returns an iterator for direct scanning of all the strings.
*/
IteratorDictString* extractPrefix(uchar *str, uint strLen);
/** Extracts all elements containing by the given substring.
@param str: the substring to be searched.
@param strLen: the substring length.
@returns an iterator for direct scanning of all the strings.
*/
IteratorDictString* extractSubstr(uchar *str, uint strLen);
/** Obtains the string with rank k according to its
alphabetical order.
@param id: the ID to be extracted.
@param strLen: pointer to the extracted string length.
@returns the requested string (or NULL if it is not in the
dictionary).
*/
uchar* extractRank(uint rank, uint *strLen);
/** Extracts all strings in the dictionary sorted in
alphabetical order.
@returns an iterator for direct scanning of all the strings.
*/
IteratorDictString* extractTable();
/** Computes the size of the structure in bytes.
@returns the dictionary size in bytes.
*/
size_t getSize();
/** Stores the dictionary into an ofstream.
@param out: the oftstream.
*/
void save(ofstream &out);
/** Loads a dictionary from an ifstream.
@param in: the ifstream.
@returns the loaded dictionary.
*/
static StringDictionary *load(ifstream &in);
/** Generic destructor. */
~StringDictionaryHTFC();
protected:
uint32_t maxcomplength; //! The largest compressed header
uint32_t buckets; //! Number of total buckets in the dictionary
uint32_t bucketsize; //! Number of strings per bucket
uint64_t bytesStrings; //! Length of the strings representation
uchar *textStrings; //! Hu-Tucker over the Front-Coding representation for all remaining strings
LogSequence *blStrings; //! Positional index to the strings representation
StatCoder *coder; //! Coder for Hu-Tucker operations
Codeword *codewords; //! Hu-Tucker codeword assignment
DecodingTable *table; //! Decoding table
/** Obtaining the (encoded) header string for the given bucket.
@param idbucket: the bucket.
@returns the encoded header.
*/
inline uchar *getHeader(size_t idbucket);
/** Performs the Hu-Tucker encoding of the string.
@param str: the string to be encoded.
@param strLen: the string length.
@param encLen: pointer to the encoded string length.
@param offset: pointer to number of bits used in the last
byte.
@returns the encoded string.
*/
inline uchar* encodeString(uchar *str, uint strLen, uint *encLen, uint *offset);
/** Performs the Hu-Tucker encoding of the symbol in the text
jumping, in the current byte, the given number of bits.
@param symbol: the symbol to be encoded.
@param text: the text in which the symbol is encoded.
@param offset: the number of bits to be jumped.
@returns the number of advanced bytes in this operation.
*/
inline uint encodeSymbol(uchar symbol, uchar *text, uint *offset);
/** Decodes the first string in the bucket.
@idbucket: the bucket storing the required header.
@returns the scanning data
*/
inline ChunkScan decodeHeader(size_t idbucket);
/** Decodes the next internal string according to the
scanning data
@c: pointer to the scanning data
returns the number of chars shared with the previous string
*/
inline uint decodeNextString(ChunkScan* c);
/** Decodes the next internal string according to the
scanning data
@c: pointer to the scanning data
returns the number of chars shared with the previous string
*/
inline uint decodeNextString2(ChunkScan* c);
/** Locates the candidate bucket in which the given string can
be represented.
@param str: the encoded string to be located.
@param strLen: the encoded string length.
@param idbucket: pointer to the candidate bucket.
@returns a boolean value telling if the string is the
header of the bucket.
*/
inline bool locateBucket(uchar *str, uint strLen, size_t *idbucket);
/** Locates the buckets which delimits the representation of
all possible elments prefixed by the given string.
@param str: the prefix to be searched.
@param strLen: the prefix length.
@param offset: number of bits used in the last byte.
@param left: pointer to the left boundary bucket.
@param right: pointer to the right boundary bucket.
*/
inline void locateBoundaryBuckets(uchar *str, uint strLen, uint offset, size_t *left, size_t *right);
/** Searches the first string prefixed for the given one.
@param c: pointer to the scanning data
@param scanneable: the number of string to be scanned
@param str: the prefix to be searched.
@param strLen: the prefix length.
@returns the internal ID (of 0 if it is not in the bucket).
*/
inline uint searchPrefix(ChunkScan* c, uint scanneable, uchar *str, uint strLen);
/** Searches the first string non-prefixed for the given one.
@param c: pointer to the scanning data
@param scanneable: the number of string to be scanned
@param str: the prefix to be searched.
@param strLen: the prefix length.
@returns the internal ID (of 0 if it is not in the bucket).
*/
inline uint searchDistinctPrefix(ChunkScan* c, uint scanneable, uchar *str, uint strLen);
/** Resets the scanning value for the given bucket.
@c: pointer to the scanning data
@param idbucket: pointer to the current bucket.
*/
inline void resetScan(ChunkScan *c, size_t idbucket);
/** Processes the next encoded symbol and checks if the current
sequence encodes a decodeable substring. In this case,
information about this substring is stored for building the
decodeable table.
@param symbol: the symbol to be encoded.
@param seq: the current encoded sequence.
@param ptr: number of un-processed bits in the sequence.
@param substr: queue containing the symbols previosuly
processed.
@param lenSubstr: queue containing the lengths of the
symbol previosuly processed.
@param tableSubstr: table containing all substrings
decodeable substrings.
*/
inline void insertDecodeableSubstr(uchar symbol, uint *seq, ushort *ptr, vector<uchar> *substr, vector<ushort> *lenSubstr, DecodeableSubstr *tableSubstr);
/** Similar than insertDecodeableSubstr, but it fills the
sequence with 0s and inserts it in the table using as
decoding information that currently stored in the queues */
inline void insertEndingSubstr(uint *seq, ushort *ptr, vector<uchar> *substr, vector<ushort> *lenSubstr, DecodeableSubstr *tableSubstr);
};
#endif /* _STRINGDICTIONARY_HTFC_H */