Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mqf integration2 #1873

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
3 changes: 3 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -68,9 +68,12 @@ matrix:
osx_image: xcode7.3
env:
- TESTATTR="'not linux and not known_failing and not huge'"
- CXXFLAGS="$CXXFLAGS -nostdinc+"
- CXX="clang++ -stdlib=libc++"
before_install:
- source ci_scripts/install.sh


# command to install common dependencies
install:
- python --version
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -131,7 +131,7 @@ clean: FORCE
cd src/oxli && $(MAKE) clean || true
cd tests && rm -rf khmertest_* || true
rm -f pytests.xml
cd third-party/cqf && make clean || true
cd third-party/MQF && make clean || true
rm -f $(EXTENSION_MODULE)
rm -f khmer/*.pyc scripts/*.pyc tests/*.pyc oxli/*.pyc \
sandbox/*.pyc khmer/__pycache__/* sandbox/__pycache__/* \
4 changes: 2 additions & 2 deletions examples/c++-api/Makefile
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
CXXFLAGS=--std=c++11 \
-I ../../include/ \
-I ../../third-party/smhasher \
-I ../../third-party/cqf \
-I ../../third-party/mqf \
-I ../../third-party/seqan/core/include/ \
-I ../../third-party/rollinghash

TESTS=exact-counting bloom consume

%: %.cc ../../src/oxli/liboxli.a
$(CXX) $(CXXFLAGS) $< ../../src/oxli/liboxli.a -o $@
$(CXX) $(CXXFLAGS) $< ../../src/oxli/liboxli.a -o $@ -lstdc++

../../src/oxli/liboxli.a:
cd ../../src/oxli && make
12 changes: 10 additions & 2 deletions include/oxli/hashtable.hh
Original file line number Diff line number Diff line change
@@ -614,10 +614,18 @@ public:
class QFCounttable : public oxli::MurmurHashtable
{
public:
explicit QFCounttable(WordLength ksize, int size)
: MurmurHashtable(ksize, new QFStorage(size)) { } ;
explicit QFCounttable(WordLength ksize, int size,int slotsize)
: MurmurHashtable(ksize, new QFStorage(size,slotsize)) { } ;
};

class BufferedQFCounttable : public oxli::MurmurHashtable
{
public:
explicit BufferedQFCounttable(WordLength ksize, int size,int slotsize)
: MurmurHashtable(ksize, new BufferedMQFStorage(size,slotsize)) { } ;
};


// Hashtable-derived class with BitStorage.
class Nodetable : public oxli::MurmurHashtable
{
188 changes: 146 additions & 42 deletions include/oxli/storage.hh
Original file line number Diff line number Diff line change
@@ -45,6 +45,7 @@ Contact: [email protected]
using MuxGuard = std::lock_guard<std::mutex>;

#include "gqf.h"
#include "bufferedMQF.h"

namespace oxli {
typedef std::unordered_map<HashIntoType, BoundedCounterType> KmerCountMap;
@@ -410,53 +411,156 @@ public:
*
* \brief A Quotient Filter storage
*/
class QFStorage : public Storage {
class QFStorage : public Storage
{
protected:
QF cf;
QF mf;

public:
QFStorage(int size) {
// size is the power of two to specify the number of slots in
// the filter (2**size). Third argument sets the number of bits used
// in the key (current value of size+8 is copied from the CQF example)
// Final argument is the number of bits allocated for the value, which
// we do not use.
qf_init(&cf, (1ULL << size), size+8, 0);
}

~QFStorage() { qf_destroy(&cf); }

BoundedCounterType test_and_set_bits(HashIntoType khash) {
BoundedCounterType x = get_count(khash);
add(khash);
return !x;
}

//
bool add(HashIntoType khash) {
bool is_new = get_count(khash) == 0;
qf_insert(&cf, khash % cf.range, 0, 1);
return is_new;
}

// get the count for the given k-mer hash.
const BoundedCounterType get_count(HashIntoType khash) const {
return qf_count_key_value(&cf, khash % cf.range, 0);
}

// Accessors for protected/private table info members
// xnslots is larger than nslots. It includes some extra slots to deal
// with some details of how the counting is implemented
std::vector<uint64_t> get_tablesizes() const { return {cf.xnslots}; }
const size_t n_tables() const { return 1; }
const uint64_t n_unique_kmers() const { return cf.ndistinct_elts; }
const uint64_t n_occupied() const { return cf.noccupied_slots; }
void save(std::string outfilename, WordLength ksize);
void load(std::string infilename, WordLength &ksize);

Byte **get_raw_tables() { return nullptr; }
QFStorage(int size,int slotSize)
{
// size is the power of two to specify the number of slots in
// the filter (2**size). Third argument sets the number of bits used
// in the key (current value of size+8 is copied from the CQF example)
// Final argument is the number of bits allocated for the value, which
// we do not use.
_supports_bigcount = true;
qf_init(&mf, (1ULL << size), size+slotSize, 0,2,0,true,"",2038074761);



}

~QFStorage()
{
qf_destroy(&mf);
}

BoundedCounterType test_and_set_bits(HashIntoType khash)
{
BoundedCounterType x = get_count(khash);
add(khash);
return !x;
}

//
bool add(HashIntoType khash)
{
bool is_new = get_count(khash) == 0;
qf_insert(&mf, khash % mf.metadata->range, 1,false,false);
return is_new;
}

// get the count for the given k-mer hash.
const BoundedCounterType get_count(HashIntoType khash) const
{
return qf_count_key(&mf, khash % mf.metadata->range);
}

// Accessors for protected/private table info members
// xnslots is larger than nslots. It includes some extra slots to deal
// with some details of how the counting is implemented
std::vector<uint64_t> get_tablesizes() const
{
return {mf.metadata->xnslots};
}
const size_t n_tables() const
{
return 1;
}
const uint64_t n_unique_kmers() const
{
return mf.metadata->ndistinct_elts;
}
const uint64_t n_occupied() const
{
return mf.metadata->noccupied_slots;
}
void save(std::string outfilename, WordLength ksize);
void load(std::string infilename, WordLength &ksize);

Byte **get_raw_tables()
{
return nullptr;
}
};

class BufferedMQFStorage : public Storage
{
protected:
QF buffer;
bufferedMQF main;

public:
BufferedMQFStorage(int size,int slotSize)
{
// size is the power of two to specify the number of slots in
// the filter (2**size). Third argument sets the number of bits used
// in the key (current value of size+8 is copied from the CQF example)
// Final argument is the number of bits allocated for the value, which
// we do not use.
_supports_bigcount = true;
qf_init(&buffer, (1ULL << 15), 15+slotSize, 0,2,0,true,"",2038074761);
bufferedMQF_init(&main, (1ULL<< (size-2)) ,(1ULL << size), size+slotSize
,0,2,"");


}

~BufferedMQFStorage()
{
qf_destroy(&buffer);
bufferedMQF_destroy(&main);
}

BoundedCounterType test_and_set_bits(HashIntoType khash)
{
BoundedCounterType x = get_count(khash);
add(khash);
return !x;
}

//
bool add(HashIntoType khash)
{
bool is_new = get_count(khash) == 0;
bufferedMQF_insert(&main, khash % main.disk->metadata->range, 1,false,false);
return is_new;
}

// get the count for the given k-mer hash.
const BoundedCounterType get_count(HashIntoType khash) const
{
return bufferedMQF_count_key(&main, khash % main.disk->metadata->range);
}

// Accessors for protected/private table info members
// xnslots is larger than nslots. It includes some extra slots to deal
// with some details of how the counting is implemented
std::vector<uint64_t> get_tablesizes() const
{
return {main.disk->metadata->xnslots};
}
const size_t n_tables() const
{
return 1;
}
const uint64_t n_unique_kmers() const
{
return main.disk->metadata->ndistinct_elts;
}
const uint64_t n_occupied() const
{
return main.disk->metadata->noccupied_slots;
}
void save(std::string outfilename, WordLength ksize);
void load(std::string infilename, WordLength &ksize);

Byte **get_raw_tables()
{
return nullptr;
}
};

/*
* \class ByteStorage
2 changes: 1 addition & 1 deletion khmer/__init__.py
Original file line number Diff line number Diff line change
@@ -64,7 +64,7 @@

from khmer._khmer import FILETYPES

from khmer._oxli.graphs import (Counttable, QFCounttable, Nodetable,
from khmer._oxli.graphs import (Counttable, QFCounttable,BufferedQFCounttable, Nodetable,
CyclicCounttable,
SmallCounttable, Countgraph, SmallCountgraph,
Nodegraph)
10 changes: 8 additions & 2 deletions khmer/_oxli/graphs.pxd
Original file line number Diff line number Diff line change
@@ -122,7 +122,11 @@ cdef extern from "oxli/hashtable.hh" namespace "oxli" nogil:
CpNodetable(WordLength, vector[uint64_t])

cdef cppclass CpQFCounttable "oxli::QFCounttable" (CpHashtable):
CpQFCounttable(WordLength, uint64_t) except +oxli_raise_py_error
CpQFCounttable(WordLength, uint64_t,uint64_t) except +oxli_raise_py_error


cdef cppclass CpBufferedQFCounttable "oxli::BufferedQFCounttable" (CpHashtable):
CpBufferedQFCounttable(WordLength, uint64_t,uint64_t) except +oxli_raise_py_error


cdef extern from "oxli/hashgraph.hh" namespace "oxli" nogil:
@@ -215,7 +219,7 @@ cdef extern from "oxli/labelhash.hh" namespace "oxli":
uint64_t &,
CallbackFn,
void *)
void consume_seqfile_and_tag_with_labels[SeqIO](const string &,
void _seqfile_and_tag_with_labels[SeqIO](const string &,
uint32_t &,
uint64_t &)
void consume_seqfile_and_tag_with_labels[SeqIO](
@@ -259,6 +263,8 @@ cdef class Hashtable:
cdef class QFCounttable(Hashtable):
cdef shared_ptr[CpQFCounttable] _qf_this

cdef class BufferedQFCounttable(Hashtable):
cdef shared_ptr[CpBufferedQFCounttable] _qf_this

cdef class SmallCounttable(Hashtable):
cdef shared_ptr[CpSmallCounttable] _st_this
54 changes: 50 additions & 4 deletions khmer/_oxli/graphs.pyx
Original file line number Diff line number Diff line change
@@ -25,7 +25,7 @@ from khmer._khmer import ReadParser

CYTHON_TABLES = (Hashtable, Nodetable, Counttable, CyclicCounttable,
SmallCounttable,
QFCounttable, Nodegraph, Countgraph, SmallCountgraph)
QFCounttable,BufferedQFCounttable, Nodegraph, Countgraph, SmallCountgraph)


cdef class Hashtable:
@@ -368,27 +368,73 @@ cdef class QFCounttable(Hashtable):
Set the number of slots used by the counting quotient filter. This
determines the amount of memory used and how many k-mers can be entered
into the datastructure. Each slot uses roughly 1.3 bytes.
slot size: integer
"""

def __cinit__(self, int k, uint64_t size):
def __cinit__(self, int k, uint64_t size,uint64_t slotsize):
# size has to be a power of two
power_of_two = ((size & (size - 1) == 0) and
(size != 0))
if not power_of_two:
raise ValueError("size has to be a power of two, not"
" {}.".format(size))
if type(self) is QFCounttable:
self._qf_this = make_shared[CpQFCounttable](k, <uint64_t>log(size, 2))
self._qf_this = make_shared[CpQFCounttable](k, <uint64_t>log(size, 2),slotsize)
self._ht_this = <shared_ptr[CpHashtable]>self._qf_this


@classmethod
def load(cls, file_name):
"""Load the graph from the specified file."""
cdef QFCounttable table = cls(1, 1)
cdef QFCounttable table = cls(1, 1,1)
deref(table._qf_this).load(_bstring(file_name))
return table


cdef class BufferedQFCounttable(Hashtable):
"""Count kmers using a counting quotient filter.

The counting quotient filter (CQF) is an extension of the quotient filter
that supports counting in addition to simple membership testing. A CQF has
better cache locality compared to (Small)Counttable which increases
performance.

Each new k-mer uses one slot, and the number of slots used per k-mer
increases the more often the same k-mer is entered into the CQF. As a result
the CQF can be "full" and will stop accepting calls to `add` and `count`.

Parameters
----------
k : integer
k-mer size

size : integer
Set the number of slots used by the counting quotient filter. This
determines the amount of memory used and how many k-mers can be entered
into the datastructure. Each slot uses roughly 1.3 bytes.
slot size: integer
"""

def __cinit__(self, int k, uint64_t size,uint64_t slotsize):
# size has to be a power of two
power_of_two = ((size & (size - 1) == 0) and
(size != 0))
if not power_of_two:
raise ValueError("size has to be a power of two, not"
" {}.".format(size))
if type(self) is BufferedQFCounttable:
self._qf_this = make_shared[CpBufferedQFCounttable](k, <uint64_t>log(size, 2),slotsize)
self._ht_this = <shared_ptr[CpHashtable]>self._qf_this


@classmethod
def load(cls, file_name):
"""Load the graph from the specified file."""
cdef BufferedQFCounttable table = cls(1, 1,1)
deref(table._qf_this).load(_bstring(file_name))
return table


cdef class Counttable(Hashtable):

def __cinit__(self, int k, uint64_t starting_size, int n_tables,
41 changes: 41 additions & 0 deletions khmer/khmer_args.py
Original file line number Diff line number Diff line change
@@ -538,6 +538,47 @@ def create_nodegraph(args, ksize=None, multiplier=1.0, fp_rate=0.01):
return khmer.Nodegraph(ksize, tablesize, args.n_tables)


def create_MQFGraph(args, ksize=None, multiplier=1.0, fp_rate=0.1):
if ksize is None:
ksize = args.ksize
if ksize > 31:
print_error("\n** ERROR: khmer only supports k-mer sizes <= 32.\n")
sys.exit(1)
if not args.unique_kmers:
print_error("\n** ERROR: please supply unique number of kmers.\n")
sys.exit(1)


if args.unique_kmers:
size=int(math.ceil(math.log2(1.3*args.unique_kmers)))
# if args.max_tablesize:
# size=args.max_tablesize

if args.fp_rate:
log_info("*** INFO: Overriding default fp {def_fp} with new fp:"
" {new_fp}", def_fp=fp_rate, new_fp=args.fp_rate)
fp_rate = args.fp_rate

p=int(math.ceil(math.log2(float(args.unique_kmers)/float(fp_rate))))
slotSize=p-size

if slotSize<2 :
print_error("\n** ERROR: too small slot size.\n")
sys.exit(1)

nslots=2**size
nslots+=10*math.sqrt(nslots)
nblocks=int(((nslots+63)/64))
blockSize=17
bitsPerSlot=slotSize+2
totalSize=nblocks*(blockSize+bitsPerSlot*8)
totalSize/=(1000.0 ** 3);
log_info("*** INFO: creating MQF of size {size} and slot {slotsize}. Total Size ={totalSize}G"
, size=size, slotsize=slotSize,totalSize=totalSize)

return khmer.QFCounttable(ksize,2**size,slotSize)


def create_countgraph(args, ksize=None, multiplier=1.0, fp_rate=0.1):
"""Create and return a countgraph."""
args = _check_fp_rate(args, fp_rate)
8 changes: 7 additions & 1 deletion scripts/normalize-by-median.py
Original file line number Diff line number Diff line change
@@ -51,7 +51,7 @@
import os
import khmer
import textwrap
from khmer import khmer_args, Countgraph
from khmer import khmer_args, Countgraph, QFCounttable
from contextlib import contextmanager
from khmer.khmer_args import (build_counting_args, add_loadgraph_args,
report_on_config, calculate_graphsize,
@@ -295,6 +295,8 @@ def get_parser():
help='Input FAST[AQ] sequence filename.', nargs='+')
add_loadgraph_args(parser)
add_output_compression_type(parser)
parser.add_argument('--mqf', dest='mqf', default=False,
action='store_true')
return parser


@@ -338,10 +340,14 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
log_info('loading k-mer countgraph from {graph}',
graph=args.loadgraph)
countgraph = Countgraph.load(args.loadgraph)
elif args.mqf:
countgraph = khmer_args.create_MQFGraph(args)
else:
log_info('making countgraph')
countgraph = khmer_args.create_countgraph(args)



# create an object to handle diginorm of all files
norm = Normalizer(args.cutoff, countgraph)
with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -6,7 +6,7 @@ undef = NO_UNIQUE_RC
# docker/Dockerfile
# libraries = z,bz2
## if using system libraries
include-dirs = include:third-party/zlib:third-party/bzip2:third-party/seqan/core/include:third-party/smhasher:third-party/cqf:third-party/rollinghash
include-dirs = include:third-party/zlib:third-party/bzip2:third-party/seqan/core/include:third-party/smhasher:third-party/rollinghash:third-party/MQF:third-party/MQF/ThirdParty/stxxl/build/include
# include-dirs = lib
## if using system libraries (broken)

7 changes: 4 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
@@ -330,10 +330,11 @@ def run(self):
if sys.platform == 'darwin' and 'gcov' in self.libraries:
self.libraries.remove('gcov')

cqfcmd = ['bash', '-c', 'cd third-party/cqf && make']
spawn(cmd=cqfcmd, dry_run=self.dry_run)
mqfcmd = ['bash', '-c', 'cd third-party/MQF && make all NH=1']
spawn(cmd=mqfcmd, dry_run=self.dry_run)
for ext in self.extensions:
ext.extra_objects.append(path_join("third-party", "cqf", "gqf.o"))
ext.extra_objects.append(path_join("third-party", "MQF", "libMQF.a"))
# ext.extra_objects.append(path_join("third-party", "MQF","ThirdParty","stxxl","build","lib", "libstxxl.a"))

if "z" not in self.libraries:
zcmd = ['bash', '-c', 'cd ' + ZLIBDIR + ' && ( test Makefile -nt'
30 changes: 16 additions & 14 deletions src/oxli/Makefile
Original file line number Diff line number Diff line change
@@ -69,7 +69,7 @@ PREFIX=/usr/local

INCLUDES= -I ../../include/ -I ../../third-party/seqan/core/include/ \
-I ../../third-party/smhasher/ \
-I ../../third-party/cqf/ \
-I ../../third-party/MQF/ \
-I ../../third-party/rollinghash

ifeq ($(USE_SYSTEM_ZLIB), false)
@@ -164,6 +164,8 @@ SONAME = liboxli.$(SHARED_EXT).$(LIB_VERSION)
SONAME_FLAGS = -install_name $(PREFIX)/lib/$(SONAME) \
-compatibility_version $(LIB_VERSION) \
-current_version $(LIB_VERSION)
LDFLAGS += -undefined dynamic_lookup -arch x86_64

else
SHARED_EXT = so
SONAME = liboxli.$(SHARED_EXT).$(LIB_VERSION)
@@ -221,10 +223,10 @@ BZIP2_OBJS_BASE= \
BZIP2_OBJS=$(addprefix $(BZIP2_DIR)/, $(BZIP2_OBJS_BASE))

# Counting bloom filter
CQF_DIR=../../third-party/cqf
CQF_OBJS_BASE= gqf.o
MQF_DIR=../../third-party/MQF
MQF_OBJS_BASE= libMQF.a

CQF_OBJS=$(addprefix $(CQF_DIR)/, $(CQF_OBJS_BASE))
MQF_OBJS=$(addprefix $(MQF_DIR)/, $(MQF_OBJS_BASE))

#### oxli proper below here ####

@@ -259,9 +261,9 @@ PRECOMILE_OBJS += $(BZIP2_OBJS)
PRECLEAN_TARGS += libbz2clean
endif

LIBOXLI_OBJS += $(CQF_OBJS)
PRECOMILE_OBJS += $(CQF_OBJS)
PRECLEAN_TARGS += libcqfclean
LIBOXLI_OBJS += $(MQF_OBJS)
PRECOMILE_OBJS += $(MQF_OBJS)
PRECLEAN_TARGS += libmqfclean

HEADERS= \
hashtable.hh \
@@ -290,18 +292,18 @@ zlibclean:
(cd $(ZLIB_DIR) && make distclean)
libbz2clean:
(cd $(BZIP2_DIR) && make -f Makefile-libbz2_so clean)
libcqfclean:
(cd $(CQF_DIR) && make clean)
libmqfclean:
(cd $(MQF_DIR) && make clean)

clean: $(PRECLEAN_TARGS)
rm -f *.o *.a *.$(SHARED_EXT)* oxli.pc $(TEST_PROGS)

install: $(LIBOXLISO) liboxli.a oxli.pc $(OXLI_HEADERS)
rm -rf $(PREFIX)/include/oxli $(PREFIX)/include/khmer
mkdir -p $(PREFIX)/lib $(PREFIX)/lib/pkgconfig $(PREFIX)/include/oxli
mkdir -p $(PREFIX)/lib $(PREFIX)/lib/pkgconfig $(PREFIX)/BufferedQFCounttable/oxli
cp -r $(OXLI_HEADERS) \
../../third-party/smhasher/MurmurHash3.h \
$(PREFIX)/include/oxli/
$(PREFIX)/BufferedQFCounttable/oxli/
cp oxli.pc $(PREFIX)/lib/pkgconfig/
cp $(LIBOXLISO) liboxli.a $(PREFIX)/lib/
ln -sf $(PREFIX)/lib/$(LIBOXLISO) $(PREFIX)/lib/liboxli.$(SHARED_EXT)
@@ -315,8 +317,8 @@ $(ZLIB_OBJS):
$(BZIP2_OBJS):
(cd $(BZIP2_DIR) && make -f Makefile-libbz2_so $(BZIP2_OBJS_BASE))

$(CQF_OBJS):
(cd $(CQF_DIR) && make)
$(MQF_OBJS):
(cd $(MQF_DIR) && make)

# MurMur3
murmur3.o: ../../third-party/smhasher/MurmurHash3.cc
@@ -326,7 +328,7 @@ murmur3.o: ../../third-party/smhasher/MurmurHash3.cc
$(CXX) $(CXXFLAGS) $(LDFLAGS) -c -o $@ $<

$(LIBOXLISO): $(LIBOXLI_OBJS)
$(CXX) $(CXXFLAGS) $(LDFLAGS) $(SONAME_FLAGS) -shared -o $@ $^
$(CXX) $(CXXFLAGS) $(LDFLAGS) $(SONAME_FLAGS) -shared -o $@ $^ -lstdc++
ln -sf $(SONAME) liboxli.$(SHARED_EXT)

liboxli.a: $(LIBOXLI_OBJS)
82 changes: 30 additions & 52 deletions src/oxli/storage.cc
Original file line number Diff line number Diff line change
@@ -923,34 +923,14 @@ void QFStorage::save(std::string outfilename, WordLength ksize)
unsigned char version = SAVED_FORMAT_VERSION;
unsigned char ht_type = SAVED_QFCOUNT;


outfile.write(SAVED_SIGNATURE, 4);
outfile.write((const char *) &version, 1);
outfile.write((const char *) &ht_type, 1);
outfile.write((const char *) &ksize, sizeof(ksize));

/* just a hack to handle __uint128_t value. Don't know a better to handle it
* right now */
uint64_t tmp_range;
tmp_range = cf.range;

outfile.write((const char *) &cf.nslots, sizeof(cf.nslots));
outfile.write((const char *) &cf.xnslots, sizeof(cf.xnslots));
outfile.write((const char *) &cf.key_bits, sizeof(cf.key_bits));
outfile.write((const char *) &cf.value_bits, sizeof(cf.value_bits));
outfile.write((const char *) &cf.key_remainder_bits, sizeof(cf.key_remainder_bits));
outfile.write((const char *) &cf.bits_per_slot, sizeof(cf.bits_per_slot));
outfile.write((const char *) &tmp_range, sizeof(tmp_range));
outfile.write((const char *) &cf.nblocks, sizeof(cf.nblocks));
outfile.write((const char *) &cf.nelts, sizeof(cf.nelts));
outfile.write((const char *) &cf.ndistinct_elts, sizeof(cf.ndistinct_elts));
outfile.write((const char *) &cf.noccupied_slots, sizeof(cf.noccupied_slots));

#if BITS_PER_SLOT == 8 || BITS_PER_SLOT == 16 || BITS_PER_SLOT == 32 || BITS_PER_SLOT == 64
outfile.write((const char *) cf.blocks, sizeof(qfblock) * cf.nblocks);
#else
outfile.write((const char *) cf.blocks,
(sizeof(qfblock) + SLOTS_PER_BLOCK * cf.bits_per_slot / 8) * cf.nblocks);
#endif
outfile.write((const char *)mf.metadata,sizeof(qfmetadata));
outfile.write((const char *)mf.blocks,mf.metadata->size);
outfile.close();
}

@@ -1011,34 +991,32 @@ void QFStorage::load(std::string infilename, WordLength &ksize)
infile.read((char *) &save_ksize, sizeof(save_ksize));
ksize = save_ksize;

infile.read((char *) &cf.nslots, sizeof(cf.nslots));
infile.read((char *) &cf.xnslots, sizeof(cf.xnslots));
infile.read((char *) &cf.key_bits, sizeof(cf.key_bits));
infile.read((char *) &cf.value_bits, sizeof(cf.value_bits));
infile.read((char *) &cf.key_remainder_bits, sizeof(cf.key_remainder_bits));
infile.read((char *) &cf.bits_per_slot, sizeof(cf.bits_per_slot));
infile.read((char *) &tmp_range, sizeof(tmp_range));

infile.read((char *) &cf.nblocks, sizeof(cf.nblocks));
infile.read((char *) &cf.nelts, sizeof(cf.nelts));
infile.read((char *) &cf.ndistinct_elts, sizeof(cf.ndistinct_elts));
infile.read((char *) &cf.noccupied_slots, sizeof(cf.noccupied_slots));
/* just a hack to handle __uint128_t value. Don't know a better to handle it
* right now */
cf.range = tmp_range;
// deallocate previously allocated blocks
free(cf.blocks);
/* allocate the space for the actual qf blocks */
#if BITS_PER_SLOT == 8 || BITS_PER_SLOT == 16 || BITS_PER_SLOT == 32 || BITS_PER_SLOT == 64
cf.blocks = (qfblock *)calloc(cf.nblocks, sizeof(qfblock));
#else
cf.blocks = (qfblock *)calloc(cf.nblocks, sizeof(qfblock) + SLOTS_PER_BLOCK * cf.bits_per_slot / 8);
#endif
#if BITS_PER_SLOT == 8 || BITS_PER_SLOT == 16 || BITS_PER_SLOT == 32 || BITS_PER_SLOT == 64
infile.read((char *) cf.blocks, sizeof(qfblock) * cf.nblocks);
#else
infile.read((char *) cf.blocks,
(sizeof(qfblock) + SLOTS_PER_BLOCK * cf.bits_per_slot / 8) * cf.nblocks);
#endif
mf.mem = (qfmem *)calloc(sizeof(qfmem), 1);
mf.metadata = (qfmetadata *)calloc(sizeof(qfmetadata), 1);
infile.read((char*)mf.metadata,sizeof(qfmetadata));
mf.blocks = (qfblock *)calloc(mf.metadata->size, 1);
infile.read((char*)mf.blocks, mf.metadata->size);

mf.metadata->num_locks =
10;//should be changed to something realistic like function qf_deserialize
mf.mem->metadata_lock = 0;
/* initialize all the locks to 0 */
mf.mem->locks = (volatile int *)calloc(mf.metadata->num_locks,
sizeof(volatile int));





infile.close();
}

void BufferedMQFStorage::save(std::string outfilename, WordLength ksize)
{
}


void BufferedMQFStorage::load(std::string infilename, WordLength &ksize)
{

}
2 changes: 1 addition & 1 deletion tests/table_fixtures.py
Original file line number Diff line number Diff line change
@@ -58,7 +58,7 @@ def build(k, *args):

if tabletype is QFCounttable:
qf_size = 2**math.ceil(math.log(starting_size, 2))
return tabletype(k, qf_size)
return tabletype(k, qf_size,8)
else:
return tabletype(k, starting_size, n_tables)

135 changes: 123 additions & 12 deletions tests/test_qfstorage.py
Original file line number Diff line number Diff line change
@@ -2,26 +2,137 @@
import random

from khmer import QFCounttable
import khmer
from tests import khmer_tst_utils as utils
from khmer import ReadParser

from . import khmer_tst_utils as utils
import random
import pytest


def test_read_write():
rng = random.Random(1)
MAX_COUNT = 255
MAX_BIGCOUNT = 65535

sketchSize = 1048576


DNA = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC"


def teardown():
utils.cleanup()


@pytest.fixture(params=[khmer.QFCounttable])
def getSketch(request):
return request.param


def test_count_1(getSketch):
print("start")
hi = getSketch(12, sketchSize,8)

kmer = 'G' * 12
hashval = hi.hash('G' * 12)

assert hi.get(kmer) == 0
assert hi.get(hashval) == 0

hi.count(kmer)
assert hi.get(kmer) == 1
assert hi.get(hashval) == 1

hi.count(kmer)
assert hi.get(kmer) == 2
assert hi.get(hashval) == 2

kmer = 'G' * 11

with pytest.raises(ValueError):
hi.hash(kmer)

qf = QFCounttable(20, 1024 * 4)

def test_count_2(getSketch):
hi = getSketch(12, sketchSize,8)
print("done")
kmer = 'G' * 12
hashval = hi.hash('G' * 12)

assert hi.get(kmer) == 0
assert hi.get(hashval) == 0

hi.count(kmer)
assert hi.get(kmer) == 1
assert hi.get(hashval) == 1

hi.count(hashval) # count hashes same as strings
assert hi.get(kmer) == 2
assert hi.get(hashval) == 2


def test_read_write(getSketch):
print("Start")
fname = str.encode(utils.get_temp_filename('zzz'))
rng = random.Random(1)
ctm = getSketch(20, sketchSize,8)

kmers = ["".join(rng.choice("ACGT") for _ in range(20))
for n in range(400)]
for kmer in kmers:
qf.add(kmer)
ctm.add(kmer)

fname = utils.get_temp_filename('zzz')
ctm.save(fname)

qf.save(fname)
# print("Finish")
# # on purpose choose parameters that are different from sct
ctm2 = getSketch.load(fname)
ctm2.load(fname)
# assert ctm.ksize() == ctm2.ksize()
# for kmer in kmers:
# assert ctm.get(kmer) == ctm2.get(kmer)
#
#

# on purpose choose parameters that are different from sct
qf2 = QFCounttable.load(fname)
assert qf.ksize() == qf2.ksize()
for kmer in kmers:
assert qf.get(kmer) == qf2.get(kmer)

def test_maxcount_with_bigcount(getSketch):
# hashtable should not saturate, if use_bigcount is set.
kh = getSketch(4, 128,8)

last_count = None
for _ in range(0, 10000):
kh.count('AAAA')
c = kh.get('AAAA')
print(c)
if c == last_count:
break

last_count = c

assert c == 10000, "should be able to count to 1000: %d" % c


def test_get_ksize(getSketch):
kh = getSketch(22, 16,8)
assert kh.ksize() == 22


#
# def test_read_write():
# rng = random.Random(1)
#
# qf = QFCounttable(20, 1024 * 4)
#
# kmers = ["".join(rng.choice("ACGT") for _ in range(20))
# for n in range(400)]
# for kmer in kmers:
# qf.add(kmer)
#
# fname = utils.get_temp_filename('zzz')
#
# qf.save(fname)
#
# # on purpose choose parameters that are different from sct
# qf2 = QFCounttable.load(fname)
# assert qf.ksize() == qf2.ksize()
# for kmer in kmers:
# assert qf.get(kmer) == qf2.get(kmer)
10 changes: 6 additions & 4 deletions tests/test_tabletype.py
Original file line number Diff line number Diff line change
@@ -77,12 +77,14 @@ def test_n_occupied(AnyTabletype):
assert tt.n_unique_kmers() == 1

tt.add(kmer)
assert tt.n_occupied() == 1
# the CQF implementation we use can use more than one slot to represent
# counts for a single kmer
if not tt.__class__.__name__.startswith("QF"):
assert tt.n_occupied() == 1
else:
assert tt.n_occupied() == 2
# if not tt.__class__.__name__.startswith("QF"):
# assert tt.n_occupied() == 1
# else:
# assert tt.n_occupied() == 2

assert tt.n_unique_kmers() == 1


4 changes: 4 additions & 0 deletions third-party/MQF/.codecov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ignore:
- main.c
- catch.hpp
- test.h
19 changes: 19 additions & 0 deletions third-party/MQF/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@

*.gcda

*.gcno

mqf_test

tmp\.ser

*.o

main

docs/

libgqf\.so
ThirdParty/stxxl/build/*

libMQF.a
23 changes: 23 additions & 0 deletions third-party/MQF/.travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
language: cpp
sudo: required
dist: trusty
compiler:
- g++
os:
- linux


before_install:
- sudo apt-get -qq update
- sudo apt-get install -y make automake autotools-dev
script:
- make all test NH=1
- ./mqf_test
- gcov -n -o . gqf.c > /dev/null;
branches:
only:
- mqfDevelopmenet
- master

after_success:
- bash <(curl -s https://codecov.io/bash)
29 changes: 29 additions & 0 deletions third-party/MQF/LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
BSD 3-Clause License

Copyright (c) 2017,
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
140 changes: 140 additions & 0 deletions third-party/MQF/LayeredMQF.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#include <inttypes.h>
#include <stdbool.h>
#include <pthread.h>
#include "gqf.h"
#include "LayeredMQF.h"
#include <iostream>

using namespace std;
void layeredMQF_init(layeredMQF *qf, uint64_t nslots_singletons ,uint64_t nslots
, uint64_t key_bits, uint64_t value_bits,uint64_t fixed_counter_size, bool mem
, const char *path, uint32_t seed){

if(qf==NULL)
{
qf= new layeredMQF();
}

qf_init(qf->firstLayer_singletons,nslots_singletons,key_bits,value_bits,1,0,mem,path,seed);
qf_init(qf->secondLayer,nslots,key_bits,value_bits,fixed_counter_size,0,mem,path,seed);
}

void layeredMQF_reset(layeredMQF *qf){
qf_reset(qf->firstLayer_singletons);
qf_reset(qf->secondLayer);
}

void layeredMQF_destroy(layeredMQF *qf){
qf_destroy(qf->firstLayer_singletons);
qf_destroy(qf->secondLayer);
}

void layeredMQF_copy(layeredMQF *dest, layeredMQF *src){
qf_copy(dest->firstLayer_singletons,src->firstLayer_singletons);
qf_copy(dest->secondLayer,src->secondLayer);
}

/* Increment the counter for this key/value pair by count. */

bool layeredMQF_insert(layeredMQF *qf, uint64_t key, uint64_t count,
bool lock, bool spin){
if(count==0)
return true;
bool inSecondLayer=qf_count_key(qf->secondLayer,key)>0;
if(inSecondLayer)
{
return qf_insert(qf->secondLayer,key,count,lock,spin);
}
uint64_t CountinFirstLayer=qf_count_key(qf->firstLayer_singletons,key);
if(CountinFirstLayer>1)
{
cerr<<"First Layer has items > 1"<<endl;
}
if(CountinFirstLayer>0)
{
if(qf_remove(qf->firstLayer_singletons,key,CountinFirstLayer,lock,spin))
return qf_insert(qf->secondLayer,key,count+1,lock,spin);
else{
return false;
}
}
if(count==1)
return qf_insert(qf->firstLayer_singletons,key,count,lock,spin);
else
return qf_insert(qf->secondLayer,key,count,lock,spin);


}

/* Remove count instances of this key/value combination. */
bool LayeredMQF_remove(layeredMQF *qf, uint64_t hash, uint64_t count, bool lock, bool spin){
bool res=false;
res|=qf_remove(qf->firstLayer_singletons,hash,count,lock,spin);
res|=qf_remove(qf->secondLayer,hash,count,lock,spin);
return res;
}




/* Return the number of times key has been inserted, with any value,
into qf. */
uint64_t layeredMQF_count_key(const layeredMQF *qf, uint64_t key){
uint64_t res=qf_count_key(qf->secondLayer,key);
if(res==0)
{
res=qf_count_key(qf->firstLayer_singletons,key);
}
return res;
}

int layeredMQF_space(layeredMQF *qf){
return max(qf_space(qf->firstLayer_singletons),qf_space(qf->secondLayer));
}
//
// /* Initialize an iterator */
// bool layeredMQF_iterator(layeredMQF *qf, layeredMQFIterator* qfi, uint64_t position){
// if(qfi ==NULL)
// qfi=new layeredMQFIterator();
// bool res=false;
// res|=qf_iterator(qf->firstLayer_singletons,qfi->firstLayerIterator,position);
// res|=qf_iterator(qf->secondLayer,qfi->secondLayerIterator,position);
// return res;
// }
//
// /* Returns 0 if the iterator is still valid (i.e. has not reached the
// end of the QF. */
// int layeredMQF_qfi_get(layeredMQFIterator*qfi, uint64_t *key, uint64_t *value, uint64_t *count){
//
// }
//
// /* Advance to next entry. Returns whether or not another entry is
// found. */
// int layeredMQF_qfi_next(layeredMQFIterator*qfi);
//
// /* Check to see if the if the end of the QF */
// int layeredMQF_qfi_end(layeredMQFIterator*qfi);
//
// /* For debugging */
// void layeredMQF_dump(const layeredMQF *);
//
// /* write data structure of to the disk */
// void layeredMQF_serialize(const layeredMQF *qf, const char *filename);
//
// /* read data structure off the disk */
// void layeredMQF_deserialize(layeredMQF *qf, const char *filename);
//
// /* mmap the QF from disk. */
// void layeredMQF_read(layeredMQF *qf, const char *path);

/* merge two QFs into the third one. */
//void layeredMQF_merge(layeredMQF *layeredMQFa, layeredMQF *layeredMQFb, QF *qfc);

/* merge multiple QFs into the final QF one. */
//void qf_multi_merge(QF *qf_arr[], int nqf, QF *qfr);

/* find cosine similarity between two QFs. */
// uint64_t qf_inner_product(QF *qfa, QF *qfb);

/* magnitude of a QF. */
//uint64_t qf_magnitude(QF *qf);
136 changes: 136 additions & 0 deletions third-party/MQF/LayeredMQF.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#ifndef layeredMQF_H
#define layeredMQF_H

#include <inttypes.h>
#include <stdbool.h>
#include <pthread.h>
#include "gqf.h"
#ifdef __cplusplus
extern "C" {
#endif


typedef class layeredMQF {
public:
QF* firstLayer_singletons;
QF* secondLayer;
layeredMQF(){
firstLayer_singletons=new QF();
secondLayer=new QF();
}
~layeredMQF()
{
delete firstLayer_singletons;
delete secondLayer;
}
} layeredMQF;




typedef struct layeredMQFIterator {
QFi firstLayerIterator;
QFi secondLayerIterator;
} layeredMQF_iterator;


void layeredMQF_init(layeredMQF *qf, uint64_t nslots_singletons ,uint64_t nslots, uint64_t key_bits, uint64_t value_bits,uint64_t fixed_counter_size, bool mem, const char *path, uint32_t seed);

void layeredMQF_reset(layeredMQF *qf);

void layeredMQF_destroy(layeredMQF *qf);

void layeredMQF_copy(layeredMQF *dest, layeredMQF *src);

/* Increment the counter for this key/value pair by count. */
bool layeredMQF_insert(layeredMQF *qf, uint64_t key, uint64_t count,
bool lock, bool spin);

/* Remove count instances of this key/value combination. */
bool layeredMQF_remove(QF *qf, uint64_t hash, uint64_t count, bool lock=false, bool spin=false);


/*!
@breif Add Tag to item.
@param Qf* qf : pointer to the Filter
@param uint64_t key : hash of the item to be insertedItems
@param uint64_t tag: tag to be added
@param bool lock: For Multithreading, Lock the slot used by the current thread so that other threads can't change the value
@param bool spin: For Multithreading, If there is a lock on the target slot. wait until the lock is freed and insert the count.
@return bool: True if the item is inserted correctly.
*/
uint64_t layeredMQF_add_tag(const QF *qf, uint64_t key, uint64_t tag, bool lock=false, bool spin=false);
/*!
@breif Return the tag associated with a given item.
@param Qf* qf : pointer to the Filter.
@param uint64_t key : hash of the item.
@return uint64_t the tag associated with the input key.
*/
uint64_t layeredMQF_get_tag(const QF *qf, uint64_t key);
/*!
@breif delete the tag associated with a given item.
@param Qf* qf : pointer to the Filter.
@param uint64_t key : hash of the item.
@return bool: Returns true if the item is removed successfully.
*/
uint64_t layeredMQF_remove_tag(const QF *qf, uint64_t key, bool lock=false, bool spin=false);



/* Return the number of times key has been inserted, with any value,
into qf. */
uint64_t layeredMQF_count_key(const layeredMQF *qf, uint64_t key);



/* Initialize an iterator */
bool layeredMQF_qf_iterator(layeredMQF *qf, layeredMQFIterator *qfi, uint64_t position);

/* Returns 0 if the iterator is still valid (i.e. has not reached the
end of the QF. */
int layeredMQF_qfi_get(layeredMQFIterator *qfi, uint64_t *key, uint64_t *value, uint64_t *count);

/* Advance to next entry. Returns whether or not another entry is
found. */
int layeredMQF_qfi_next(layeredMQFIterator *qfi);

/* Check to see if the if the end of the QF */
int layeredMQF_qfi_end(layeredMQFIterator *qfi);

/* For debugging */
void layeredMQF_dump(const layeredMQF *);

/* write data structure of to the disk */
void layeredMQF_serialize(const layeredMQF *qf, const char *filename);

/* read data structure off the disk */
void layeredMQF_deserialize(layeredMQF *qf, const char *filename);

/* mmap the QF from disk. */
void layeredMQF_read(layeredMQF *qf, const char *path);

/* merge two QFs into the third one. */
//void layeredMQF_merge(layeredMQF *layeredMQFa, layeredMQF *layeredMQFb, QF *qfc);

/* merge multiple QFs into the final QF one. */
//void qf_multi_merge(QF *qf_arr[], int nqf, QF *qfr);

/* find cosine similarity between two QFs. */
// uint64_t qf_inner_product(QF *qfa, QF *qfb);

/* magnitude of a QF. */
//uint64_t qf_magnitude(QF *qf);

int layeredMQF_space(layeredMQF *qf);

#ifdef __cplusplus
}
#endif

#endif /* layeredMQF_H */
90 changes: 90 additions & 0 deletions third-party/MQF/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
TARGETS=main libMQF.a
TESTFILES = tests/CountingTests.o tests/HighLevelFunctionsTests.o tests/IOTests.o tests/tagTests.o tests/bufferedCountingTests.o tests/onDiskCountingTests.o

ifdef D
DEBUG=-g
OPT=
else
DEBUG=
OPT=-Ofast
endif

ifdef NH
ARCH=
else
ARCH=-msse4.2 -D__SSE4_2_
endif

ifdef P
PROFILE=-pg -no-pie # for bug in gprof.
endif

CXX = g++ -std=c++11
CC = g++ -std=c++11
LD= g++ -std=c++11

INCLUDE= -I ThirdParty/stxxl/include/ -I ThirdParty/stxxl/build/include/

CXXFLAGS = -fPIC -Wall $(DEBUG) $(PROFILE) $(OPT) $(ARCH) $(INCLUDE) -fpermissive -fopenmp -m64 -I. -Wno-unused-result -Wno-strict-aliasing -Wno-unused-function -Wno-sign-compare

#STXXL= -L ThirdParty/stxxl/build/lib/ -llibstxxl
STXXL= ThirdParty/stxxl/build/lib/libstxxl.a

LDFLAGS = -fopenmp $(DEBUG) $(PROFILE) $(OPT)

#
# declaration of dependencies
#

all: $(TARGETS)

OBJS= gqf.o utils.o bufferedMQF.o onDiskMQF.o


# dependencies between programs and .o files

main: main.o $(STXXL) $(OBJS)
$(LD) $^ $(LDFLAGS) -o $@ $(STXXL)
# dependencies between .o files and .h files

libgqf.so: $(OBJS)
$(LD) $^ $(LDFLAGS) --shared -o $@

test: $(TESTFILES) gqf.c test.o utils.o
$(LD) $(LDFLAGS) -DTEST -o mqf_test test.o LayeredMQF.o bufferedMQF.o onDiskMQF.o utils.o $(TESTFILES) gqf.c $(STXXL)

main.o: gqf.h

libMQF.a: $(STXXL) $(OBJS)
ar rcs libMQF.a $(OBJS) $(STXXL)
# dependencies between .o files and .cc (or .c) files

$(STXXL):
mkdir -p ThirdParty/stxxl/build
cd ThirdParty/stxxl/build && cmake DBUILD_STATIC_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ ..
cd ThirdParty/stxxl/build && make all install


gqf.o: gqf.c gqf.h


bufferedMQF.o: bufferedMQF.cpp bufferedMQF.h





%.o: %.cc
$(CXX) $(CXXFLAGS) $(INCLUDE) $< -c -o $@

%.o: %.c %.h
$(CC) $(CXXFLAGS) $(INCLUDE) $< -c -o $@

%.o: %.cpp %.hpp
$(CXX) $(CXXFLAGS) $(INCLUDE) $< -c -o $@




clean:
rm -f $(OBJS) $(TARGETS) $(TESTS) $(TESTFILES)
Binary file added third-party/MQF/QuotientFilter_MQF.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
170 changes: 170 additions & 0 deletions third-party/MQF/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
# MQF
[![Build Status](https://travis-ci.org/shokrof/MQF.svg?branch=mqfDevelopmenet)](https://travis-ci.org/shokrof/MQF)
[![codecov](https://codecov.io/gh/shokrof/MQF/branch/mqfDevelopmenet/graph/badge.svg)](https://codecov.io/gh/shokrof/MQF)

[Approximate membership query (AMQ)](http://www.cs.cmu.edu/~lblum/flac/Presentations/Szabo-Wexler_ApproximateSetMembership.pdf) data structures provide approximate representation for data using a smaller amount of memory compared to the real data size. As the name suggests, AMQ answers if a particular element exists or not in a given dataset but with possible false positive errors. AMQ has many examples such as [bloom filter](https://en.wikipedia.org/wiki/Bloom_filter), [count-min sketch](https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch), [Quotient Filter](https://en.wikipedia.org/wiki/Quotient_filter), and Counting Quotient Filter([CQF](https://github.com/splatlab/cqf)). Here, we are proposing a new AMQ data structure called Mixed Counting Quotient Filter (MQF).

CQF splits the hash-bits of an item into two components: quotient and remaining parts. Quotient Part is used to determine the target slot. The remaining is inserted into the target slot. The insertion algorithm uses a variant of linear probing to resolve collisions. CQF allows counting the number of instances inserted by using slots following the item's reminder as a counter. CQF has relatively complex scheme to encode counters so that it can distinguish counters from item's reminder.

![alt text](https://raw.githubusercontent.com/shokrof/MQF/mqfDevelopmenet/QuotientFilter_MQF.png)

MQF, Mixed Quotient Filter, is a variant of CQF. MQF uses the same probing technique as CQF. MQF has more metadata called fixed size counters and different encoding for the counters. The improvement makes mqf more memory efficient for wider range of zipifan distribution.

When an item is inserted more than one time, MQF first use the fixed size counter to the number of insertions(count). After the fixed size counter is full, MQF store the count in the following slot and fixed-size counter. MQF uses the necessary number of slots to store the count.

In other words, Fixed-size counters is used in counting and marking the slots used for counting. Fixed-size counters for all slots related to the same item store the counter's maximum value except the last one should store value strictly less than the maximum. When the maximum is reached in the last fixed counter, a new slot is added with empty fixed-size counter.




## Advantages:
- MQF supports counting and removing items. MQF uses variable size counters; therefore, It is memory efficient when count data that follows zipfian distribution where most the items occur once or twice but few items can happen in very high counts..
- MQF has lower bits per element than Bloom filter and Count-min sketches ([Ref](https://www3.cs.stonybrook.edu/~ppandey/files/p775-pandey.pdf)).
- MQF has good data locality which makes it efficient when running on secondary storage.
- MQF supports add/remove tags for the item.
- MQF can be iterated to get the items and counts inserted in the filter.
- MQF supports merging function. Two or more filters can be merged into one filter.
- MQF can be resized to bigger/ smaller filter.


## Documentation
### Building
MQF only requires make and g++ to be installed.
```bash
apt-get install make g++
make NH=1
make test NH=1
./mqf_test
```
### Initialization
MQF Initialization requires the estimation of some parameters: number of slots, Key bits, fixed counter size, and tag size.

Fixed-size counter size is estimated from the shape of data distribution. If most of the items are singletons. The fixed-size counter should be limited to 1 bit. However, If a big portion of items is repeated more than one time, a bigger fixed-size counter will hold more counts and thus save slots.

The number of slots is estimated from the number of items expected to be inserted into the filter. Slots are used for inserting the remaining part of the hash of the items and the count. After calculating the required number of slots, multiply the result by 1.05 because MQF can't be filled by more than 95% of its capacity. Then, round the result to the nearest bigger power of two.

Key bits equal to log2(number of slots) + the remaining part bits. the remaining part bits is estimated from the desired accuracy using the formula below.

![eqn](https://raw.githubusercontent.com/shokrof/MQF/mqfDevelopmenet/r_eqn.gif)

Tag size is straightforward to be estimated. it can be set to zero if tags are not necessary.


1. qf_init
Initialize mqf .
```c++
void qf_init(QF *qf, uint64_t nslots, uint64_t key_bits, uint64_t tag_bits,uint64_t fixed_counter_size, bool mem, const char *path, uint32_t seed);
```
* Qf* qf : pointer to the Filter.
* uint64_t nslots : Number of slots in the filter. Should be of power of two. Maximum number of items to be inserted depends on this number.
* uint64_t key_bits: Number of bits in the hash values.
* uint64_t tag_bits: Number of bits in tag value.
* uint64_t fixed_counter_size: Fixed counter size. must be > 0.
* bool mem: Flag to create the filter on memory. IF false, mmap is used.
* const char * path: In case of mmap. Path of the file used to pack the filter.
* uint32_t seed: useless value. To be removed
2. qf_destroy
3. estimate
### Functions Supported
1. Insert :
Increment the counter for this item by count.
```c++
bool qf_insert(QF *qf, uint64_t key,
uint64_t count,bool lock, bool spin);
```

* Qf* qf : pointer to the Filter
* uint64_t key : hash of the item to be inserted.
* uint64_t count: Count to be added
* bool lock: For Multithreading, Lock the * slot used by the current thread so that other threads can't change the value
* bool spin: For Multithreading, If there is a lock on the target slot. wait until the lock is freed and insert the count.
* returns True if the insertion succeeded.

2. Count:
Return the number of times key has been inserted, with any value, into qf.
```c++
uint64_t qf_count_key(const QF *qf, uint64_t key);
```
* Qf* qf : pointer to the Filter
* uint64_t key : hash of the item to be counted.
* returns the number of times the item is inserted.
3. Remove:
Decrement the counter for this item by count.
```c++
bool qf_remove(QF *qf, uint64_t hash, uint64_t count, bool lock, bool spin);
```
* Qf* qf : pointer to the Filter
* uint64_t key : hash of the item to be removed
* uint64_t count: Count to be removed
* bool lock: For Multithreading, Lock the slot used by the current thread so that other threads can't change the value
* bool spin: For Multithreading, If there is a lock on the target slot. wait until the lock is freed and insert the count.

4. Add/Remove tag to elements
```c++
uint64_t qf_add_tag(const QF *qf, uint64_t key, uint64_t tag, bool lock, bool spin);
uint64_t qf_get_tag(const QF *qf, uint64_t key);
uint64_t qf_remove_tag(const QF *qf, uint64_t key, bool lock, bool spin);
```
* Qf* qf : pointer to the Filter
* uint64_t key : hash of the item.
* uint64_t tag: tag for the item.
* bool lock: For Multithreading, Lock the slot used by the current thread so that other threads can't change the value
* bool spin: For Multithreading, If there is a lock on the target slot. wait until the lock is freed and insert the count.
5. Resize:
resize the filter into a bigger or smaller one
```c++
QF* qf_resize(QF* qf, int newQ, const char * originalFilename=NULL, const char * newFilename=NULL);
```
* Qf* qf : pointer to the Filter
* uint64_t newQ: new number of slots(Q). the slot size will be recalculated to keep the range constant.
* string originalFilename(optional): dump the current filter to the disk to free space for the new filter. Filename is provided as the content of the string.
* string newFilename(optional): the new filter is created on disk. Filename is provided as the content of the string.
* returns a pointer to the new filter

6. Merge: merge more than one filter into a final one.
```c++
void qf_merge(QF *qfa, QF *qfb, QF *qfc);
void qf_multi_merge(QF *qf_arr[], int nqf, QF *qfr);
```
7. Invertible Merge: Invertible merge offers addiotinal functionality to normal merge. Original source filter can be queried for each key.
Invertiable merge function adds tag for each key and creates index structure. The index is map of an integer and vector of integers where the integer is the value of the tags and vector on integers is the ids of the source filters.
```c++
void qf_invertable_merge(QF *qf_arr[], int nqf, QF *qfr,std::map<uint64_t, std::vector<int> > *inverted_index_ptr);
```
* Qf* qf_arr : input array of filters
* int nqf: number of filters
* QF* qfr: pointer to the output filter.
* map (uint64_t,vector(int) ) inverted_index_ptr: Pointer to the output index.




7. Compare:
check if two filters have the same items, counts and tags.
```c++
bool qf_equals(QF *qfa, QF *qfb);
```
8. Intersect
calculate the intersection between two filters.
```c++
void qf_intersect(QF *qfa, QF *qfb, QF *qfc);
```
9. Subtract
subtract the second filter from the first.
```c++
void qf_subtract(QF *qfa, QF *qfb, QF *qfc);
```
10. Space:
returns the space percent occupied by the inserted items.
```c++
int qf_space(QF *qf);
```

### Miscellaneous Functions
1. Capacity
2. Copy
3. Serialize/ Deserialize
4. MMap read
37 changes: 37 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/.travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
language: cpp

env:
global:
# limit parallel threads (default is 32!)
- OMP_NUM_THREADS=4
matrix:
# gcc-4.8 builds
- CMAKE_CC="gcc-4.8" CMAKE_CXX="g++-4.8" CMAKE_FLAGS="" CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug -DUSE_OPENMP=OFF -DNO_CXX11=ON"
- CMAKE_CC="gcc-4.8" CMAKE_CXX="g++-4.8" CMAKE_FLAGS="" CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug -DUSE_OPENMP=ON -DUSE_GNU_PARALLEL=OFF"
- CMAKE_CC="gcc-4.8" CMAKE_CXX="g++-4.8" CMAKE_FLAGS="" CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug -DUSE_OPENMP=ON -DUSE_GNU_PARALLEL=ON"
- CMAKE_CC="gcc-4.8" CMAKE_CXX="g++-4.8" CMAKE_FLAGS="" CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DUSE_OPENMP=ON -DUSE_GNU_PARALLEL=OFF"
- CMAKE_CC="gcc-4.8" CMAKE_CXX="g++-4.8" CMAKE_FLAGS="" CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DUSE_OPENMP=ON -DUSE_GNU_PARALLEL=ON"
# one boost build
- CMAKE_CC="gcc" CMAKE_CXX="g++" CMAKE_FLAGS="" CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug -DUSE_GNU_PARALLEL=OFF" USE_BOOST=ON
# one 32-bit build
- CMAKE_CC="gcc" CMAKE_CXX="g++" CMAKE_FLAGS="-m32" CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug -DUSE_GNU_PARALLEL=OFF" USE_BOOST=OFF
# clang build
- CMAKE_CC="clang" CMAKE_CXX="clang++" CMAKE_FLAGS="-Wno-sign-conversion" CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release"

install:
- sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
- sudo apt-get -qq update
- if [ "$CMAKE_CXX" == "g++-4.8" ]; then sudo apt-get install g++-4.8; fi
- if [ "$CMAKE_FLAGS" == "-m32" ]; then sudo apt-get install g++-multilib; fi
- if [ "$USE_BOOST" == "ON" ]; then sudo apt-get install libboost-all-dev; fi

before_script:
- mkdir build
- cd build
- cmake -DCMAKE_C_COMPILER=$CMAKE_CC -DCMAKE_CXX_COMPILER=$CMAKE_CXX
-DBUILD_TESTS=ON -DTRY_COMPILE_HEADERS=ON
-DUSE_BOOST=$USE_BOOST $CMAKE_ARGS ../
#-DCMAKE_C_FLAGS="-Wconversion $CMAKE_FLAGS" -DCMAKE_CXX_FLAGS="-Wconversion $CMAKE_FLAGS"

script:
- make -j4 && ./tools/stxxl_tool info && ctest -V
15 changes: 15 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/AUTHORS
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
Andreas Beckmann <beckmann@cs.uni-frankfurt.de>
Daniel Feist <daniel.feist@student.kit.edu>
Daniel Godas-Lopez <dgodas@gmail.com>
Ilja Andronov <sni4ok@yandex.ru>
Jaroslaw Fedorowicz <fedorow@cs.uni-frankfurt.de>
Jens Mehnert <jmehnert@mpi-sb.mpg.de>
Johannes Singler <singler@ira.uka.de>
Manuel Krings
Markus Westphal <mail@markuswestphal.de>
Peter Sanders <sanders@mpi-sb.mpg.de>
Raoul Steffen <R-Steffen@gmx.de>
Roman Dementiev <dementiev@ira.uka.de>
Thomas Keh <thomas.keh@student.kit.edu>
Thomas Nowak <t.nowak@imail.de>
Timo Bingmann <tb@panthema.net>
536 changes: 536 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/CHANGELOG

Large diffs are not rendered by default.

791 changes: 791 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/CMakeLists.txt

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/INSTALL
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
BASIC INFORMATION

For detailed installation instructions, including instruction for installation
in a Linux or Windows based environment and application compilation hints
(including an example Makefile) please read the doxygen manual:
http://stxxl.sourceforge.net/


QUICK INSTALLATION INSTRUCTIONS FOR POSIX COMPATIBLE SYSTEMS

* Extract the tarball and change into the stxxl root directory.

* Create a separate build directory (in-source building is prohibited):
- mkdir build
- cd build

* Run cmake to detect the build platform (where ".." is the STXXL tree):
- cmake ..

* Check cmake's output for errors, then run build the library:
- make

* See http://stxxl.sourceforge.net/ for more details.
23 changes: 23 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/LICENSE_1_0.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
Boost Software License - Version 1.0 - August 17th, 2003

Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
27 changes: 27 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
DESCRIPTION

STXXL is an implementation of the C++ standard template library STL for
external memory (out-of-core) computations, i. e. STXXL implements containers
and algorithms that can process huge volumes of data that only fit on disks.
While the closeness to the STL supports ease of use and compatibility with
existing applications, another design priority is high performance.


DOCUMENTATION

See the Doxygen documentation for installation manual and programmer
documentation: http://stxxl.sourceforge.net


LICENSE TERMS

STXXL is distributed under the Boost Software License, Version 1.0.
(See accompanying file LICENSE_1_0.txt or copy at
http://www.boost.org/LICENSE_1_0.txt)


BUGS AND QUESTIONS

If you find any bugs or have any questions, please visit the forums at

http://sourceforge.net/projects/stxxl/forums
116 changes: 116 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/TODO
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
* asynchronous pipelining
(currently being developed in branch parallel_pipelining_integration)

* if the stxxl disk files have been enlarged because more external memory
was requested by the program, resize them afterwards to
max(size_at_program_start, configured_size)
https://sourceforge.net/forum/message.php?msg_id=4925158

* allocation strategies: provide a method get_num_disks()
and don't use stxxl::config::get_instance()->disks_number() inappropriately

* implement recursion in stable_ksort and do not assume random key
distribution, do sampling instead
as a start, at least abort early if the expected size of a bucket is larger
than the memory available to sort it

* debug stable_ksort in depth, there are still some crashing cases left

* continue using the new approach for STXXL_VERBOSE:
$(CXX) -DSTXXL_VERBOSE_FOO=STXXL_VERBOSEx

* check+fix all sorted_runs() calls to not cause write I/Os

* on disk destruction, check whether all blocks had been deallocated before,
i.e. free_bytes == disk_size

* implement an allocator which looks at the available free space on disks
when distributing blocks to disks, or does load-balancing depending
on the given speed of the disks

* abstract away block manager so every container can attach to a file.

* retry incomplete I/Os for all file types (currently only syscall)

* iostats: add support for splitting I/Os and truncating reads at EOF

* do not specify marginal properties such as allocation strategy as part of the template type.
instead, make such properties dynamically configurable using run-time polymorphism,
which would incur only a negligible running time overhead (one virtual function call per block).

* If we get rid of the sentinels (at least for sorting), we can drop the
min_value()/max_value() requirement on the comparator and therefore
unmodified comparators could be used for stxxl.

* Traditionally stxxl only supports PODs in external containers, e.g. nothing
that has non-trivial constructors, destructors or copy/assignemnt operators.
That is neccessary to allow fast operations by accessing the underlying
blocks directly without caring about copying/moving/otherwise manipulating
the content or uninitialized elements.

For some applications it could be helpful to have containers that take
non-POD elements, but you would probably lose the features you gain by having
direct access to the blocks.

Some discussion regarding using some shared_ptr<> in a stxxl::vector can be
found here: https://sourceforge.net/projects/stxxl/forums/forum/446474/topic/4099030

* The following code is not correct:

file * f = create_file(..., RDONLY);
vector<> v(f);
v[42]; // non-const access, sets dirty flag

but the error message it produces is very unclear and may come very
asynchronous:

terminate called after throwing an instance of 'stxxl::io_error' what():
Error in function virtual void stxxl::mmap_file::serve(const
stxxl::request*): Info: Mapping failed. Page size: 4096 offset modulo page
size 0 Permission denied

Caused by stxxl::vector<> when swapping out a dirty page. Reproducible with
containers/test_vector_sizes.stxxl.bin

Possible solution: vector::write_page() should check if
vector-is-bound-to-a-file and file-is-opened-read-only throw "please use only
const access to a read only vector"

* I've noticed that the destructor for stxxl::vector flushes any dirty data to
disk. Possibly that makes sense for a file-backed vector (I haven't looked at
how those work), but for a vector that uses the disks configured in .stxxl it
is wasted I/O as the data is immediately freed. For the moment I am working
around this by calling thevector.clear() immediately before destroying it,
which seems to prevent the writeback (based on querying the stats).

* There should be a function like

bool supports_filetype(const char *)

that allows to check at runtime whether a file type (given as string) is
available. Useful for tests that take a file type as parameter - they
currently throw an exception "unsupported filetype".

* Change the constructor
stxxl::vector<>::vector(stxxl::file*)
to
stxxl::vector<>::vector(shared_ptr<stxxl::file>)
so that ownership of the file can be transferred to the vector and cleanup
(closing etc.) of the file can happen automatically

* materialize() that takes an empty vector as argument and grows it, including
allocating more blocks of fills the existing part of a vectorand once we came
to it's end automatically expands the vector (including proper resizes). This
approach should be more efficient (due to overlapping) than repeated
push_back()

* streamop passthrough() useful if one wants to replace some streamop by a
no-op by just redefining one type needs peformance measurements, hopefully
has no impact on speed.

* stream::discard() should not need to call in.operator*() - are there side
effects to be expected?
-> add a streamop "inspect" or so for this purpose
-> drop *in call from discard

* add discard(StreamOp, unsigned_irgendwas n): discard at most n elements
186 changes: 186 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/doc/DoxygenLayout.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
<doxygenlayout version="1.0">
<!-- Generated by doxygen 1.8.3.1 -->
<!-- Navigation index tabs for HTML output -->
<navindex>
<tab type="mainpage" visible="yes" title=""/>
<tab type="pages" visible="yes" title="" intro=""/>
<tab type="modules" visible="yes" title="" intro=""/>
<tab type="namespaces" visible="yes" title="">
<tab type="namespacelist" visible="yes" title="" intro=""/>
<tab type="namespacemembers" visible="yes" title="" intro=""/>
</tab>
<tab type="classes" visible="yes" title="">
<tab type="classlist" visible="yes" title="" intro=""/>
<tab type="classindex" visible="$ALPHABETICAL_INDEX" title=""/>
<tab type="hierarchy" visible="yes" title="" intro=""/>
<tab type="classmembers" visible="yes" title="" intro=""/>
</tab>
<tab type="files" visible="yes" title="">
<tab type="filelist" visible="yes" title="" intro=""/>
<tab type="globals" visible="yes" title="" intro=""/>
</tab>
<tab type="examples" visible="yes" title="" intro=""/>
</navindex>

<!-- Layout definition for a class page -->
<class>
<detaileddescription title=""/>
<inheritancegraph visible="$CLASS_GRAPH"/>
<collaborationgraph visible="$COLLABORATION_GRAPH"/>
<memberdecl>
<nestedclasses visible="yes" title=""/>
<publictypes title=""/>
<publicslots title=""/>
<signals title=""/>
<publicmethods title=""/>
<publicstaticmethods title=""/>
<publicattributes title=""/>
<publicstaticattributes title=""/>
<protectedtypes title=""/>
<protectedslots title=""/>
<protectedmethods title=""/>
<protectedstaticmethods title=""/>
<protectedattributes title=""/>
<protectedstaticattributes title=""/>
<packagetypes title=""/>
<packagemethods title=""/>
<packagestaticmethods title=""/>
<packageattributes title=""/>
<packagestaticattributes title=""/>
<properties title=""/>
<events title=""/>
<privatetypes title=""/>
<privateslots title=""/>
<privatemethods title=""/>
<privatestaticmethods title=""/>
<privateattributes title=""/>
<privatestaticattributes title=""/>
<friends title=""/>
<related title="" subtitle=""/>
<membergroups visible="yes"/>
</memberdecl>
<memberdef>
<inlineclasses title=""/>
<typedefs title=""/>
<enums title=""/>
<constructors title=""/>
<functions title=""/>
<related title=""/>
<variables title=""/>
<properties title=""/>
<events title=""/>
</memberdef>
<allmemberslink visible="yes"/>
<usedfiles visible="$SHOW_USED_FILES"/>
<authorsection visible="yes"/>
</class>

<!-- Layout definition for a namespace page -->
<namespace>
<briefdescription visible="yes"/>
<memberdecl>
<nestednamespaces visible="yes" title=""/>
<classes visible="yes" title=""/>
<typedefs title=""/>
<enums title=""/>
<functions title=""/>
<variables title=""/>
<membergroups visible="yes"/>
</memberdecl>
<detaileddescription title=""/>
<memberdef>
<inlineclasses title=""/>
<typedefs title=""/>
<enums title=""/>
<functions title=""/>
<variables title=""/>
</memberdef>
<authorsection visible="yes"/>
</namespace>

<!-- Layout definition for a file page -->
<file>
<briefdescription visible="yes"/>
<includes visible="$SHOW_INCLUDE_FILES"/>
<includegraph visible="$INCLUDE_GRAPH"/>
<includedbygraph visible="$INCLUDED_BY_GRAPH"/>
<sourcelink visible="yes"/>
<memberdecl>
<classes visible="yes" title=""/>
<namespaces visible="yes" title=""/>
<defines title=""/>
<typedefs title=""/>
<enums title=""/>
<functions title=""/>
<variables title=""/>
<membergroups visible="yes"/>
</memberdecl>
<detaileddescription title=""/>
<memberdef>
<inlineclasses title=""/>
<defines title=""/>
<typedefs title=""/>
<enums title=""/>
<functions title=""/>
<variables title=""/>
</memberdef>
<authorsection/>
</file>

<!-- Layout definition for a group page -->
<group>
<briefdescription visible="yes"/>
<groupgraph visible="$GROUP_GRAPHS"/>
<memberdecl>
<nestedgroups visible="yes" title=""/>
<dirs visible="yes" title=""/>
<files visible="yes" title=""/>
<namespaces visible="yes" title=""/>
<classes visible="yes" title=""/>
<defines title=""/>
<typedefs title=""/>
<enums title=""/>
<enumvalues title=""/>
<functions title=""/>
<variables title=""/>
<signals title=""/>
<publicslots title=""/>
<protectedslots title=""/>
<privateslots title=""/>
<events title=""/>
<properties title=""/>
<friends title=""/>
<membergroups visible="yes"/>
</memberdecl>
<detaileddescription title=""/>
<memberdef>
<pagedocs/>
<inlineclasses title=""/>
<defines title=""/>
<typedefs title=""/>
<enums title=""/>
<enumvalues title=""/>
<functions title=""/>
<variables title=""/>
<signals title=""/>
<publicslots title=""/>
<protectedslots title=""/>
<privateslots title=""/>
<events title=""/>
<properties title=""/>
<friends title=""/>
</memberdef>
<authorsection visible="yes"/>
</group>

<!-- Layout definition for a directory page -->
<directory>
<briefdescription visible="yes"/>
<directorygraph visible="yes"/>
<memberdecl>
<dirs visible="yes"/>
<files visible="yes"/>
</memberdecl>
<detaileddescription title=""/>
</directory>
</doxygenlayout>
79 changes: 79 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/doc/coding_style.dox
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// -*- mode: c++; mode: visual-line; mode: flyspell; fill-column: 100000 -*-
/***************************************************************************
* doc/coding_style.dox
*
* Coding style guidelines of the STXXL.
*
* Part of the STXXL. See http://stxxl.sourceforge.net
*
* Copyright (C) 2013 Timo Bingmann <tb@panthema.net>
*
* Distributed under the Boost Software License, Version 1.0.
* (See accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
**************************************************************************/

namespace stxxl {

/** \page coding_style Coding Style Guidelines
\author Timo Bingmann (2013)
STXXL coding style follows the current style of STL and Boost, as STXXL strives to provide STL compatible interfaces. Following these guidelines greatly shortens the maintainers' response time and increases their willingness to incorporate your code into STXXL.
We cannot provide a full coding style document here, the source code itself must serve as a large example. As the STXXL has grown historically, not all parts of the STXXL library following this coding style. But we do put down the following list of rules:
- Naming of classes, structs, functions and variables must follow STL naming conventions: no capital letters, use underscores between words.
- The exception are template parameters: use CamelCase for the parameters themselves, and the underscore version for the typedef.
- The smaller the scope of a variable is, the shorter its name should be.
- Member attributes of larger classes should be prefixed with \c m_ ! Attributes of smaller structs can omit \c m_.
- Tabs should not use used, indentation width is 4 spaces.
- The following code shows example of the rules above:
\code
//! A class that does something important.
template <typename ValueType>
class some_class
{
protected:
//! a class attribute, prefixed with m_
int m_used;
public:
//! types are almost always suffixes with _type,
//! with _iterator being an exception.
typedef ValueType value_type;
//! \name Group of Functions
//! \{
//! Return current state of page in cache.
//! \param page please only document parameters when needed.
void get_state(int page) const
{
int ret = 0;
for (size_t i = 0; i < list.size(); ++i) {
if (list[i].page == page)
ret = list[i].state;
}
return ret;
}
//! \}
};
\endcode
- Use of "using namespace" is absolutely prohibited.
- All public interfaces must be documented using doxygen (see tags in example).
- All containers and extensions must provide a simple tutorial and example. Design documentation is greatly recommended.
- All extensions and subsystems must provide tests which sufficiently cover the functions.
- All preprocessor macros should begin with \c STXXL_ .
*/

} // namespace stxxl
530 changes: 530 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/doc/common.dox

Large diffs are not rendered by default.

1,264 changes: 1,264 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/doc/design.dox

Large diffs are not rendered by default.

43 changes: 43 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/doc/doxygen-extra.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/* Customization of doxygen's style for STXXL */

/* space out lines slightly */
body {
line-height: 1.20em;
}

/* increase size of code snippets */
div.fragment {
padding: 4px 6px;
}

div.fragment div.line {
line-height: 135%;
}

/* make heading smaller and quit justifying */
h1 { font-size: 16pt; margin: 12pt 0px 8pt 0px; text-align: left; }
h2 { font-size: 15pt; margin: 12pt 0px 8pt 0px; text-align: left; }
h3 { font-size: 14pt; margin: 12pt 0px 8pt 0px; text-align: left; }
h4 { font-size: 14pt; margin: 10pt 0px 6pt 0px; text-align: left; }

/* limit maximum width of text content */
div.contents {
max-width: 120ex;
text-align: justify;
hyphens: auto;
moz-hyphens: auto;
o-hyphens: auto;
-ms-hyphens: auto;
-webkit-hyphens: auto;
}

/* make links more blue */
a {
color: #3D4A9C;
font-weight: normal;
text-decoration: none;
}

.contents a:visited {
color: #3D4A9C;
}
161 changes: 161 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/doc/faq.dox
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
// -*- mode: c++; mode: visual-line; mode: flyspell; fill-column: 100000 -*-
/***************************************************************************
* doc/faq.dox
*
* Frequently asked and answered questions
*
* Part of the STXXL. See http://stxxl.sourceforge.net
*
* Copyright (C) 2007 Andreas Beckmann <beckmann@mpi-inf.mpg.de>
* Copyright (C) 2013 Timo Bingmann <tb@panthema.net>
*
* Distributed under the Boost Software License, Version 1.0.
* (See accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
**************************************************************************/

/** \page faq FAQ - Frequently Asked Questions
\section faq_latest Latest version of this FAQ
The most recent version of this FAQ can always be found <a href="http://stxxl.sourceforge.net">here</a>.
\section faq_compilers Supported Compilers and Platforms
The following compilers have been tested in different \c STXXL configurations. Other compilers might work, too, but we don't have the resources (systems, compilers or time) to test them. Feedback is welcome.
Please note that from STXXL 1.4.0 on, only 64-bit systems are fully supported. Compilation on 32-bit seems to work, but we cannot support it anymore.
The compilers marked with '*' are the maintainers' favorite choices and are most thoroughly tested.
compiler | supported options
----------------------- | --------------------------
gcc 4.9.1 | stxxl parallel (boost) (c++11)
gcc 4.8.3 * | stxxl parallel (boost) (c++11)
gcc 4.7.3 | stxxl parallel (boost) (c++0x)
gcc 4.6.4 | stxxl parallel (boost) (c++0x)
gcc 4.5.4 | stxxl parallel (boost) (c++0x)
gcc 4.4.7 | stxxl parallel (boost) (c++0x)
gcc 4.3.6 | stxxl (boost)
gcc 4.1.2 | stxxl (boost)
gcc 3.4.6 | stxxl (boost)
gcc 3.3 | unsupported
icpc 2015.0.090 * | stxxl (boost) (c++0x)
icpc 2013.5.192 * | stxxl (boost) (c++0x)
icpc 2011.13.367 | stxxl (boost) (c++0x)
clang++ 3.2, 3.3, 3.4.2 | stxxl (boost) (c++0x)
mingw-w64 gcc 4.8.3 | stxxl parallel (boost) (c++11)
cygwin gcc 4.8.3 | stxxl parallel (boost) (c++11)
msvc 2013 12.0 * | stxxl (boost) (c++11)
msvc 2012 11.0 | stxxl (boost) (c++0x)
msvc 2010 10.0 | stxxl boost required
- The option "parallel" uses the __gnu_parallel extensions in some parts of STXXL. For all \c gcc versions >= 4.4 the __gnu_parallel extensions are ON by default. Support for MCSTL (predecessor of __gnu_parallel) was removed in STXXL 1.4.0.
- Boost is optional and not recommended on all systems, except MSVC 2010. It provides no advantages on other platforms. \n
STXXL has been tested with Boost 1.40.0, 1.42.0 and 1.46.1. Other versions may work, too, but older versions will not get support.
- Support for C++0x and C++11 is integrated and automatically detected. No core parts of STXXL require C++11.
- All options are automatically detected by CMake.
\section faq_credit How can I credit STXXL, and thus foster its development?
- For all users:
- Sign up at Ohloh and add yourself as an STXXL user / rate STXXL: http://www.ohloh.net/p/stxxl
- Rate STXXL at heise Software-Verzeichnis (German): http://www.heise.de/software/download/stxxl/76072
- Rate STXXL at SourceForge: https://sourceforge.net/projects/stxxl/
- For scientific work: Cite the papers mentioned here: http://stxxl.sourceforge.net/
- For industrial users: Tell us the name of your company, so we can use it as a reference.
\section faq_nonPODs References to Elements in External Memory Data Structures
You should not pass or store references to elements in an external memory data structure. When the reference is used, the block that contains the element may be no longer in internal memory.<br> Use/pass an iterator (reference) instead.<br> For an \c stxxl::vector with \c n pages and LRU replacement strategy, it can be guaranteed that the last \c n references obtained using \c stxxl::vector::operator[] or dereferencing an iterator are valid.
However, if \c n is 1, even a single innocent-looking line like
\verbatim
std::cout << v[0] << " " << v[1000000] << std::endl;
\endverbatim
can lead to inconsistent results.
\section faq_templateparam Parameterizing STXXL Containers
STXXL container types like stxxl::vector can be parameterized only with a value type that is a
<a href="http://en.wikipedia.org/wiki/Plain_old_data_structures">POD</a>
(i. e. no virtual functions, no user-defined copy assignment/destructor, etc.)
and does not contain references (including pointers) to internal memory.
Usually, "complex" data types do not satisfy this requirements.
This is why stxxl::vector<std::vector<T> > and stxxl::vector<stxxl::vector<T> > are invalid.
If appropriate, use std::vector<stxxl::vector<T> >, or emulate a two-dimensional array by
doing index calculation.
\section faq_threadsafe Thread-Safety
The I/O and block management layers are thread-safe (since release 1.1.1).
The user layer data structures are not thread-safe.<br>
I.e. you may access <b>different</b> \c STXXL data structures from concurrent threads without problems,
but you should not share a data structure between threads (without implementing proper locking yourself).<br>
This is a design choice, having the data structures thread-safe would mean a significant performance loss.
\section faq_diskalloc Disk Allocation on Multiple Disks
Q: I have configured several disks to use with STXXL. Why does STXXL fail complaining about the <b>lack of space</b>? According to my calclulations, the space on the disks should be sufficient.
A: This may happen if the disks have <b>different size</b>. With the default parameters \c STXXL containers use randomized block-to-disk allocation strategies
that distribute data evenly between the disks but ignore the availability of free space on them. Thus when the smallest disk is full, the program will abort because it cannot grow the file on that disk.
A2: This round-robin disk allocation is due to the history of STXXL's support for parallel disk algorithms. It would be great if someone would contribute a patch for this issue. This would require adapting stxxl::disk_allocator and stxxl::block_manager to skip full disks when allocating new blocks.
\section faq_msclr STXXL in a Microsoft CLR Library
From STXXL user Christian, posted in the <a href="https://sourceforge.net/projects/stxxl/forums/forum/446474/topic/3407329">forum</a>:
Precondition: I use STXXL in a Microsoft CLR Library (a special DLL). That means that managed code and native code (e.g. STXXL) have to co-exist in your library.
Symptom: Application crashes at process exit, when the DLL is unloaded.
Cause: STXXL's singleton classes use the \c atexit() function to destruct themselves at process exit. The exit handling will cause the process to crash at exit (still unclear if it's a bug or a feature of the MS runtime).
Solution:
1.) Compiled STXXL static library with \c STXXL_NON_DEFAULT_EXIT_HANDLER defined.
2.) For cleanup, \c stxxl::run_exit_handlers() has now to be called manually. To get this done automatically:
Defined a CLI singleton class "Controller":
\verbatim
public ref class Controller {
private:
static Controller^ instance = gcnew Controller;
Controller();
};
\endverbatim
Registered my own cleanup function in Controller's constructor which will manage to call \c stxxl::run_exit_handlers():
\verbatim
#pragma managed(push, off)
static int myexitfn()
{
stxxl::run_exit_handlers();
return 0;
}
#pragma managed(pop)
Controller::Controller()
{
onexit(myexitfn);
}
\endverbatim
*/
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6,099 changes: 6,099 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/doc/images/btree_uml.xmi

Large diffs are not rendered by default.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
576 changes: 576 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/doc/images/layer_diagram.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
581 changes: 581 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/doc/images/pdm.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
759 changes: 759 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/doc/images/san00b_pqueue.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
118 changes: 118 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/doc/images/simple_logo.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
566 changes: 566 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/doc/install.dox

Large diffs are not rendered by default.

84 changes: 84 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/doc/introduction.dox
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/***************************************************************************
* doc/introduction.dox
*
* Most of this is from the old TeX tutorial and papers.
* Edited 2013 by Timo Bingmann <tb@panthema.net>
*
* Part of the STXXL. See http://stxxl.sourceforge.net
*
* Copyright (C) 2007 Roman Dementiev <dementiev@mpi-sb.mpg.de>
*
* Distributed under the Boost Software License, Version 1.0.
* (See accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
**************************************************************************/

/** \page introduction Introduction to External Memory

\author Roman Dementiev, Lutz Kettner, Peter Sanders (2007)

Massive data sets arise naturally in many domains. Spatial databases of geographic information systems like GoogleEarth and NASA’s World Wind store terabytes of geographically referenced information that includes the whole Earth. In computer graphics one has to visualize highly complex scenes using only a conventional workstation with limited memory \cite Farias2001. Billing systems of telecommunication companies evaluate terabytes of phone call log files \cite BillingLarge. One is interested in analyzing huge network instances like a web graph \cite Donato2006 or a phone call graph. Search engines like Google and Yahoo provide fast text search in their databases indexing billions of web pages. A precise simulation of the Earth’s climate needs to manipulate with petabytes of data \cite Moore2000. These examples are only a sample of numerous applications that have to process vast amounts of data.

The <i>internal memories</i> of computers can keep only a small fraction of these large data sets. During the processing the applications need to access the <i>external memory</i> (e.g. hard disks) very frequently. One such access can be about 106 times slower than a main memory access. Therefore, the disk accesses (I/Os) become the main bottleneck.

The data are stored on the magnetic surface of a hard disk that rotates 4200–15,000 times per minute. In order to read or write a designated track of data, the disk controller moves the read/write arm to the position of this track (seek latency). If only a part of the track is needed, there is an additional rotational delay. The total time for such a disk access is an average of 3–10 ms for modern disks. The latency depends on the size and rotational speed of the disk and can hardly be reduced because of the \a mechanical nature of hard disk technology. After placing the read/write arm, the data are streamed at a high speed which is limited only by the surface data density and the MB bandwidth of the I/O interface. This speed is called sustained throughput and achieves up to 80 MB/s nowadays. In order to amortize the high seek latency, one reads or writes the data in blocks. The block size is balanced when the seek latency is a fraction of the sustained transfer time for the block. Good results show blocks containing a full track. For older low-density disks of the early 90s the track capacities were about 16–64 kB. Nowadays, disk tracks have a capacity of several megabytes.

Operating systems implement the virtual memory mechanism that extends the working space for applications, mapping an external memory file (page/swap file) to virtual addresses. This idea supports the Random Access Machine model \cite Neu45 in which a program has an infinitely large main memory. With virtual memory the application does not know where its data are located: in the main memory or in the swap file. This abstraction does not have large running time penalties for simple sequential access patterns: the operating system is even able to predict them and to load the data in ahead. For more complicated patterns these remedies are not useful and even counterproductive: the swap file is accessed very frequently; the executable code can be swapped out in favor of unnecessary data; the swap file is highly fragmented and thus many random I/O operations are needed even for scanning.

\section introduction_io_model I/O-efficient Algorithms and Models

The operating system cannot adapt to complicated access patterns of applications dealing with massive data sets. Therefore, there is a need for explicit handling of external memory accesses. The applications and their underlying algorithms and data structures should care about the pattern and the number of external memory accesses (I/Os) which they cause.

Several simple models have been introduced for designing I/O-efficient algorithms and data structures (also called <i>external memory</i> algorithms and data structures). The most popular and realistic model is the Parallel disk model (PDM) of Vitter and Shriver \cite VitShr94both. In this model, I/Os are handled explicitly by the application. An I/O operation transfers a block of \a B consecutive elements from/to a disk to amortize the latency. The application tries to transfer \a D blocks between the main memory of size \a M bytes and \a D independent disks in one I/O step to improve bandwidth, see figure below. The input size is \a N bytes which is (much) larger than \a M. The main complexity metrics of an I/O-efficient algorithm in PDM are the number of I/O steps (main metric) and the number of operations executed by the CPU. If not I/O but a slow internal CPU processing is the limiting factor of the performance of an application, we call such behavior <i>CPU-bound</i>.

The PDM has become the standard theoretical model for designing and analyzing I/O-efficient algorithms. For this model, the following matching upper and lower bounds for I/O complexity are known. Scanning a sequence of N items takes \f$ \mathrm{scan}(N) = \Theta(N / (DB)) \f$ I/Os. Sorting a sequence of \a N items takes \f$ \mathrm{sort}(N) = \Theta(N / (DB) \cdot \log_{M/B} (N/M)) \f$ I/Os. Online search among \a N items takes \f$ \mathrm{search}(N) = \Theta(\log_{DB} (N)) \f$ I/Os.

\section introduction_memory_hierarchies Memory Hierarchies

The PDM measures the transfers between the main memory and the hard disks, however, in modern architectures, the CPU does not access the main memory directly. There are a few levels of faster memory caches in-between (figure below): CPU registers, level one (L2), level two (L2) and even level three (L3) caches. The main memory is cheaper and slower than the caches. Cheap dynamic random access memory, used in the majority of computer systems, has an access latency up to 60 ns whereas L1 has a latency of less than a ns. However, for a streamed access a high bandwidth of several GB/s can be achieved. The discrepancy between the speed of CPUs and the latency of the lower hierarchy levels grows very quickly: the speed of processors is improved by about 55% yearly, the hard disk access latency only by 9% \cite Patterson2004. Therefore, the algorithms that are aware of the memory hierarchy will continue to benefit in the future and the development of such algorithms is an important trend in computer science.

\image html pdm_small.png "Schemes of parallel disk model (left) and memory hierarchy (right)"

The PDM model only describes a single level in the hierarchy. An algorithm tuned to make a minimum number of I/Os between two particular levels could be I/O-inefficient on other levels. The cache-oblivious model in \cite FLPR99 avoids this problem by not providing the knowledge of the block size \a B and main memory size \a M to the algorithm. The benefit of such an algorithm is that it is I/O-efficient on all levels of the memory hierarchy across many systems without fine tuning for any particular real machine parameters. Many basic algorithms and data structures have been designed for this model (\cite FLPR99, \cite ABDHBM02, \cite BDIW02, \cite BFMZ04). A drawback of cache-oblivious algorithms playing a role in practice is that they are only asymptotically I/O-optimal. The constants hidden in the O-notation of their I/O-complexity are significantly larger than the constants of the corresponding I/O-efficient PDM algorithms (on a particular memory hierarchy level). For instance, a tuned cache-oblivious funnel sort implementation \cite ChristianiThesis is 2.6–4.0 times slower than our I/O-efficient sorter from STXXL (see \ref design_algo_sorting) for out-of-memory inputs \cite Ajwani2007. A similar funnel sort implementation \cite BFV04 is up to two times slower than the I/O-efficient sorter from the TPIE library for large inputs. The reason for this is that these I/O-efficient sorters are highly optimized to minimize the number of transfers between the main memory and the hard disks where the imbalance in the access latency is the largest. Cache-oblivious implementations tend to lose on the inputs, exceeding the main memory size, because they do (a constant factor) more I/Os at the last level of memory hierarchy. In this paper, we concentrate on extremely large out-of-memory inputs, therefore, we will design and implement algorithms and data structures efficient in the PDM.

\section introduction_algorithm_engineering Algorithm Engineering for Large Data Sets

Theoretically, I/O-efficient algorithms and data structures have been developed for many problem domains: graph algorithms, string processing, computational geometry, etc. (see the surveys \cite MSS03, \cite Vit01). Some of them have been implemented: sorting, matrix multiplication (\cite TPIEscientific96), search trees (\cite ChiangPHD, \cite Bkdtree03, \cite DynRTrees99, \cite CRBtree03), priority queues (\cite Brengel00), text processing (\cite CraFer02). However, only few of the existing I/O-efficient algorithms have been studied experimentally. As new algorithmic results rely on previous ones, researchers, who would like to engineer practical implementations of their ideas and show the feasibility of external memory computation for the solved problem, need to invest much time in the careful design of unimplemented underlying external algorithms and data structures. Additionally, since I/O-efficient algorithms deal with hard disks, a good knowledge of low-level operating system issues is required when implementing details of I/O accesses and file system management. This delays the transfer of theoretical results into practical applications, which will have a tangible impact for industry. Therefore, one of the primary goals of algorithm engineering for large data sets is to create software frameworks and libraries that handle both the low-level I/O details efficiently and in an abstract way, and provide well-engineered and robust implementations of basic external memory algorithms and data structures.

\section introduction_stl C++ Standard Template Library

The Standard Template Library (STL) \cite stepanov94standard is a C++ library which is included in every C++ compiler distribution. It provides basic data structures (called containers) and algorithms. STL containers are generic and can store any built-in or user data type that supports some elementary operations (e.g. copying and assignment). STL algorithms are not bound to a particular container: an algorithm can be applied to any container that supports the operations required for this algorithm (e.g. random access to its elements). This flexibility significantly reduces the complexity of the library.

STL is based on the C++ template mechanism. The flexibility is supported using compile-time polymorphism rather than the object-oriented run-time polymorphism. The run-time polymorphism is implemented in languages like C++ with the help of virtual functions that usually cannot be inlined by C++ compilers. This results in a high per-element penalty of calling a virtual function. In contrast, modern C++ compilers minimize the abstraction penalty of STL inlining many functions.

STL containers include: \c std::vector (an unbounded array), \c std::priority queue, \c std::list, \c std::stack, \c std::deque, \c std::set, \c std::multiset (allows duplicate elements), \c std::map (allows mapping from one data item (a key) to another (a value)), \c std::multimap (allows duplicate keys), etc. Containers based on hashing (\c hash_set, \c hash_multiset, \c hash_map and \c hash_multimap) are not yet standardized and distributed as an STL extension.

Iterators are an important part of the STL library. An iterator is a kind of handle used to access items stored in data structures. Iterators offer the following operations: read/write the value pointed by the iterator, move to the next/previous element in the container, move forward/backward (random access) by some number of elements.

STL provides a large number of algorithms that perform scanning, searching, and sorting. The implementations accept iterators that possess a certain set of operations described above. Thus, the STL algorithms will work on any container with iterators following the requirements. To achieve flexibility, STL algorithms are parameterized with objects, overloading the function operator (<tt>operator()</tt>). Such objects are called \a functors. A functor can, for instance, define the sorting order for the STL sorting algorithm or keep the state information in functions passed to other functions. Since the type of the functor is a template parameter of an STL algorithm, the function operator does not need to be virtual and can easily be inlined by the compiler, thus avoiding the function call costs.

The STL library is well accepted and its generic approach and principles are followed in other famous C++ libraries like Boost \cite karlsson2005beyond and CGAL \cite fabri1998design.

\section introduction_goals The Goals of STXXL

Several external memory software library projects (LEDA-SM \cite CraMeh99 and TPIE \cite tpie_manual) were started to reduce the gap between theory and practice in external memory computing. They offer frameworks that aim to speed up the process of implementing I/O-efficient algorithms, abstracting away the details of how I/O is performed. Those projects are excellent proofs of EM paradigm, but have some drawbacks which \b impede their practical use.

Therefore we started to develop STXXL library, which tries to avoid those obstacles. The objectives of STXXL project (distinguishing it from other libraries):

- Offer \b transparent support of parallel disks. This feature although announced has not been implemented in any library.

- Implement \b parallel disk algorithms. LEDA-SM and TPIE libraries offer only implementations of single disk EM algorithms.

- Make the library able to handle problems of <b>real world size</b> (up to dozens of terabytes).

- Improved utilization of computer resources. STXXL explicitly supports \b overlapping between I/O and computation. STXXL implementations of external memory algorithms and data structures benefit from the overlapping of I/O and computation.

- STXXL achieves small constant factors in I/O volume. In particular, \b "pipelining" can save more than \b half the number of I/Os performed by many algorithms.

- Care about the <b>internal work</b>, improve the in-memory algorithms. Having many disks can hide the latency and increase the I/O bandwidth, s.t. internal work becomes a bottleneck.

- Care about operating system overheads. Use <b>unbuffered disk access</b> to avoid superfluous copying of data.

- Short development times due to well-known STL-compatible interfaces for external memory algorithms and data structures. STL algorithms can be directly applied to STXXL containers (code reuse); moreover, the I/O complexity of the algorithms remains optimal in most cases.

*/
153 changes: 153 additions & 0 deletions third-party/MQF/ThirdParty/stxxl/doc/mainpage.dox
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
// -*- mode: c++; mode: visual-line; mode: flyspell; fill-column: 100000 -*-
/***************************************************************************
* doc/mainpage.dox
*
* Main page of STXXL doxygen tree. All doc pages should be linked here.
*
* Part of the STXXL. See http://stxxl.sourceforge.net
*
* Copyright (C) 2013 Timo Bingmann <tb@panthema.net>
*
* Distributed under the Boost Software License, Version 1.0.
* (See accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
**************************************************************************/

/** \mainpage Welcome to STXXL
The core of STXXL is an implementation of the C++ standard template library STL for <b>external memory</b> (out-of-core) computations, i.e., STXXL implements containers and algorithms that can process huge volumes of data that only fit on disks. While the compatibility to the STL supports ease of use and compatibility with existing applications, another design priority is high performance. Here is a selection of STXXL performance features:
- transparent support of multiple disks
- variable block lengths
- overlapping of I/O and computation
- prevention of OS file buffering overhead
- algorithm pipelining
- utilization of multiple processor cores for internal computation
See the \subpage introduction "introduction to external memory" for a longer description and our vision.
# Getting Started: Building and Tutorial
This section will help you if you are using the STXXL for the first time.
First you must compile the library. Pick one of the following \subpage install "build instructions", depending on your host system:
- \ref install_unix
- \ref install_windows
Once compiled, you can read the following simple tutorials on how to use STXXL containers and algorithms:
- \ref tutorial_vector
- \ref tutorial_stack
- \ref tutorial_pqueue
See the corresponding page for a \subpage tutorial "complete list of all Tutorials and Examples".
# Design and More Information
We have collected much documentation about the design of STXXL. Even more information is available as academic research papers, technical reports and theses.
- \subpage design "Design of STXXL concepts, containers and algorithms"
If you plan to contribute code to STXXL, please read the \subpage coding_style and use the \subpage common.
# FAQ, Troubleshooting, Bugs and More
- \subpage faq
- Questions concerning use and development of the STXXL library should be posted to the <a href="http://sourceforge.net/projects/stxxl/forums"><b>FORUMS</b></a>. Please search the forum before posting, your question may have been answered before.
- See \ref faq_compilers when compilation fails.
- Bugs and pull requests can be reported via <a href="http://github.com/stxxl/stxxl"><b>Github</b></a>: http://github.com/stxxl/stxxl.
- Check the \ref changelog for recent changes when switching version.
- The STXXL source also contains \subpage stxxl_tool "stxxl_tool", a collection of simple tools and benchmarks.
# License and Authors
\c STXXL is distributed under the Boost Software License, Version 1.0. \n
You can find a copy of the license in the accompanying file \ref license or at <a href="http://www.boost.org/LICENSE_1_0.txt">http://www.boost.org/LICENSE_1_0.txt</a>. \n
Many people have contributed to STXXL, see all \ref authors.
\subpage textfiles "&nbsp;"
*/

/** \page textfiles Additional Text Files
- \subpage readme
- \subpage changelog
- \subpage authors
- \subpage textfiles_install
- \subpage license
- \subpage textfiles_todo
\page readme README
\verbinclude README
\page changelog ChangeLog
\verbinclude CHANGELOG
\page authors AUTHORS
The following list of authors have contributed to STXXL:
\verbinclude AUTHORS
\page textfiles_install INSTALL
\verbinclude INSTALL
\page license LICENSE_1_0.txt
\verbinclude LICENSE_1_0.txt
\page textfiles_todo TODO
\verbinclude TODO
*/

// Module Groups are defined here to fix their order:

/*! \defgroup stllayer STL-User Layer
Layer which groups STL compatible algorithms and containers
*/

/*! \defgroup streampack Stream Package
Package that enables pipelining of consequent sorts and scans of the external data avoiding the saving the intermediate results on the disk, e.g. the output of a sort can be directly fed into a scan procedure without the need to save it on a disk. All components of the package are contained in the \c stxxl::stream namespace.
STREAM ALGORITHM CONCEPT (Do not confuse with C++ input/output streams)
\verbatim
struct stream_algorithm // stream, pipe, whatever
{
typedef some_type value_type;
const value_type & operator * () const; // return current element of the stream
stream_algorithm & operator ++ (); // go to next element. precondition: empty() == false
bool empty() const; // return true if end of stream is reached
};
\endverbatim
*/

/*! \defgroup mnglayer Block Management Layer
Group of classes which help controlling external memory space, managing disks, and allocating and deallocating blocks of external storage.
*/

/*! \defgroup iolayer I/O Primitives Layer
Group of classes which enable abstraction from operating system calls and support system-independent interfaces for asynchronous I/O.
*/

/*! \defgroup support Common Utilities and Support Classes
Supporting classes also useful for applications, see also \ref common .
*/
Loading