Release v0.2.0 (#22)

PyDataBlog · web-flow · commit a76423ac6865 · 2022-02-23T16:45:16.000+01:00
* Release v0.2.0 with optimizations
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "SimString"
 uuid = "2e3c4037-312d-4650-b9c0-fcd0fc09aae4"
 authors = ["Bernard Brenyah"]
-version = "0.1.0"
+version = "0.2.0"
 
 [deps]
 CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749"
diff --git a/README.md b/README.md
@@ -15,9 +15,9 @@ This package is be particulary useful for natural language processing tasks whic
 - [X] Fast algorithm for string matching
 - [X] 100% exact retrieval
 - [X] Support for unicodes
+- [X] Support for building databases directly from text files
 - [ ] Custom user defined feature generation methods
 - [ ] Mecab-based tokenizer support
-- [X] Support for building databases directly from text files
 - [ ] Support for persistent databases
 
 ## Suported String Similarity Measures
@@ -41,7 +41,7 @@ pkg> add SimString
 The few (and selected) brave ones can simply grab the current experimental features by simply adding the master branch to your development environment after invoking the package manager with `]`:
 
 ```julia
-pkg> add SimString#master
+pkg> add SimString#main
 ```
 
 You are good to go with bleeding edge features and breakages!
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -7,16 +7,18 @@ CurrentModule = SimString
 Documentation for [SimString](https://github.com/PyDataBlog/SimString.jl).
 
 A native Julia implementation of the CPMerge algorithm, which is designed for approximate string matching.
-This package is be particulary useful for natural language processing tasks which demand the retrieval of strings/texts from a very large corpora (big amounts of texts). Currently, this package supports both Character and Word based N-grams feature generations and there are plans to open the package up for custom user defined feature generation methods.
+This package is particularly useful for natural language processing tasks which require the retrieval of strings/texts from very large corpora (big amounts of text). Currently, this package supports both Character and Word based N-gram feature generation, and there are plans to open the package up for custom user-defined feature generation methods.
+
+CPMerge Paper: [https://aclanthology.org/C10-1096/](https://aclanthology.org/C10-1096/)
 
 ## Features
 
 - [X] Fast algorithm for string matching
 - [X] 100% exact retrieval
 - [X] Support for unicodes
+- [X] Support for building databases directly from text files
 - [ ] Custom user defined feature generation methods
 - [ ] Mecab-based tokenizer support
-- [X] Support for building databases directly from text files
 - [ ] Support for persistent databases
 
 ## Suported String Similarity Measures
@@ -82,6 +84,7 @@ desc = describe_collection(db)
 ## Release History
 
 - 0.1.0 Initial release.
+- 0.2.0 Added support for Unicode
 
 ```@index
 ```
diff --git a/src/dictdb.jl b/src/dictdb.jl
@@ -129,7 +129,7 @@ end
 Internal function to lookup feature sets by size and feature
 """
 function lookup_feature_set_by_size_feature(db::DictDB, size, feature)
-    if feature ∉ keys(db.lookup_cache[size])
+    if !haskey(db.lookup_cache[size], feature)
         db.lookup_cache[size][feature] = get(db.string_feature_map[size], feature, Set{String}())
     end
     return db.lookup_cache[size][feature]
diff --git a/src/features.jl b/src/features.jl
@@ -10,7 +10,6 @@ end
 Internal function to pad AbstractVector types with specified padder
 """
 function pad_string(x::AbstractVector, padder::AbstractString)
-    # TODO: Insert a padder as the first and last element of x with undef
     insert!(x, 1, padder)
     push!(x, padder)
     return x
@@ -96,7 +95,6 @@ end
 Internal function to count and pad generated character-level ngrams (including duplicates)
 """
 function cummulative_ngram_count(x)
-    # TODO: Use length of x initiate non allocated ngrams
     counter = Dict{eltype(x), Int}()
 
     return map(x) do val
diff --git a/src/search.jl b/src/search.jl
@@ -102,11 +102,6 @@ function search!(measure::AbstractSimilarityMeasure, db_collection::DictDB, quer
     # Generate features from query string
     features = extract_features(db_collection.feature_extractor, query)
 
-    # Metadata from the generated features (length, min & max sizes)
-    # length_of_features = length(features)
-    # min_feature_size = minimum_feature_size(measure, length_of_features, α)
-    # max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)
-
     results = String[]
 
     # Generate and return results from the potential candidate size pool