Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit f822944

Browse files
authored Feb 10, 2022
Initial public release (#15)
* Initial public release
1 parent 504460e commit f822944

File tree

12 files changed

+203
-23
lines changed

12 files changed

+203
-23
lines changed
 

‎Project.toml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,17 @@ version = "0.1.0"
77
CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749"
88
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
99
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
10-
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
1110

1211
[compat]
12+
CircularArrays = "1"
13+
DataStructures = "0.18"
14+
OffsetArrays = "1"
1315
julia = "1"
1416

1517
[extras]
1618
Faker = "0efc519c-db33-5916-ab87-703215c3906f"
1719
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
20+
Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"
1821

1922
[targets]
20-
test = ["Test", "Faker"]
23+
test = ["Test", "Faker", "Suppressor"]

‎README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,16 @@ This package is particularly useful for natural language processing tasks which
1717
- [X] Support for unicodes
1818
- [ ] Custom user defined feature generation methods
1919
- [ ] Mecab-based tokenizer support
20+
- [X] Support for building databases directly from text files
21+
- [ ] Support for persistent databases
2022

2123
## Supported String Similarity Measures
2224

2325
- [X] Dice coefficient
2426
- [X] Jaccard coefficient
2527
- [X] Cosine coefficient
2628
- [X] Overlap coefficient
29+
- [X] Exact match
2730

2831
## Installation
2932

‎docs/src/index.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ This package is particularly useful for natural language processing tasks which
1616
- [X] Support for unicodes
1717
- [ ] Custom user defined feature generation methods
1818
- [ ] Mecab-based tokenizer support
19-
- [ ] Support for building databases directly from text files
19+
- [X] Support for building databases directly from text files
2020
- [ ] Support for persistent databases
2121

2222
## Supported String Similarity Measures
@@ -64,6 +64,8 @@ push!(db, "fooo");
6464

6565
# Convenient approach is to use an array of strings for multiple entries: `append!(db, ["foo", "bar", "fooo"]);`
6666

67+
# OR: Build database from text files: `append!(db, "YOUR_FILE_NAME.txt");`
68+
6769
# Retrieve the closest match(es)
6870
res = search(Dice(), db, "foo"; α=0.8, ranked=true)
6971
# 2-element Vector{Tuple{String, Float64}}:
@@ -72,7 +74,7 @@ res = search(Dice(), db, "foo"; α=0.8, ranked=true)
7274

7375
# Describe a working database collection
7476
desc = describe_collection(db)
75-
# (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
77+
# (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
7678
```
7779

7880
## TODO: Benchmarks

‎extras/benchmark_sim.jl

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
using SimString
2+
using Faker
3+
using BenchmarkTools
4+
using DataStructures
5+
6+
################################# Benchmark Bulk addition #####################
7+
db = DictDB(CharacterNGrams(3, " "));
8+
Faker.seed(2020)
9+
@time fake_names = [string(Faker.first_name(), " ", Faker.last_name()) for i in 1:100_000];
10+
11+
12+
f(d, x) = append!(d, x)
13+
@time f(db, fake_names)
14+
15+
16+
17+
################################ Simple Addition ###############################
18+
19+
db = DictDB(CharacterNGrams(2, " "));
20+
push!(db, "foo");
21+
push!(db, "bar");
22+
push!(db, "fooo");
23+
24+
f(x, c, s, a, r) = search(x, c, s; α=a, ranked=r)
25+
test = "foo";
26+
col = db;
27+
sim = Cosine();
28+
a = 0.8;
29+
r = true;
30+
31+
f(Cosine(), db, "foo", 0.8, true)
32+
33+
@btime f($sim, $col, $test, $a, $r)
34+
@btime search(Cosine(), db, "foo"; α=0.8, ranked=true)
35+
36+
37+
38+
db2 = DictDB(CharacterNGrams(3, " "));
39+
append!(db2, ["foo", "bar", "fooo", "foor"]) # also works via multiple dispatch on a vector
40+
41+
results = search(Cosine(), db, "foo"; α=0.8, ranked=true) # yet to be implemented
42+
43+
bs = ["foo", "bar", "foo", "foo", "bar"]
44+
SimString.extract_features(CharacterNGrams(3, " "), "prepress")
45+
SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.")
46+
47+
db = DictDB(WordNGrams(2, " ", " "))
48+
push!(db, "You are a really really really cool dude.")

‎src/SimString.jl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ module SimString
22

33
import Base: push!, append!
44
using DataStructures: DefaultOrderedDict, DefaultDict
5-
using ProgressMeter
65
using CircularArrays
76
using OffsetArrays
87

‎src/dictdb.jl

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ Basic summary stats for the DB
8787
db = DictDB(CharacterNGrams(2, " "));
8888
append!(db, ["foo", "bar", "fooo"]);
8989
describe_collection(db)
90+
(total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
9091
9192
# Returns
9293
* NamedTuples: Summary stats for the DB
@@ -98,7 +99,7 @@ function describe_collection(db::DictDB)
9899
# Total number of strings in collection
99100
= length(db.string_collection)
100101

101-
# Average number of ngram features
102+
# Average size of ngram features
102103
n = [x for x in keys(db.string_size_map)]
103104
μ = sum(n) / length(n)
104105

@@ -108,7 +109,19 @@ for i in values(db.string_feature_map)
108109
total_ngrams += length(i)
109110
end
110111

111-
return (total_collection = ∑, avg_num_ngrams = μ, total_ngrams = total_ngrams)
112+
return (total_collection = ∑, avg_size_ngrams = μ, total_ngrams = total_ngrams)
113+
end
114+
115+
116+
"""
117+
Pretty print summary stats for the DB
118+
"""
119+
function Base.show(io::IO, x::DictDB)
120+
metrics = describe_collection(x)
121+
println(io, "DictDB($(x.feature_extractor))")
122+
println(io, "Total collection: ", metrics.total_collection)
123+
println(io, "Average number of ngram features: ", metrics.avg_size_ngrams)
124+
println(io, "Total number of ngram features: ", metrics.total_ngrams)
112125
end
113126

114127

‎src/features.jl

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,25 @@ end
9999

100100

101101
"""
102+
push!(db::AbstractSimStringDB, str::AbstractString)
103+
102104
Add a new item to a new or existing collection of strings using
103105
the custom AbstractSimStringDB type.
106+
107+
# Arguments:
108+
* `db`: AbstractSimStringDB - The collection of strings to add to
109+
* `str`: AbstractString - The string to add to the collection
110+
111+
# Example:
112+
```julia
113+
db = DictDB(CharacterNGrams(2, " "));
114+
push!(db, "foo")
115+
push!(db, "bar")
116+
push!(db, "fooo")
117+
```
118+
119+
# Returns:
120+
* `db`: AbstractSimStringDB - The collection of strings with the new string added
104121
"""
105122
function push!(db::AbstractSimStringDB, str::AbstractString)
106123
# Extract features based on the specified feature extractor
@@ -125,11 +142,54 @@ end
125142

126143

127144
"""
145+
append!(db::AbstractSimStringDB, str::Vector)
146+
128147
Add bulk items to a new or existing collection of strings using
129148
the custom AbstractSimStringDB type.
149+
150+
# Arguments:
151+
* db: AbstractSimStringDB - The database to add the strings to
152+
* str: Vector of AbstractString - Vector/Array of strings to add to the database
153+
154+
# Example:
155+
```julia
156+
db = DictDB(CharacterNGrams(2, " "));
157+
append!(db, ["foo", "bar", "fooo"]);
158+
```
159+
160+
# Returns:
161+
* db: AbstractSimStringDB - The database with the new strings added
130162
"""
131163
function append!(db::AbstractSimStringDB, str::Vector)
132164
@inbounds @simd for i in str
133165
push!(db, i)
134166
end
167+
end
168+
169+
170+
"""
171+
append!(db::AbstractSimStringDB, file::AbstractString)
172+
173+
Add bulk items to a new or existing collection of strings
174+
from a file using the custom AbstractSimStringDB type.
175+
176+
# Arguments:
177+
* `db`: AbstractSimStringDB - The database to add the items to
178+
* `file`: AbstractString - Path to the file to read from
179+
180+
# Example:
181+
```julia
182+
db = DictDB(CharacterNGrams(2, " "));
183+
append!(db, "./data/test.txt")
184+
```
185+
186+
# Returns:
187+
* `db`: AbstractSimStringDB - The database with the items added
188+
"""
189+
function append!(db::AbstractSimStringDB, file::AbstractString)
190+
open(file) do f
191+
for line in eachline(f)
192+
push!(db, line)
193+
end
194+
end
135195
end

‎src/search.jl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidat
7474
results = String[]
7575

7676
for (candidate, match_count) in candidate_match_counts
77-
for i in (query_feature_length - τ + 1) : query_feature_length # TODO: Verify
77+
for i in (query_feature_length - τ + 1) : query_feature_length
7878
if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, features[i])
7979
match_count += 1
8080
end
@@ -103,16 +103,16 @@ function search!(measure::AbstractSimilarityMeasure, db_collection::DictDB, quer
103103
features = extract_features(db_collection.feature_extractor, query)
104104

105105
# Metadata from the generated features (length, min & max sizes)
106-
length_of_features = length(features)
107-
min_feature_size = minimum_feature_size(measure, length_of_features, α)
108-
max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)
106+
# length_of_features = length(features)
107+
# min_feature_size = minimum_feature_size(measure, length_of_features, α)
108+
# max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)
109109

110110
results = String[]
111111

112112
# Generate and return results from the potential candidate size pool
113-
@inbounds for candidate_size in min_feature_size:max_feature_size
113+
@inbounds for candidate_size in minimum_feature_size(measure, length(features), α) : maximum_feature_size(measure, db_collection, length(features), α)
114114
# Minimum overlap
115-
τ = minimum_overlap(measure, length_of_features, candidate_size, α)
115+
τ = minimum_overlap(measure, length(features), candidate_size, α)
116116

117117
# Generate approximate candidates from the overlap join
118118
append!(results, overlap_join(db_collection, features, τ, candidate_size))

‎test/dummy_sents.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
You are a really really really cool dude.
2+
Sometimes you are not really really cool tho

‎test/dummy_words.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
foo
2+
bar
3+
fooo

‎test/test01_dictdb.jl

Lines changed: 44 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ using Test
1515

1616
@test collect(keys(db.string_feature_map)) == [5, 6]
1717

18-
@test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
19-
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
18+
@test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
19+
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
2020
end
2121

2222

@@ -41,10 +41,10 @@ end
4141

4242
@test collect(keys(db.string_feature_map)) == [5, 6]
4343

44-
@test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
45-
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
44+
@test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
45+
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
4646

47-
@test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String, Int64}
47+
@test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String,Int64}
4848
end
4949

5050

@@ -59,19 +59,53 @@ end
5959
@test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
6060
@test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
6161

62-
@test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String, String}, Int64}
62+
@test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String,String},Int64}
6363
end
6464

6565

6666

6767
@testset "Test describe functionality" begin
68-
db = DictDB(CharacterNGrams(2, " "));
69-
append!(db, ["foo", "bar", "fooo"]);
68+
db = DictDB(CharacterNGrams(2, " "))
69+
append!(db, ["foo", "bar", "fooo"])
7070

7171
# Interact with db
72-
search(Dice(), db, "zep"; α=0.8, ranked=true)
72+
search(Dice(), db, "zep"; α = 0.8, ranked = true)
73+
74+
@test describe_collection(db) == (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
75+
end
76+
77+
78+
@testset "Test bulk insertion from a file using CharacterNGrams" begin
79+
db = DictDB(CharacterNGrams(3, " "))
80+
append!(db, "dummy_words.txt")
81+
82+
@test db.string_collection == ["foo", "bar", "fooo"]
83+
@test db.string_size_map[5] == Set(["bar", "foo"])
84+
@test db.string_size_map[6] == Set(["fooo"])
85+
86+
@test collect(keys(db.string_feature_map)) == [5, 6]
87+
88+
@test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
89+
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
90+
91+
@test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String,Int64}
92+
end
93+
94+
95+
96+
@testset "Test bulk insertion from a file using WordNGrams" begin
97+
db = DictDB(WordNGrams(2, " ", " "))
98+
append!(db, "dummy_sents.txt")
99+
100+
@test db.string_collection == ["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]
101+
@test db.string_size_map[9] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
102+
103+
@test collect(keys(db.string_feature_map)) == [9]
104+
@test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
105+
@test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
106+
107+
@test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String,String},Int64}
73108

74-
@test describe_collection(db) == (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
75109
end
76110

77111

‎test/test04_search.jl

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ module TestMeasures
22
using SimString
33
using Test
44
using Faker
5+
using Suppressor
56

67

78
@testset "Test Dice Search" begin
@@ -54,6 +55,7 @@ end
5455

5556
end
5657

58+
5759
@testset "Test Micro Deep Dive Search" begin
5860
db = DictDB(CharacterNGrams(2, " "));
5961
append!(db, ["a", "ab", "abc", "abcd", "abcde"]);
@@ -76,6 +78,17 @@ end
7678
end
7779

7880

81+
@testset "Test output from show" begin
82+
db = DictDB(CharacterNGrams(2, " "));
83+
append!(db, ["foo", "bar", "fooo"]);
84+
85+
expected_out = "DictDB(SimString.CharacterNGrams{Int64, String}(2, \" \"))\nTotal collection: 3\nAverage number of ngram features: 4.5\nTotal number of ngram features: 13\n"
86+
r = @capture_out show(db)
87+
@test r == expected_out
88+
end
89+
90+
91+
7992

8093

8194
end # module

2 commit comments

Comments
 (2)

PyDataBlog commented on Feb 10, 2022

@PyDataBlog
OwnerAuthor

JuliaRegistrator commented on Feb 10, 2022

@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/54345

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.1.0 -m "<description of version>" f822944fcc9416389a3d34e93c44d57d11db7ef2
git push origin v0.1.0
Please sign in to comment.