Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit d375dfb

Browse files
authored Aug 9, 2023
Mecab support (#24)
* Added support for Japanese language via Wakame.jl * Added installation of MeCab for CI
1 parent a76423a commit d375dfb

File tree

10 files changed

+112
-46
lines changed

10 files changed

+112
-46
lines changed
 

‎.github/workflows/CI.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
os:
1717
- ubuntu-latest
1818
- macOS-latest
19-
- windows-latest
19+
# - windows-latest
2020
arch:
2121
- x64
2222
steps:
@@ -35,6 +35,11 @@ jobs:
3535
${{ runner.os }}-test-${{ env.cache-name }}-
3636
${{ runner.os }}-test-
3737
${{ runner.os }}-
38+
- name: Install Mecab (MacOS)
39+
if: runner.os == 'macOS'
40+
run: |
41+
brew install mecab
42+
brew install mecab-ipadic
3843
- uses: julia-actions/julia-buildpkg@v1
3944
- uses: julia-actions/julia-runtest@v1
4045
- uses: julia-actions/julia-processcoverage@v1

‎Project.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,25 @@
11
name = "SimString"
22
uuid = "2e3c4037-312d-4650-b9c0-fcd0fc09aae4"
33
authors = ["Bernard Brenyah"]
4-
version = "0.2.0"
4+
version = "0.3.0"
55

66
[deps]
77
CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749"
88
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
99
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
10+
Wakame = "4447db07-3941-47e2-90a2-965b7cb1b6ce"
1011

1112
[compat]
1213
CircularArrays = "1"
1314
DataStructures = "0.18"
1415
OffsetArrays = "1"
1516
julia = "1"
17+
Wakame = "0.1"
1618

1719
[extras]
1820
Faker = "0efc519c-db33-5916-ab87-703215c3906f"
19-
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2021
Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"
22+
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2123

2224
[targets]
2325
test = ["Test", "Faker", "Suppressor"]

‎README.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,8 @@ This package is particularly useful for natural language processing tasks whic
1616
- [X] 100% exact retrieval
1717
- [X] Support for unicodes
1818
- [X] Support for building databases directly from text files
19-
- [ ] Custom user defined feature generation methods
20-
- [ ] Mecab-based tokenizer support
21-
- [ ] Support for persistent databases
19+
- [X] Mecab-based tokenizer support
20+
- [ ] Support for persistent databases like MongoDB
2221

2322
## Supported String Similarity Measures
2423

‎docs/src/index.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,8 @@ CPMerge Paper: [https://aclanthology.org/C10-1096/](https://aclanthology.org/C10
1717
- [X] 100% exact retrieval
1818
- [X] Support for unicodes
1919
- [X] Support for building databases directly from text files
20-
- [ ] Custom user defined feature generation methods
21-
- [ ] Mecab-based tokenizer support
22-
- [ ] Support for persistent databases
20+
- [X] Mecab-based tokenizer support for Japanese
21+
- [ ] Support for persistent databases like MongoDB
2322

2423
## Supported String Similarity Measures
2524

@@ -59,7 +58,9 @@ pkg> free SimString
5958
using SimString
6059

6160
# Initialise database and some strings
62-
db = DictDB(CharacterNGrams(2, " "));
61+
db = DictDB(CharacterNGrams(2, " "));
62+
# OR: db = DictDB(WordNGrams(2, " ")); for word based ngrams
63+
# OR db = DictDB(MecabNGrams(2, " ", Mecab())) for Japanese ngrams. Requires installation of Mecab
6364
push!(db, "foo");
6465
push!(db, "bar");
6566
push!(db, "fooo");
@@ -85,6 +86,7 @@ desc = describe_collection(db)
8586

8687
- 0.1.0 Initial release.
8788
- 0.2.0 Added support for unicodes
89+
- 0.3.0 Added Japanese support via Mecab
8890

8991
```@index
9092
```

‎src/SimString.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import Base: push!, append!
44
using DataStructures: DefaultOrderedDict, DefaultDict
55
using CircularArrays
66
using OffsetArrays
7+
using Wakame
78

89
######### Import modules & utils ################
910
include("db_collection.jl")
@@ -17,7 +18,7 @@ include("search.jl")
1718
####### Global export of user API #######
1819
export Dice, Jaccard, Cosine, Overlap, ExactMatch,
1920
AbstractSimStringDB, DictDB, describe_collection,
20-
CharacterNGrams, WordNGrams,
21+
CharacterNGrams, WordNGrams, MecabNGrams,
2122
search
2223

2324

‎src/db_collection.jl

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@ Abstract type for feature extraction structs
1212
abstract type FeatureExtractor end
1313

1414

15-
# Feature Extraction Definitions
16-
1715
"""
1816
Feature extraction on character-level ngrams
1917
"""
@@ -33,3 +31,12 @@ struct WordNGrams{T1<:Int, T2<:AbstractString} <: FeatureExtractor
3331
end
3432

3533

34+
35+
"""
36+
Feature extraction based on mecab word-level ngrams
37+
"""
38+
struct MecabNGrams{T1<:Int, T2<:AbstractString, T3<:Mecab} <: FeatureExtractor
39+
n::T1 # number of n-grams to extract
40+
padder::T2 # string to use to pad n-grams
41+
tokenizer::T3 # Mecab tokenizer to use
42+
end

‎src/dictdb.jl

Lines changed: 46 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,19 @@ function DictDB(x::CharacterNGrams)
4444
end
4545

4646

47+
"""
48+
Internal function for generating a base DictDB object for WordNGrams and MecabNGrams
49+
"""
50+
function generate_base_dict_db(x)
51+
DictDB(
52+
x,
53+
String[],
54+
DefaultDict{Int, Set{String}}( () -> Set{String}() ),
55+
DefaultDict{ Int, DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String} }(Set{String})),
56+
DefaultDict{ Int, DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}}(Set{String}))
57+
)
58+
end
59+
4760
"""
4861
DictDB(x::WordNGrams)
4962
@@ -60,15 +73,28 @@ db = DictDB(WordNGrams(2, " ", " "))
6073
# Returns
6174
* `DictDB`: A DictDB object with additional containers and Metadata for WordNGrams
6275
"""
63-
function DictDB(x::WordNGrams)
64-
DictDB(
65-
x,
66-
String[],
67-
DefaultDict{Int, Set{String}}( () -> Set{String}() ),
68-
DefaultDict{ Int, DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String} }(Set{String})),
69-
DefaultDict{ Int, DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}}(Set{String}))
70-
)
71-
end
76+
DictDB(x::WordNGrams) = generate_base_dict_db(x)
77+
78+
79+
80+
"""
81+
DictDB(x::MecabNGrams)
82+
83+
Initialize a dict DB with additional containers and Metadata for MecabNGrams
84+
85+
# Arguments
86+
* `x`: MecabNGrams object
87+
88+
# Example
89+
```julia
90+
db = DictDB(MecabNGrams(2, " ", Mecab()))
91+
```
92+
93+
# Returns
94+
* `DictDB`: A DictDB object with additional containers and Metadata for MecabNGrams
95+
"""
96+
DictDB(x::MecabNGrams) = generate_base_dict_db(x)
97+
7298

7399

74100

@@ -96,20 +122,20 @@ describe_collection(db)
96122
"""
97123
function describe_collection(db::DictDB)
98124

99-
# Total number of strings in collection
100-
= length(db.string_collection)
125+
# Total number of strings in collection
126+
= length(db.string_collection)
101127

102-
# Average size of ngram features
103-
n = [x for x in keys(db.string_size_map)]
104-
μ = sum(n) / length(n)
128+
# Average size of ngram features
129+
n = [x for x in keys(db.string_size_map)]
130+
μ = sum(n) / length(n)
105131

106-
# Total number of ngram features
107-
total_ngrams = 0
108-
for i in values(db.string_feature_map)
109-
total_ngrams += length(i)
110-
end
132+
# Total number of ngram features
133+
total_ngrams = 0
134+
for i in values(db.string_feature_map)
135+
total_ngrams += length(i)
136+
end
111137

112-
return (total_collection = ∑, avg_size_ngrams = μ, total_ngrams = total_ngrams)
138+
return (total_collection = ∑, avg_size_ngrams = μ, total_ngrams = total_ngrams)
113139
end
114140

115141

‎src/features.jl

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -36,29 +36,20 @@ function init_ngrams(extractor::CharacterNGrams, x, n)
3636
end
3737

3838

39-
4039
"""
41-
Internal function to generate initial uncounted ngrams on a word level
40+
Internal function to generate initial uncounted word ngrams on a word level
4241
"""
43-
function init_ngrams(extractor::WordNGrams, x, n)
42+
function init_ngrams(extractor, x, n)
4443
map(0:length(x)-n) do i
4544
@view x[i+1: i+n]
4645
end
4746
end
4847

4948

5049
"""
51-
Internal function to create character-level ngrams features from an AbstractString
52-
"""
53-
function n_grams(extractor::CharacterNGrams, x, n)
54-
return cummulative_ngram_count(init_ngrams(extractor, x, n))
55-
end
56-
57-
58-
"""
59-
Internal function to create word-level ngrams from an AbstractVector
50+
Internal function to create counted ngrams
6051
"""
61-
function n_grams(extractor::WordNGrams, x, n)
52+
function n_grams(extractor, x, n)
6253
return cummulative_ngram_count(init_ngrams(extractor, x, n))
6354
end
6455

@@ -91,6 +82,24 @@ function extract_features(extractor::WordNGrams, str)
9182
end
9283

9384

85+
"""
86+
Internal function to generate Mecab word-level ngrams features from an AbstractString
87+
"""
88+
function extract_features(extractor::MecabNGrams, str)
89+
words_split = tokenize(extractor.tokenizer, str)
90+
padded_words = pad_string(words_split, extractor.padder)
91+
return make_zero_index_circular_array(n_grams(extractor, padded_words, extractor.n))
92+
end
93+
94+
95+
"""
96+
Internal function to tokenize a string using Mecab
97+
"""
98+
function tokenize(tokenizer::Mecab, str::AbstractString)
99+
return parse_surface(tokenizer, str)
100+
end
101+
102+
94103
"""
95104
Internal function to count and pad generated character-level ngrams (including duplicates)
96105
"""

‎test/test01_dictdb.jl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
module TestDBCollection
22
using SimString
3+
using Wakame: Mecab
34
using Test
45

56

@@ -110,5 +111,14 @@ end
110111

111112

112113

114+
@testset "Test mecab insert" begin
115+
db = DictDB(MecabNGrams(2, " ", Mecab()))
116+
append!(db, ["pythonが大好きです", "I am a cat."])
117+
118+
@test db.string_collection == ["pythonが大好きです", "I am a cat."]
119+
@test db.string_size_map[5] == Set(["pythonが大好きです"])
120+
@test db.string_size_map[6] == Set(["I am a cat."])
121+
end
122+
113123

114124
end # module

‎test/test02_features.jl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
module TestFeatures
22
using SimString
3+
using Wakame: Mecab
34
using Test
45

56

@@ -13,6 +14,10 @@ using Test
1314
word_ngram_res = SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude 😄🍕")
1415
@test word_ngram_res[5] == (["really", "really"], 2)
1516
@test word_ngram_res[8] == (["dude", "😄🍕"], 1)
17+
18+
mecab_ngram_res = SimString.extract_features(MecabNGrams(2, " ", Mecab()), "pythonが大好きです")
19+
@test mecab_ngram_res[1] == (["python", ""], 1)
20+
@test mecab_ngram_res[2] == (["", "大好き"], 1)
1621
end
1722

1823

2 commit comments

Comments (2)

PyDataBlog commented on Aug 9, 2023

@PyDataBlog
Owner, Author

JuliaRegistrator commented on Aug 9, 2023

@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/89326

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.3.0 -m "<description of version>" d375dfbe023f8b005f5673601d409d3fc8969900
git push origin v0.3.0
Please sign in to comment.