-
Notifications
You must be signed in to change notification settings - Fork 40
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[feature](analysis) add new chinese tokenizer IK
Migrate analysis-ik from Java to C++, implement basic tokenization functionality, and integrate it into CLucene.
- Loading branch information
Hj Wu
committed
Jan 20, 2025
1 parent
a1b2a2a
commit f538355
Showing
62 changed files
with
727,906 additions
and
56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
#include "IKTokenizer.h" | ||
|
||
#include "CLucene/_ApiHeader.h" | ||
#include "CLucene/analysis/ik/core/IKSegmenter.h" | ||
#include "CLucene/util/CLStreams.h" | ||
|
||
CL_NS_DEF2(analysis, ik) | ||
CL_NS_USE(analysis) | ||
CL_NS_USE(util) | ||
|
||
// Constructs a tokenizer using the settings already carried by `config`.
// Segmentation runs eagerly in reset(); next() replays the buffered terms.
// The reader is NOT owned (ownReader = false) and no case folding is done.
IKTokenizer::IKTokenizer(Reader* reader, std::shared_ptr<Configuration> config)
    : Tokenizer(reader), config_(std::move(config)) { // move: param is already our copy, avoid an extra atomic refcount bump
    reset(reader);
    Tokenizer::lowercase = false;
    Tokenizer::ownReader = false;
}
|
||
// Constructs a tokenizer, overriding `config`'s smart-mode and lowercase
// flags before segmenting, and taking ownership of `reader` iff `ownReader`.
// Note: config_ must be updated BEFORE reset(), since reset() hands config_
// to the segmenter.
IKTokenizer::IKTokenizer(Reader* reader, std::shared_ptr<Configuration> config, bool isSmart,
                         bool lowercase, bool ownReader)
    : Tokenizer(reader), config_(std::move(config)) { // move: avoid an extra atomic refcount bump
    config_->setUseSmart(isSmart);
    config_->setEnableLowercase(lowercase);
    reset(reader);
    Tokenizer::lowercase = lowercase;
    Tokenizer::ownReader = ownReader;
}
|
||
// Emits the next buffered token into `token`, or returns nullptr when all
// tokens produced by the last reset() have been consumed.
// The token points into tokens_text_ storage (setNoCopy), which stays valid
// until the next reset().
Token* IKTokenizer::next(Token* token) {
    if (buffer_index_ >= data_length_) {
        return nullptr;
    }

    std::string& token_text = tokens_text_[buffer_index_++];
    // NOTE(review): truncation is byte-based and may split a multi-byte UTF-8
    // sequence at LUCENE_MAX_WORD_LEN — confirm downstream tolerates that.
    size_t size = std::min(token_text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN));
    if (Tokenizer::lowercase) {
        // Only tokens that START with an ASCII byte are folded (original guard
        // kept). Fold strictly A-Z per byte: the previous per-byte to_lower()
        // over the whole string could touch bytes >= 0x80 in mixed-script
        // tokens, which is undefined for tolower-style calls on plain char and
        // would corrupt UTF-8 continuation bytes.
        if (!token_text.empty() && static_cast<uint8_t>(token_text[0]) < 0x80) {
            std::transform(token_text.begin(), token_text.end(), token_text.begin(),
                           [](unsigned char c) {
                               return (c >= 'A' && c <= 'Z')
                                              ? static_cast<char>(c - 'A' + 'a')
                                              : static_cast<char>(c);
                           });
        }
    }
    token->setNoCopy(token_text.data(), 0, size);
    return token;
}
|
||
// Re-runs segmentation over `reader`: clears any previously buffered tokens,
// points the shared segmenter at the new input, and drains it eagerly so that
// next() can replay the terms without touching the reader again.
void IKTokenizer::reset(Reader* reader) {
    input = reader;
    buffer_index_ = 0;
    data_length_ = 0;
    tokens_text_.clear();

    // Pre-size the scratch buffer to the input length.
    buffer_.reserve(input->size());

    auto& segmenter = IKSegmentSingleton::getInstance();
    segmenter.setContext(reader, config_);

    // Drain the segmenter, collecting every produced term.
    for (Lexeme lexeme; segmenter.next(lexeme);) {
        tokens_text_.emplace_back(std::move(lexeme.getText()));
    }

    data_length_ = tokens_text_.size();
}
|
||
CL_NS_END2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#ifndef CLUCENE_IKTOKENIZER_H | ||
#define CLUCENE_IKTOKENIZER_H | ||
#include <memory> | ||
#include <string_view> | ||
|
||
#include "CLucene.h" | ||
#include "CLucene/analysis/AnalysisHeader.h" | ||
#include "CLucene/analysis/LanguageBasedAnalyzer.h" | ||
#include "CLucene/analysis/ik/cfg/Configuration.h" | ||
#include "CLucene/analysis/ik/core/IKSegmenter.h" | ||
CL_NS_DEF2(analysis, ik) | ||
CL_NS_USE(analysis) | ||
|
||
class IKSegmentSingleton{ | ||
public: | ||
static IKSegmenter& getInstance() { | ||
static IKSegmenter instance; | ||
return instance; | ||
} | ||
|
||
private: | ||
IKSegmentSingleton() = default; | ||
}; | ||
|
||
// UTF-8 tokenizer adapting the IK Chinese segmenter to CLucene's Tokenizer
// interface. Segmentation happens eagerly in reset(); next() then replays
// the buffered terms one at a time.
class IKTokenizer : public lucene::analysis::Tokenizer {
private:
    int32_t buffer_index_ {0};              // index of the next term to emit from tokens_text_
    int32_t data_length_ {0};               // number of buffered terms (tokens_text_.size())
    std::string buffer_;                    // scratch buffer reserved to the input size in reset()
    std::vector<std::string> tokens_text_;  // all terms produced by the segmenter for the current input
    std::shared_ptr<Configuration> config_; // analyzer settings (smart mode, lowercase, dictionaries)

public:

    // Tokenizes with the settings already present in `config`; reader is not owned.
    explicit IKTokenizer(lucene::util::Reader* reader, std::shared_ptr<Configuration> config);
    // Overrides `config`'s smart/lowercase flags and controls reader ownership.
    explicit IKTokenizer(lucene::util::Reader* reader, std::shared_ptr<Configuration> config,
                         bool is_smart, bool use_lowercase, bool own_reader = false);
    ~IKTokenizer() override = default;

    // Returns the next buffered term, or nullptr when exhausted.
    lucene::analysis::Token* next(lucene::analysis::Token* token) override;
    // Re-segments `reader`, refilling the term buffer and rewinding the cursor.
    void reset(lucene::util::Reader* reader) override;
};
|
||
CL_NS_END2 | ||
#endif //CLUCENE_IKTOKENIZER_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
#ifndef CLUCENE_CONFIGURATION_H | ||
#define CLUCENE_CONFIGURATION_H | ||
|
||
#include <string>
#include <vector>
|
||
CL_NS_DEF2(analysis, ik) | ||
|
||
// Runtime configuration for the IK analyzer: segmentation mode, case
// folding, the dictionary root path, and the dictionary file lists.
// Setters return *this so calls can be chained fluently.
class Configuration {
private:
    bool use_smart_ {true};        // true = smart (coarse-grained) segmentation, false = max-word
    bool enable_lowercase_ {true}; // fold ASCII letters to lowercase during tokenization
    std::string dict_path_;        // root directory holding the .dic files

    // Core dictionary file names, resolved relative to dict_path_.
    struct DictFiles {
        std::string main {"main.dic"};
        std::string quantifier {"quantifier.dic"};
        std::string stopwords {"stopword.dic"};
    } dict_files_;

    // Extra dictionaries: seeded in-class with the stock IK extras (avoids
    // the previous default-construct-then-assign in the constructor) and
    // extendable via addExtDictFile()/addExtStopWordDictFile().
    std::vector<std::string> ext_dict_files_ {
            "extra_main.dic", "extra_single_word.dic", "extra_single_word_full.dic",
            "extra_single_word_low_freq.dic"};
    std::vector<std::string> ext_stop_word_dict_files_ {"extra_stopword.dic"};

public:
    // All defaults come from the in-class member initializers above.
    Configuration() = default;

    bool isUseSmart() const { return use_smart_; }
    Configuration& setUseSmart(bool smart) {
        use_smart_ = smart;
        return *this;
    }

    bool isEnableLowercase() const { return enable_lowercase_; }
    Configuration& setEnableLowercase(bool enable) {
        enable_lowercase_ = enable;
        return *this;
    }

    // Returns const& for consistency with the other getters (was by-value).
    const std::string& getDictPath() const { return dict_path_; }
    Configuration& setDictPath(const std::string& path) {
        dict_path_ = path;
        return *this;
    }

    void setMainDictFile(const std::string& file) { dict_files_.main = file; }
    void setQuantifierDictFile(const std::string& file) { dict_files_.quantifier = file; }
    void setStopWordDictFile(const std::string& file) { dict_files_.stopwords = file; }

    const std::string& getMainDictFile() const { return dict_files_.main; }
    const std::string& getQuantifierDictFile() const { return dict_files_.quantifier; }
    const std::string& getStopWordDictFile() const { return dict_files_.stopwords; }

    void addExtDictFile(const std::string& filePath) { ext_dict_files_.push_back(filePath); }
    void addExtStopWordDictFile(const std::string& filePath) {
        ext_stop_word_dict_files_.push_back(filePath);
    }

    const std::vector<std::string>& getExtDictFiles() const { return ext_dict_files_; }
    const std::vector<std::string>& getExtStopWordDictFiles() const {
        return ext_stop_word_dict_files_;
    }
};
|
||
CL_NS_END2 | ||
|
||
#endif //CLUCENE_CONFIGURATION_H |
Oops, something went wrong.