[feature](analysis) add new Chinese tokenizer IK
Migrate analysis-ik from Java to C++, implement basic tokenization functionality, and integrate it into CLucene.
Hj Wu committed Jan 20, 2025
1 parent a1b2a2a commit f538355
Showing 62 changed files with 727,906 additions and 56 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -192,6 +192,8 @@ INCLUDE_DIRECTORIES( ${_CL_BOOST_INCLUDE_PATH} )

#include the projects
ADD_SUBDIRECTORY (src/ext)
set(PARALLEL_HASHMAP_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/ext/parallel_hashmap")
include_directories(${PARALLEL_HASHMAP_INCLUDE_DIR})
include(cmake/FindRoaring.cmake)
find_package(Roaring REQUIRED)

41 changes: 36 additions & 5 deletions src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
@@ -10,6 +10,8 @@
#include "CLucene/analysis/Analyzers.h"
#include "CLucene/analysis/cjk/CJKAnalyzer.h"
#include "CLucene/analysis/jieba/ChineseTokenizer.h"
#include "CLucene/analysis/ik/IKTokenizer.h"
#include "CLucene/analysis/ik/dic/Dictionary.h"
#include "CLucene/analysis/standard/StandardFilter.h"
#include "CLucene/analysis/standard/StandardTokenizer.h"
#include "CLucene/snowball/SnowballFilter.h"
@@ -20,6 +22,7 @@ CL_NS_USE2(analysis, cjk)
CL_NS_USE2(analysis, jieba)
CL_NS_USE2(analysis, standard)
CL_NS_USE2(analysis, snowball)
CL_NS_USE2(analysis, ik)

CL_NS_DEF(analysis)

@@ -33,6 +36,8 @@ LanguageBasedAnalyzer::LanguageBasedAnalyzer(const TCHAR *language, bool stem, A
this->stem = stem;
this->mode = mode;
Analyzer::_lowercase = false;
ikConfig = std::make_shared<CL_NS2(analysis,ik)::Configuration>();
ikConfig->setUseSmart(mode == AnalyzerMode::IK_Smart);
}

LanguageBasedAnalyzer::~LanguageBasedAnalyzer() {
@@ -77,6 +82,12 @@ void LanguageBasedAnalyzer::initDict(const std::string &dictPath) {
}

CL_NS2(analysis, jieba)::ChineseTokenizer::init(&chineseDict);
} else if (_tcscmp(lang, _T("ik")) == 0) {
if (!ikConfig) {
ikConfig = std::make_shared<CL_NS2(analysis,ik)::Configuration>();
}
ikConfig->setDictPath(dictPath);
Dictionary::initial(*ikConfig);
}
}

@@ -90,9 +101,11 @@ TokenStream *LanguageBasedAnalyzer::reusableTokenStream(const TCHAR * /*fieldNam
} else if (_tcscmp(lang, _T("chinese")) == 0) {
streams->tokenStream = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode, Analyzer::_lowercase);
streams->filteredTokenStream = streams->tokenStream;
} else if (_tcscmp(lang, _T("ik")) == 0) {
streams->tokenStream = _CLNEW CL_NS2(analysis, ik)::IKTokenizer(reader, ikConfig, mode == AnalyzerMode::IK_Smart, Analyzer::_lowercase);
streams->filteredTokenStream = streams->tokenStream;
} else {
CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader();

if (bufferedReader == nullptr) {
streams->tokenStream = _CLNEW StandardTokenizer(
_CLNEW CL_NS(util)::FilteredBufferedReader(reader, false), true);
@@ -116,13 +129,21 @@ TokenStream *LanguageBasedAnalyzer::reusableTokenStream(const TCHAR * /*fieldNam
return streams->filteredTokenStream;
}

TokenStream* LanguageBasedAnalyzer::tokenStream(const TCHAR* fieldName, Reader* reader) {
TokenStream* ret = nullptr;
if (_tcscmp(lang, _T("cjk")) == 0) {
ret = _CLNEW CL_NS2(analysis, cjk)::CJKTokenizer(reader);
} else if (_tcscmp(lang, _T("chinese")) == 0) {
ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(
reader, mode, Analyzer::_lowercase, Analyzer::_ownReader);
} else if (_tcscmp(lang, _T("ik")) == 0) {
if (ikConfig) {
ret = _CLNEW CL_NS2(analysis, ik)::IKTokenizer(
reader, ikConfig, mode != AnalyzerMode::IK_Max_Word, Analyzer::_lowercase, Analyzer::_ownReader);
} else {
_CLTHROWA(CL_ERR_NullPointer, "no ikConfig for ik tokenizer");
}
} else {
CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader();

if (bufferedReader == nullptr) {
@@ -143,4 +164,14 @@ TokenStream *LanguageBasedAnalyzer::tokenStream(const TCHAR *fieldName, Reader *
return ret;
}

void LanguageBasedAnalyzer::setIKConfiguration(const CL_NS2(analysis,ik)::Configuration& cfg) {
if (!ikConfig) {
ikConfig = std::make_shared<CL_NS2(analysis,ik)::Configuration>(cfg);
} else {
*ikConfig = cfg;
}
}

CL_NS_END
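
A minimal usage sketch for the new "ik" language path (assumptions: a build containing this commit, a lucene::util::Reader over UTF-8 input, and a dictionary directory, shown here as a placeholder path):

#include "CLucene.h"
#include "CLucene/analysis/LanguageBasedAnalyzer.h"

using namespace lucene::analysis;

void tokenizeWithIk(lucene::util::Reader* reader) {
    // IK_Smart selects coarse-grained segmentation; IK_Max_Word selects the
    // fine-grained variant (see tokenStream() above).
    LanguageBasedAnalyzer analyzer(_T("ik"), /*stem=*/false, AnalyzerMode::IK_Smart);
    // Loads main.dic, quantifier.dic, and stopword.dic from this directory.
    analyzer.initDict("/path/to/ik/dict"); // placeholder path
    TokenStream* ts = analyzer.tokenStream(_T("content"), reader);
    Token t;
    while (ts->next(&t) != nullptr) {
        // consume each segmented term
    }
    _CLDELETE(ts);
}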
8 changes: 7 additions & 1 deletion src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h
@@ -7,14 +7,18 @@
#ifndef _lucene_analysis_languagebasedanalyzer_
#define _lucene_analysis_languagebasedanalyzer_

#include <memory>
#include "CLucene/analysis/AnalysisHeader.h"
#include "CLucene/analysis/ik/cfg/Configuration.h"

CL_NS_DEF(analysis)

enum class AnalyzerMode {
Default,
All,
Search,
IK_Smart,
IK_Max_Word
};

class CLUCENE_CONTRIBS_EXPORT LanguageBasedAnalyzer : public CL_NS(analysis)::Analyzer {
@@ -37,6 +41,7 @@ class CLUCENE_CONTRIBS_EXPORT LanguageBasedAnalyzer : public CL_NS(analysis)::An
TCHAR lang[100]{};
bool stem;
AnalyzerMode mode{};
std::shared_ptr<CL_NS2(analysis,ik)::Configuration> ikConfig;

public:
explicit LanguageBasedAnalyzer(const TCHAR *language = nullptr, bool stem = true, AnalyzerMode mode = AnalyzerMode::All);
@@ -47,6 +52,7 @@ class CLUCENE_CONTRIBS_EXPORT LanguageBasedAnalyzer : public CL_NS(analysis)::An
void setLanguage(const TCHAR *language);
void setStem(bool s);
void setMode(AnalyzerMode m);
void setIKConfiguration(const CL_NS2(analysis,ik)::Configuration& cfg);
void initDict(const std::string &dictPath);
TokenStream *tokenStream(const TCHAR *fieldName, CL_NS(util)::Reader *reader) override;
TokenStream *reusableTokenStream(const TCHAR * /*fieldName*/, CL_NS(util)::Reader *reader) override;
64 changes: 64 additions & 0 deletions src/contribs-lib/CLucene/analysis/ik/IKTokenizer.cpp
@@ -0,0 +1,64 @@
#include "IKTokenizer.h"

#include "CLucene/_ApiHeader.h"
#include "CLucene/analysis/ik/core/IKSegmenter.h"
#include "CLucene/util/CLStreams.h"

CL_NS_DEF2(analysis, ik)
CL_NS_USE(analysis)
CL_NS_USE(util)

IKTokenizer::IKTokenizer(Reader* reader, std::shared_ptr<Configuration> config)
: Tokenizer(reader), config_(config) {
reset(reader);
Tokenizer::lowercase = false;
Tokenizer::ownReader = false;
}

IKTokenizer::IKTokenizer(Reader* reader, std::shared_ptr<Configuration> config, bool isSmart,
bool lowercase, bool ownReader)
: Tokenizer(reader), config_(config) {
config_->setUseSmart(isSmart);
config_->setEnableLowercase(lowercase);
reset(reader);
Tokenizer::lowercase = lowercase;
Tokenizer::ownReader = ownReader;
}

Token* IKTokenizer::next(Token* token) {
if (buffer_index_ >= data_length_) {
return nullptr;
}

std::string& token_text = tokens_text_[buffer_index_++];
size_t size = std::min(token_text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN));
if (Tokenizer::lowercase) {
if (!token_text.empty() && static_cast<uint8_t>(token_text[0]) < 0x80) {
std::transform(token_text.begin(), token_text.end(),
token_text.begin(),
[](char c) { return to_lower(c); });
}
}
token->setNoCopy(token_text.data(), 0, size);
return token;
}

void IKTokenizer::reset(Reader* reader) {
this->input = reader;
this->buffer_index_ = 0;
this->data_length_ = 0;
this->tokens_text_.clear();

buffer_.reserve(input->size());

IKSegmentSingleton::getInstance().setContext(reader, config_);

Lexeme lexeme;
while (IKSegmentSingleton::getInstance().next(lexeme)) {
tokens_text_.emplace_back(std::move(lexeme.getText()));
}

data_length_ = tokens_text_.size();
}

CL_NS_END2
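
Note that reset() segments the entire input eagerly through the shared IKSegmentSingleton and buffers the lexeme texts; next() then replays them one at a time. A direct-use sketch (assumptions: Dictionary::initial() has already run with a valid Configuration, and reader wraps UTF-8 text):

#include <memory>

#include "CLucene/analysis/ik/IKTokenizer.h"

using namespace lucene::analysis;

void runIkTokenizer(lucene::util::Reader* reader) {
    auto cfg = std::make_shared<ik::Configuration>();
    ik::IKTokenizer tokenizer(reader, cfg, /*isSmart=*/true,
                              /*lowercase=*/false, /*ownReader=*/false);
    Token t;
    while (tokenizer.next(&t) != nullptr) {
        // each call yields the next buffered lexeme
    }
}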
45 changes: 45 additions & 0 deletions src/contribs-lib/CLucene/analysis/ik/IKTokenizer.h
@@ -0,0 +1,45 @@
#ifndef CLUCENE_IKTOKENIZER_H
#define CLUCENE_IKTOKENIZER_H
#include <memory>
#include <string>
#include <string_view>
#include <vector>

#include "CLucene.h"
#include "CLucene/analysis/AnalysisHeader.h"
#include "CLucene/analysis/LanguageBasedAnalyzer.h"
#include "CLucene/analysis/ik/cfg/Configuration.h"
#include "CLucene/analysis/ik/core/IKSegmenter.h"
CL_NS_DEF2(analysis, ik)
CL_NS_USE(analysis)

class IKSegmentSingleton {
public:
static IKSegmenter& getInstance() {
static IKSegmenter instance;
return instance;
}

private:
IKSegmentSingleton() = default;
};

class IKTokenizer : public lucene::analysis::Tokenizer {
private:
int32_t buffer_index_ {0};
int32_t data_length_ {0};
std::string buffer_;
std::vector<std::string> tokens_text_;
std::shared_ptr<Configuration> config_;

public:

explicit IKTokenizer(lucene::util::Reader* reader, std::shared_ptr<Configuration> config);
explicit IKTokenizer(lucene::util::Reader* reader, std::shared_ptr<Configuration> config,
bool is_smart, bool use_lowercase, bool own_reader = false);
~IKTokenizer() override = default;

lucene::analysis::Token* next(lucene::analysis::Token* token) override;
void reset(lucene::util::Reader* reader) override;
};

CL_NS_END2
#endif //CLUCENE_IKTOKENIZER_H
71 changes: 71 additions & 0 deletions src/contribs-lib/CLucene/analysis/ik/cfg/Configuration.h
@@ -0,0 +1,71 @@
#ifndef CLUCENE_CONFIGURATION_H
#define CLUCENE_CONFIGURATION_H

#include <string>
#include <vector>

CL_NS_DEF2(analysis, ik)

class Configuration {
private:
bool use_smart_;
bool enable_lowercase_;
std::string dict_path_;

struct DictFiles {
std::string main {"main.dic"};
std::string quantifier {"quantifier.dic"};
std::string stopwords {"stopword.dic"};
} dict_files_;

std::vector<std::string> ext_dict_files_;
std::vector<std::string> ext_stop_word_dict_files_;

public:
Configuration()
: use_smart_(true), enable_lowercase_(true) {
ext_dict_files_ = {"extra_main.dic", "extra_single_word.dic", "extra_single_word_full.dic",
"extra_single_word_low_freq.dic"};

ext_stop_word_dict_files_ = {"extra_stopword.dic"};
}

bool isUseSmart() const { return use_smart_; }
Configuration& setUseSmart(bool smart) {
use_smart_ = smart;
return *this;
}

bool isEnableLowercase() const { return enable_lowercase_; }
Configuration& setEnableLowercase(bool enable) {
enable_lowercase_ = enable;
return *this;
}

std::string getDictPath() const { return dict_path_; }
Configuration& setDictPath(const std::string& path) {
dict_path_ = path;
return *this;
}

void setMainDictFile(const std::string& file) { dict_files_.main = file; }
void setQuantifierDictFile(const std::string& file) { dict_files_.quantifier = file; }
void setStopWordDictFile(const std::string& file) { dict_files_.stopwords = file; }

const std::string& getMainDictFile() const { return dict_files_.main; }
const std::string& getQuantifierDictFile() const { return dict_files_.quantifier; }
const std::string& getStopWordDictFile() const { return dict_files_.stopwords; }

void addExtDictFile(const std::string& filePath) { ext_dict_files_.push_back(filePath); }
void addExtStopWordDictFile(const std::string& filePath) {
ext_stop_word_dict_files_.push_back(filePath);
}

const std::vector<std::string>& getExtDictFiles() const { return ext_dict_files_; }
const std::vector<std::string>& getExtStopWordDictFiles() const {
return ext_stop_word_dict_files_;
}
};

CL_NS_END2

#endif //CLUCENE_CONFIGURATION_H
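
Because each setter returns Configuration&, the core options chain fluently. A sketch of composing a Configuration and handing it to LanguageBasedAnalyzer via setIKConfiguration() (the dictionary path and the extra .dic file names are placeholders):

#include "CLucene/analysis/LanguageBasedAnalyzer.h"
#include "CLucene/analysis/ik/cfg/Configuration.h"

using namespace lucene::analysis;

void configureIkAnalyzer(LanguageBasedAnalyzer& analyzer) {
    ik::Configuration cfg;
    cfg.setUseSmart(false)               // fine-grained (max-word) output
       .setEnableLowercase(true)
       .setDictPath("/path/to/ik/dict"); // placeholder
    cfg.addExtDictFile("extra_domain_terms.dic");        // hypothetical extra dictionary
    cfg.addExtStopWordDictFile("extra_domain_stop.dic"); // hypothetical extra stopwords
    analyzer.setMode(AnalyzerMode::IK_Max_Word);
    analyzer.setIKConfiguration(cfg);
}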