Skip to content

Commit

Permalink
resolve comments
Browse files Browse the repository at this point in the history
  • Loading branch information
Sayan Shaw committed Feb 21, 2025
1 parent 66ae0a4 commit 0d3ed58
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 5 deletions.
6 changes: 3 additions & 3 deletions operators/tokenizer/bpe_kernels.cc
Original file line number Diff line number Diff line change
Expand Up @@ -682,11 +682,11 @@ void JsonFastTokenizer::UpdateTokenizer(const TokenJsonConfig& config, const jso
}
}

const json added_tokens_decoder = config.added_tokens_decoder;
std::shared_ptr<json> added_tokens_decoder = config.added_tokens_decoder;

// Add any tokens from the added_tokens_decoder that were missing in added_tokens_
if (!added_tokens_decoder.empty()) {
for (const auto& [id_str, token] : added_tokens_decoder.items()) {
if (added_tokens_decoder && !added_tokens_decoder->empty()) {
for (const auto& [id_str, token] : added_tokens_decoder->items()) {
int id = std::stoi(id_str); // Convert key (ID) from string to integer

// Check if this token is already in the added_tokens_
Expand Down
4 changes: 2 additions & 2 deletions operators/tokenizer/tokenizer_jsconfig.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class TokenJsonConfig final {
~TokenJsonConfig() {}
using json = nlohmann::json;
using json_pointer = nlohmann::json_pointer<std::string>;
json added_tokens_decoder;
std::shared_ptr<json> added_tokens_decoder;

public:
OrtxStatus AppendModuleJson(json& json_config) {
Expand Down Expand Up @@ -193,7 +193,7 @@ class TokenJsonConfig final {
}

// Store added_tokens_decoder to add any missed tokens into added_tokens in UpdateTokenizer
added_tokens_decoder = json_config.value("added_tokens_decoder", json::object());
added_tokens_decoder = std::make_shared<json>(json_config.value("added_tokens_decoder", json::object()));

auto module_cfg = tok_dir / "tokenizer_module.json";
if (module_cfg.exists()) {
Expand Down
21 changes: 21 additions & 0 deletions test/pp_api_test/test_tokenizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -590,3 +590,24 @@ TEST(OrtxTokenizerTest, ChatGLMTokenizer) {
115, 176, 3867, 162, 9251, 2829, 4, 102, 220, 6, 5, 63977, 91446,
63829, 130009, 130008, 130008, 130008, 130008, 5, 4, 4, 21, 9, 130001, 130004}));
}

// Loads the tokenizer stored under data/added-tokens and checks that an input
// made only of special-token strings is tokenized to the expected ID sequence.
// NOTE(review): the leading ID 1 is presumably a BOS token emitted by the
// tokenizer rather than by any of the three input tokens — confirm against the
// test data.
TEST(OrtxTokenizerTest, AddedTokensTest) {
  auto tokenizer = std::make_unique<ort_extensions::TokenizerImpl>();
  auto status = tokenizer->Load("data/added-tokens");
  if (!status.IsOk()) {
    std::cout << status.ToString() << std::endl;
    tokenizer.reset();
  }

  ASSERT_NE(tokenizer.get(), nullptr) << "Tokenizer is null, stopping the test.";

  std::vector<std::string_view> input = {"<|endoftext|><|assistant|><|placeholder1|>"};
  std::vector<extTokenId_t> EXPECTED_IDS_0 = {1, 32000, 32001, 32002};

  std::vector<std::vector<extTokenId_t>> token_ids;
  status = tokenizer->Tokenize(input, token_ids);

  // Both checks are fatal: token_ids[0] is read below, and indexing an empty
  // result vector would be undefined behaviour instead of a test failure.
  ASSERT_TRUE(status.IsOk()) << status.ToString();
  ASSERT_EQ(token_ids.size(), input.size());

  DumpTokenIds(token_ids);
  EXPECT_EQ(token_ids[0], EXPECTED_IDS_0);
}

0 comments on commit 0d3ed58

Please sign in to comment.