Commit

Add missing tokens from added_tokens_decoder into added_tokens (#890)
sayanshaw24 authored Feb 21, 2025
1 parent 8ea1b6e commit 1bb7ab2
Showing 5 changed files with 370 additions and 0 deletions.
26 changes: 26 additions & 0 deletions operators/tokenizer/bpe_kernels.cc
@@ -682,6 +682,32 @@ void JsonFastTokenizer::UpdateTokenizer(const TokenJsonConfig& config, const jso
}
}

std::shared_ptr<json> added_tokens_decoder = config.added_tokens_decoder;

// Add any tokens from the added_tokens_decoder that were missing in added_tokens_
if (added_tokens_decoder && !added_tokens_decoder->empty()) {
for (const auto& [id_str, token] : added_tokens_decoder->items()) {
int id = std::stoi(id_str); // Convert key (ID) from string to integer

// Check if this token is already in the added_tokens_
auto existing_token = added_tokens_.find(ustring(token.value("content", "")));
if (existing_token == added_tokens_.end()) { // Token doesn't exist yet
// Prepare a new token (populate id's with the keys from added_tokens_decoder)
AddedToken added_token;
added_token.id_ = id;
added_token.content_ = token.value("content", "");
added_token.lstrip_ = token.value("lstrip", false);
added_token.normalized_ = token.value("normalized", false);
added_token.rstrip_ = token.value("rstrip", false);
added_token.single_word_ = token.value("single_word", false);
added_token.special_ = token.value("special", false);

// Add the new token to added_tokens_
added_tokens_.emplace(ustring(added_token.content_), added_token);
}
}
}

// iterate the added_tokens_ map and set the special tokens
for (const auto& [key, added_token] : added_tokens_) {
if (added_token.content_ == config.bos_token_) {
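For readers outside the codebase, here is a minimal, self-contained sketch of the merge step above, using nlohmann::json directly. AddedToken is simplified (std::string keys instead of the library's ustring), and MergeAddedTokensDecoder plus the main() harness are illustrative names, not part of onnxruntime-extensions:

#include <iostream>
#include <map>
#include <string>

#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Simplified stand-in for the AddedToken struct used in bpe_kernels.cc;
// the real map is keyed by ustring rather than std::string.
struct AddedToken {
  int id_{};
  std::string content_;
  bool lstrip_{}, normalized_{}, rstrip_{}, single_word_{}, special_{};
};

// Merge entries from an added_tokens_decoder object into the token map,
// skipping contents that are already present (mirrors the hunk above).
void MergeAddedTokensDecoder(const json& decoder, std::map<std::string, AddedToken>& added_tokens) {
  for (const auto& [id_str, token] : decoder.items()) {
    std::string content = token.value("content", "");
    if (added_tokens.find(content) != added_tokens.end()) continue;  // token already known

    AddedToken t;
    t.id_ = std::stoi(id_str);  // JSON object keys are strings, so convert to int
    t.content_ = content;
    t.lstrip_ = token.value("lstrip", false);
    t.normalized_ = token.value("normalized", false);
    t.rstrip_ = token.value("rstrip", false);
    t.single_word_ = token.value("single_word", false);
    t.special_ = token.value("special", false);
    added_tokens.emplace(content, t);
  }
}

int main() {
  // Fragment shaped like the test fixture's tokenizer_config.json
  json decoder = json::parse(R"({
    "32000": {"content": "<|endoftext|>", "special": true},
    "32001": {"content": "<|assistant|>", "special": true}
  })");

  std::map<std::string, AddedToken> added_tokens;  // e.g. previously populated from tokenizer.json
  MergeAddedTokensDecoder(decoder, added_tokens);

  for (const auto& [content, tok] : added_tokens)
    std::cout << tok.id_ << " -> " << content << "\n";  // prints each merged content with its id
}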
4 changes: 4 additions & 0 deletions operators/tokenizer/tokenizer_jsconfig.hpp
@@ -46,6 +46,7 @@ class TokenJsonConfig final {
~TokenJsonConfig() {}
using json = nlohmann::json;
using json_pointer = nlohmann::json_pointer<std::string>;
std::shared_ptr<json> added_tokens_decoder;

public:
OrtxStatus AppendModuleJson(json& json_config) {
@@ -191,6 +192,9 @@ class TokenJsonConfig final {
return OrtxStatus(kOrtxErrorInvalidArgument, "Failed to parse config json.");
}

// Store added_tokens_decoder to add any missed tokens into added_tokens in UpdateTokenizer
added_tokens_decoder = std::make_shared<json>(json_config.value("added_tokens_decoder", json::object()));

auto module_cfg = tok_dir / "tokenizer_module.json";
if (module_cfg.exists()) {
std::ifstream module_ifs = module_cfg.open();
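A quick sketch of why the json::object() default above matters: when a tokenizer_config.json has no added_tokens_decoder entry, value() falls back to an empty object, so the !added_tokens_decoder->empty() check in UpdateTokenizer simply skips the merge. The JSON literals and main() harness below are illustrative:

#include <iostream>
#include <memory>

#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
  // A config with an added_tokens_decoder section...
  json with_decoder = json::parse(R"({"added_tokens_decoder": {"0": {"content": "<unk>"}}})");
  // ...and one without it (some tokenizer_config.json files omit the key entirely).
  json without_decoder = json::parse(R"({"bos_token": "<s>"})");

  // value() returns the supplied default when the key is absent,
  // so the shared_ptr member always holds a valid (possibly empty) object.
  auto a = std::make_shared<json>(with_decoder.value("added_tokens_decoder", json::object()));
  auto b = std::make_shared<json>(without_decoder.value("added_tokens_decoder", json::object()));

  std::cout << std::boolalpha
            << "a empty: " << a->empty() << "\n"   // false
            << "b empty: " << b->empty() << "\n";  // true
}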
189 changes: 189 additions & 0 deletions test/data/added-tokens/tokenizer.json
@@ -0,0 +1,189 @@
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "</s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32003,
"content": "<|placeholder2|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32004,
"content": "<|placeholder3|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32005,
"content": "<|placeholder4|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32006,
"content": "<|system|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32007,
"content": "<|end|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32008,
"content": "<|placeholder5|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32009,
"content": "<|placeholder6|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32010,
"content": "<|user|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "Prepend",
"prepend": ""
},
{
"type": "Replace",
"pattern": {
"String": " "
},
"content": ""
}
]
},
"pre_tokenizer": null,
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {}
},
"decoder": {
"type": "Sequence",
"decoders": [
{
"type": "Replace",
"pattern": {
"String": ""
},
"content": " "
},
{
"type": "ByteFallback"
},
{
"type": "Fuse"
},
{
"type": "Strip",
"content": " ",
"start": 1,
"stop": 0
}
]
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "<unk>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": true,
"byte_fallback": true,
"vocab": {
"<unk>": 0,
"<s>": 1,
"</s>": 2,
"<0x0A>": 13
},
"merges": {}
}
}
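Note that this fixture's added_tokens list jumps from IDs 0-2 to 32003-32010, omitting 32000 (<|endoftext|>), 32001 (<|assistant|>), and 32002 (<|placeholder1|>). Those three appear only under added_tokens_decoder in the tokenizer_config.json below, which is exactly the gap the new merge logic covers.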
130 changes: 130 additions & 0 deletions test/data/added-tokens/tokenizer_config.json
@@ -0,0 +1,130 @@
{
"add_bos_token": true,
"add_eos_token": false,
"added_tokens_decoder": {
"0": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"1": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"2": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32000": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32001": {
"content": "<|assistant|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32002": {
"content": "<|placeholder1|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32003": {
"content": "<|placeholder2|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32004": {
"content": "<|placeholder3|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32005": {
"content": "<|placeholder4|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32006": {
"content": "<|system|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32007": {
"content": "<|end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32008": {
"content": "<|placeholder5|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32009": {
"content": "<|placeholder6|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32010": {
"content": "<|user|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"bos_token": "<s>",
"chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
"clean_up_tokenization_spaces": false,
"eos_token": "<|endoftext|>",
"legacy": false,
"model_max_length": 131072,
"pad_token": "<|endoftext|>",
"padding_side": "left",
"sp_model_kwargs": {},
"tokenizer_class": "LlamaTokenizer",
"unk_token": "<unk>",
"use_default_system_prompt": false
}
21 changes: 21 additions & 0 deletions test/pp_api_test/test_tokenizer.cc
@@ -590,3 +590,24 @@ TEST(OrtxTokenizerTest, ChatGLMTokenizer) {
115, 176, 3867, 162, 9251, 2829, 4, 102, 220, 6, 5, 63977, 91446,
63829, 130009, 130008, 130008, 130008, 130008, 5, 4, 4, 21, 9, 130001, 130004}));
}

TEST(OrtxTokenizerTest, AddedTokensTest) {
auto tokenizer = std::make_unique<ort_extensions::TokenizerImpl>();
auto status = tokenizer->Load("data/added-tokens");
if (!status.IsOk()) {
std::cout << status.ToString() << std::endl;
tokenizer.reset();
}

ASSERT_NE(tokenizer.get(), nullptr) << "Tokenizer is null, stopping the test.";

std::vector<std::string_view> input = {"<|endoftext|><|assistant|><|placeholder1|>"};
std::vector<extTokenId_t> EXPECTED_IDS_0 = {1, 32000, 32001, 32002};

std::vector<std::vector<extTokenId_t>> token_ids;
status = tokenizer->Tokenize(input, token_ids);
EXPECT_TRUE(status.IsOk());
EXPECT_EQ(token_ids.size(), input.size());
DumpTokenIds(token_ids);
EXPECT_EQ(token_ids[0], EXPECTED_IDS_0);
}
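The expected IDs line up with the fixture: 1 is the <s> BOS token (add_bos_token is true in tokenizer_config.json), and 32000-32002 map to <|endoftext|>, <|assistant|>, and <|placeholder1|>, which are resolvable only because their added_tokens_decoder entries are now merged into added_tokens_.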
