Skip to content

Add full support for MLC_ENABLE_SENTENCEPIECE_TOKENIZER #85

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
*.lib

build
release

# Executables
*.exe
Expand Down
26 changes: 19 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ option(MSGPACK_USE_BOOST "Use Boost libraried" OFF)
add_subdirectory(msgpack)

option(MLC_ENABLE_SENTENCEPIECE_TOKENIZER "Enable SentencePiece tokenizer" ON)
message(DEBUG "MLC_ENABLE_SENTENCEPIECE_TOKENIZER= ${MLC_ENABLE_SENTENCEPIECE_TOKENIZER}")

if(MSVC)
set(TOKENIZERS_RUST_LIB "${TOKENIZERS_CPP_CARGO_BINARY_DIR}/tokenizers_c.lib")
Expand Down Expand Up @@ -153,16 +154,23 @@ add_custom_command(

set(
TOKENIZER_CPP_SRCS
src/sentencepiece_tokenizer.cc
src/huggingface_tokenizer.cc
src/rwkv_world_tokenizer.cc
)
if (${MLC_ENABLE_SENTENCEPIECE_TOKENIZER})
list(APPEND TOKENIZER_CPP_SRCS src/sentencepiece_tokenizer.cc)
endif()

add_library(tokenizers_cpp STATIC ${TOKENIZER_CPP_SRCS})
target_include_directories(tokenizers_cpp PRIVATE sentencepiece/src)

if (${MLC_ENABLE_SENTENCEPIECE_TOKENIZER})
target_include_directories(tokenizers_cpp PRIVATE sentencepiece/src)
endif()
target_include_directories(tokenizers_cpp PRIVATE msgpack/include)
target_include_directories(tokenizers_cpp PUBLIC ${TOKENIZERS_CPP_INCLUDE})
if (MLC_ENABLE_SENTENCEPIECE_TOKENIZER STREQUAL "ON")
target_compile_definitions(tokenizers_cpp PUBLIC MLC_ENABLE_SENTENCEPIECE_TOKENIZER)

if (${MLC_ENABLE_SENTENCEPIECE_TOKENIZER})
target_compile_definitions(tokenizers_cpp PUBLIC MLC_ENABLE_SENTENCEPIECE_TOKENIZER)
endif ()
target_link_libraries(tokenizers_cpp PRIVATE msgpack-cxx)

Expand All @@ -178,10 +186,14 @@ if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE})
endmacro (set_xcode_property)
endif()
add_subdirectory(sentencepiece sentencepiece EXCLUDE_FROM_ALL)
if (${MLC_ENABLE_SENTENCEPIECE_TOKENIZER})
add_subdirectory(sentencepiece sentencepiece EXCLUDE_FROM_ALL)
endif()

add_library(tokenizers_c INTERFACE ${TOKENIZERS_RUST_LIB})
target_link_libraries(tokenizers_c INTERFACE ${TOKENIZERS_RUST_LIB} ${TOKENIZERS_C_LINK_LIBS})

target_link_libraries(tokenizers_cpp PRIVATE tokenizers_c sentencepiece-static ${TOKENIZERS_CPP_LINK_LIBS})
if (${MLC_ENABLE_SENTENCEPIECE_TOKENIZER})
set(SPLIB "sentencepiece-static")
endif()
target_link_libraries(tokenizers_cpp PRIVATE tokenizers_c ${SPLIB} ${TOKENIZERS_CPP_LINK_LIBS})
target_include_directories(tokenizers_cpp PUBLIC ${TOKENIZERS_CPP_INCLUDE})
3 changes: 2 additions & 1 deletion example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ if(NOT MSVC)
check_cxx_compiler_flag("-std=c++17" SUPPORT_CXX17)
set(CMAKE_CXX_FLAGS "-std=c++17 ${CMAKE_CXX_FLAGS}")
set(CMAKE_CUDA_STANDARD 17)
find_package(Threads REQUIRED)
else()
check_cxx_compiler_flag("/std:c++17" SUPPORT_CXX17)
set(CMAKE_CXX_FLAGS "/std:c++17 ${CMAKE_CXX_FLAGS}")
Expand All @@ -24,4 +25,4 @@ target_include_directories(example PRIVATE ${TOKENZIER_CPP_PATH}/include)

# You can link tokenizers_cpp, it will automatically link tokenizers_c
# and sentencepiece library
target_link_libraries(example PRIVATE tokenizers_cpp)
target_link_libraries(example PRIVATE tokenizers_cpp Threads::Threads)
18 changes: 14 additions & 4 deletions example/build_and_run.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
#/bin/bash

echo "Usage: $0 (ENABLE_SENTENCEPIECE_TOKENIZER default value 1=ON)"
# build
mkdir -p build
cd build
cmake ..
make -j8
echo
echo "cmake ..."
echo "CXX=$CXX"
g++ --version

ENABLESP=${1:-ON}

cmake .. -DMLC_ENABLE_SENTENCEPIECE_TOKENIZER=$ENABLESP || exit 1
echo
echo "make..."
make -j8 || exit 1
cd ..
# get example files

Expand All @@ -26,8 +35,9 @@ fi
if [ ! -f "merges.txt" ]; then
wget https://huggingface.co/Qwen/Qwen2.5-3B-Instruct/resolve/main/merges.txt
fi

cd ..

# run
echo "---Running example----"
./build/example
./build/example || exit 1
5 changes: 5 additions & 0 deletions example/example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
using tokenizers::Tokenizer;

std::string LoadBytesFromFile(const std::string& path) {
std::cout << "Loading " << path << std::endl;
std::ifstream fs(path, std::ios::in | std::ios::binary);
if (fs.fail()) {
std::cerr << "Cannot open " << path << std::endl;
Expand Down Expand Up @@ -60,6 +61,7 @@ void TestTokenizer(std::unique_ptr<Tokenizer> tok, bool print_vocab = false,
std::cout << std::endl;
}

#ifdef MLC_ENABLE_SENTENCEPIECE_TOKENIZER
// Sentencepiece tokenizer
// - dist/tokenizer.model
void SentencePieceTokenizerExample() {
Expand All @@ -80,6 +82,7 @@ void SentencePieceTokenizerExample() {

TestTokenizer(std::move(tok), false, true);
}
#endif

// HF tokenizer
// - dist/tokenizer.json
Expand Down Expand Up @@ -141,7 +144,9 @@ void RWKVWorldTokenizerExample() {
}

int main(int argc, char* argv[]) {
#ifdef MLC_ENABLE_SENTENCEPIECE_TOKENIZER
SentencePieceTokenizerExample();
#endif
HuggingFaceTokenizerExample();
HuggingFaceBPETokenizerExample();
RWKVWorldTokenizerExample();
Expand Down