Skip to content

Commit 1886d97

Browse files
Add full support for MLC_ENABLE_SENTENCEPIECE_TOKENIZER
Allows sentencepiece to be skipped entirely for both the libraries and the example. The default is unchanged (ON).
1 parent f777109 commit 1886d97

File tree

5 files changed

+41
-12
lines changed

5 files changed

+41
-12
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
*.lib
2828

2929
build
30+
release
3031

3132
# Executables
3233
*.exe

CMakeLists.txt

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ option(MSGPACK_USE_BOOST "Use Boost libraried" OFF)
109109
add_subdirectory(msgpack)
110110

111111
option(MLC_ENABLE_SENTENCEPIECE_TOKENIZER "Enable SentencePiece tokenizer" ON)
112+
message(DEBUG "MLC_ENABLE_SENTENCEPIECE_TOKENIZER= ${MLC_ENABLE_SENTENCEPIECE_TOKENIZER}")
112113

113114
if(MSVC)
114115
set(TOKENIZERS_RUST_LIB "${TOKENIZERS_CPP_CARGO_BINARY_DIR}/tokenizers_c.lib")
@@ -153,16 +154,23 @@ add_custom_command(
153154

154155
set(
155156
TOKENIZER_CPP_SRCS
156-
src/sentencepiece_tokenizer.cc
157157
src/huggingface_tokenizer.cc
158158
src/rwkv_world_tokenizer.cc
159159
)
160+
if (${MLC_ENABLE_SENTENCEPIECE_TOKENIZER})
161+
list(APPEND TOKENIZER_CPP_SRCS src/sentencepiece_tokenizer.cc)
162+
endif()
163+
160164
add_library(tokenizers_cpp STATIC ${TOKENIZER_CPP_SRCS})
161-
target_include_directories(tokenizers_cpp PRIVATE sentencepiece/src)
165+
166+
if (${MLC_ENABLE_SENTENCEPIECE_TOKENIZER})
167+
target_include_directories(tokenizers_cpp PRIVATE sentencepiece/src)
168+
endif()
162169
target_include_directories(tokenizers_cpp PRIVATE msgpack/include)
163170
target_include_directories(tokenizers_cpp PUBLIC ${TOKENIZERS_CPP_INCLUDE})
164-
if (MLC_ENABLE_SENTENCEPIECE_TOKENIZER STREQUAL "ON")
165-
target_compile_definitions(tokenizers_cpp PUBLIC MLC_ENABLE_SENTENCEPIECE_TOKENIZER)
171+
172+
if (${MLC_ENABLE_SENTENCEPIECE_TOKENIZER})
173+
target_compile_definitions(tokenizers_cpp PUBLIC MLC_ENABLE_SENTENCEPIECE_TOKENIZER)
166174
endif ()
167175
target_link_libraries(tokenizers_cpp PRIVATE msgpack-cxx)
168176

@@ -178,10 +186,14 @@ if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
178186
XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE})
179187
endmacro (set_xcode_property)
180188
endif()
181-
add_subdirectory(sentencepiece sentencepiece EXCLUDE_FROM_ALL)
189+
if (${MLC_ENABLE_SENTENCEPIECE_TOKENIZER})
190+
add_subdirectory(sentencepiece sentencepiece EXCLUDE_FROM_ALL)
191+
endif()
182192

183193
add_library(tokenizers_c INTERFACE ${TOKENIZERS_RUST_LIB})
184194
target_link_libraries(tokenizers_c INTERFACE ${TOKENIZERS_RUST_LIB} ${TOKENIZERS_C_LINK_LIBS})
185-
186-
target_link_libraries(tokenizers_cpp PRIVATE tokenizers_c sentencepiece-static ${TOKENIZERS_CPP_LINK_LIBS})
195+
if (${MLC_ENABLE_SENTENCEPIECE_TOKENIZER})
196+
set(SPLIB "sentencepiece-static")
197+
endif()
198+
target_link_libraries(tokenizers_cpp PRIVATE tokenizers_c ${SPLIB} ${TOKENIZERS_CPP_LINK_LIBS})
187199
target_include_directories(tokenizers_cpp PUBLIC ${TOKENIZERS_CPP_INCLUDE})

example/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ if(NOT MSVC)
88
check_cxx_compiler_flag("-std=c++17" SUPPORT_CXX17)
99
set(CMAKE_CXX_FLAGS "-std=c++17 ${CMAKE_CXX_FLAGS}")
1010
set(CMAKE_CUDA_STANDARD 17)
11+
find_package(Threads REQUIRED)
1112
else()
1213
check_cxx_compiler_flag("/std:c++17" SUPPORT_CXX17)
1314
set(CMAKE_CXX_FLAGS "/std:c++17 ${CMAKE_CXX_FLAGS}")
@@ -24,4 +25,4 @@ target_include_directories(example PRIVATE ${TOKENZIER_CPP_PATH}/include)
2425

2526
# You can link tokenizers_cpp, it will automatically link tokenizers_c
2627
# and sentencepiece library
27-
target_link_libraries(example PRIVATE tokenizers_cpp)
28+
target_link_libraries(example PRIVATE tokenizers_cpp Threads::Threads)

example/build_and_run.sh

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,19 @@
11
#/bin/bash
2-
2+
echo "Usage: $0 (ENABLE_SENTENCEPIECE_TOKENIZER default value 1=ON)"
33
# build
44
mkdir -p build
55
cd build
6-
cmake ..
7-
make -j8
6+
echo
7+
echo "cmake ..."
8+
echo "CXX=$CXX"
9+
g++ --version
10+
11+
ENABLESP=${1:-ON}
12+
13+
cmake .. -DMLC_ENABLE_SENTENCEPIECE_TOKENIZER=$ENABLESP || exit 1
14+
echo
15+
echo "make..."
16+
make -j8 || exit 1
817
cd ..
918
# get example files
1019

@@ -26,8 +35,9 @@ fi
2635
if [ ! -f "merges.txt" ]; then
2736
wget https://huggingface.co/Qwen/Qwen2.5-3B-Instruct/resolve/main/merges.txt
2837
fi
38+
2939
cd ..
3040

3141
# run
3242
echo "---Running example----"
33-
./build/example
43+
./build/example || exit 1

example/example.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
using tokenizers::Tokenizer;
1010

1111
std::string LoadBytesFromFile(const std::string& path) {
12+
std::cout << "Loading " << path << std::endl;
1213
std::ifstream fs(path, std::ios::in | std::ios::binary);
1314
if (fs.fail()) {
1415
std::cerr << "Cannot open " << path << std::endl;
@@ -60,6 +61,7 @@ void TestTokenizer(std::unique_ptr<Tokenizer> tok, bool print_vocab = false,
6061
std::cout << std::endl;
6162
}
6263

64+
#ifdef MLC_ENABLE_SENTENCEPIECE_TOKENIZER
6365
// Sentencepiece tokenizer
6466
// - dist/tokenizer.model
6567
void SentencePieceTokenizerExample() {
@@ -80,6 +82,7 @@ void SentencePieceTokenizerExample() {
8082

8183
TestTokenizer(std::move(tok), false, true);
8284
}
85+
#endif
8386

8487
// HF tokenizer
8588
// - dist/tokenizer.json
@@ -141,7 +144,9 @@ void RWKVWorldTokenizerExample() {
141144
}
142145

143146
int main(int argc, char* argv[]) {
147+
#ifdef MLC_ENABLE_SENTENCEPIECE_TOKENIZER
144148
SentencePieceTokenizerExample();
149+
#endif
145150
HuggingFaceTokenizerExample();
146151
HuggingFaceBPETokenizerExample();
147152
RWKVWorldTokenizerExample();

0 commit comments

Comments
 (0)