Skip to content

Commit 0d17887

Browse files
committed
[Windows] Fix build issues using Clang-CL on Windows, add CI
1 parent 00d86aa commit 0d17887

File tree

8 files changed

+98
-8
lines changed

8 files changed

+98
-8
lines changed

.github/workflows/pull.yml

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,59 @@ jobs:
3434
3535
# Run tests
3636
pytest
37+
38+
windows:
39+
uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
40+
with:
41+
runner: windows.4xlarge
42+
submodules: 'recursive'
43+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
44+
script: |
45+
conda init powershell
46+
powershell -Command "& {
47+
Set-PSDebug -Trace 1
48+
\$ErrorActionPreference = 'Stop'
49+
\$PSNativeCommandUseErrorActionPreference = \$true
50+
51+
# Create a symbolic link to work around path length limitations. This gives a much shorter
52+
# path, as the default checkout directory is deeply nested.
53+
\$workingDir = \$PWD.Path
54+
cd \$Env:GITHUB_WORKSPACE
55+
New-Item -ItemType SymbolicLink -Path tk -Value \$workingDir
56+
cd tk
57+
58+
# Run C++ unit tests
59+
cmake -DCMAKE_BUILD_TYPE=Debug test -Bbuild/test -T ClangCL
60+
cmake --build build/test -j9 --config Debug
61+
if (\$LASTEXITCODE -ne 0) {
62+
Write-Host "Build was not successful. Exit code: \$LASTEXITCODE."
63+
exit \$LASTEXITCODE
64+
}
65+
66+
Push-Location build/test
67+
ctest
68+
if (\$LASTEXITCODE -ne 0) {
69+
Write-Host "Unit tests were not successful. Exit code: \$LASTEXITCODE."
70+
exit \$LASTEXITCODE
71+
}
72+
Pop-Location
73+
74+
conda create --yes --quiet -n tokenizers python=3.12
75+
conda activate tokenizers
76+
77+
# Install tokenizers
78+
pip install . -v
79+
if (\$LASTEXITCODE -ne 0) {
80+
Write-Host "Python installation was unsuccessful. Exit code: \$LASTEXITCODE."
81+
exit \$LASTEXITCODE
82+
}
83+
pip install pytest blobfile transformers>=4.53.1
84+
85+
# Run python tests
86+
pytest
87+
if (\$LASTEXITCODE -ne 0) {
88+
Write-Host "Python tests were not successful. Exit code: \$LASTEXITCODE."
89+
Start-Sleep -Seconds 600 # Debug - keep alive to give time to SSH
90+
exit \$LASTEXITCODE
91+
}
92+
}"

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,7 @@ pip-out/
3434
*~
3535
.~lock.*
3636
*.idea
37+
38+
*.so
39+
*.dylib
40+
*.pyd

CMakeLists.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@ include(CMakePackageConfigHelpers)
2828
include(Utils.cmake)
2929

3030
# Ignore weak attribute warning
31-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes")
31+
if(NOT MSVC)
32+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes")
33+
endif()
3234

3335
set(ABSL_ENABLE_INSTALL ON)
3436
set(ABSL_PROPAGATE_CXX_STD ON)
@@ -49,7 +51,7 @@ endif()
4951

5052
add_subdirectory(
5153
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
52-
${CMAKE_CURRENT_BINARY_DIR}/sentencepiece-build
54+
${CMAKE_CURRENT_BINARY_DIR}/sp-build
5355
EXCLUDE_FROM_ALL
5456
)
5557

include/pytorch/tokenizers/compiler.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
/**
10+
* @file
11+
* Compiler and platform-specific declarations.
12+
*/
13+
14+
#ifdef _WIN32
15+
#include <BaseTsd.h>
16+
// ssize_t isn't available on Windows. Alias it to the Windows SSIZE_T value.
17+
typedef SSIZE_T size_t;
18+
#endif

include/pytorch/tokenizers/tiktoken.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
// Local
2121
#include <pytorch/tokenizers/bpe_tokenizer_base.h>
22+
#include <pytorch/tokenizers/compiler.h>
2223
#include <pytorch/tokenizers/regex.h>
2324
#include <pytorch/tokenizers/result.h>
2425
#include <pytorch/tokenizers/tokenizer.h>

setup.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,6 @@ class CMakeBuild(build_ext):
3030
def build_extension(self, ext): # noqa C901
3131
extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
3232

33-
# Ensure the extension goes into the pytorch_tokenizers package directory
34-
extdir = os.path.join(extdir, "pytorch_tokenizers")
35-
3633
# Required for auto-detection & inclusion of auxiliary "native" libs
3734
if not extdir.endswith(os.path.sep):
3835
extdir += os.path.sep
@@ -55,6 +52,10 @@ def build_extension(self, ext): # noqa C901
5552
]
5653
build_args = ["--target", "pytorch_tokenizers_cpp"]
5754

55+
# Use Clang for Windows builds.
56+
if sys.platform == "win32":
57+
cmake_args += ["-T ClangCL"]
58+
5859
# Adding CMake arguments set as environment variable
5960
# (needed e.g. to build for ARM OSX on conda-forge)
6061
if "CMAKE_ARGS" in os.environ:
@@ -132,7 +133,7 @@ def build_extension(self, ext): # noqa C901
132133
long_description_content_type="text/markdown",
133134
url="https://github.com/meta-pytorch/tokenizers",
134135
packages=find_packages(),
135-
ext_modules=[CMakeExtension("pytorch_tokenizers_cpp")],
136+
ext_modules=[CMakeExtension("pytorch_tokenizers.pytorch_tokenizers_cpp")],
136137
cmdclass={"build_ext": CMakeBuild},
137138
zip_safe=False,
138139
python_requires=">=3.10",

src/hf_tokenizer.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,14 @@ Error HFTokenizer::load(const std::string& path) {
3434
std::string model_config_json = "";
3535
if (fs::is_directory(path)) {
3636
const fs::path root(path);
37-
model_json = root / "tokenizer.json";
37+
model_json = (root / "tokenizer.json").string();
3838
if (!fs::exists(model_json)) {
3939
TK_LOG(Info, "no tokenizer.json found in %s", path.c_str());
4040
return Error::LoadFailure;
4141
}
4242
const auto model_config_json_path = root / "tokenizer_config.json";
4343
if (fs::exists(model_config_json_path)) {
44-
model_config_json = model_config_json_path;
44+
model_config_json = model_config_json_path.string();
4545
}
4646
}
4747

test/test_tiktoken.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,11 @@ TEST_F(TiktokenTest, ConstructionWithInvalidBOSIndex) {
124124
std::vector<std::string>{"<|end_of_text|>"}),
125125
1,
126126
0),
127+
#if !GTEST_OS_WINDOWS
127128
::testing::KilledBySignal(SIGABRT),
129+
#else
130+
[](int exit_code) { return exit_code != 0; },
131+
#endif
128132
"");
129133
#endif
130134
}
@@ -139,7 +143,11 @@ TEST_F(TiktokenTest, ConstructionWithInvalidEOSIndex) {
139143
std::vector<std::string>{"<|begin_of_text|>"}),
140144
0,
141145
1),
146+
#if !GTEST_OS_WINDOWS
142147
::testing::KilledBySignal(SIGABRT),
148+
#else
149+
[](int exit_code) { return exit_code != 0; },
150+
#endif
143151
"");
144152
#endif
145153
}

0 commit comments

Comments
 (0)