Skip to content

Update CI

Update CI #2795

Workflow file for this run

# Build-and-test workflow for Linux runners.
name: "Build & Test (Linux)"

on:
  # Pushes that land on main.
  push:
    branches:
      - main
  # Pull requests targeting main. The "labeled" activity type re-triggers the
  # workflow when a label is added to an existing PR.
  pull_request:
    branches:
      - main
    types:
      - opened
      - synchronize
      - reopened
      - labeled

jobs:
build:
  # Builds the project with HTTP + etcd + ASAN enabled, runs `make test`
  # against a local etcd and HTTP metadata server, then builds a Python wheel.
  # Runs once per Python version in the matrix.
  #
  # Gate: pushes, newly opened PRs, or PRs carrying the 'run-ci' label.
  # `github.event.action` and the PR label list are both empty on push events,
  # so without the explicit push clause this job would be skipped on every
  # push to main even though the workflow declares a push trigger.
  if: >-
    github.event_name == 'push' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci')
  runs-on: ubuntu-22.04
  strategy:
    matrix:
      python-version: ['3.10', '3.12']
  env:
    SCCACHE_GHA_ENABLED: "true"
  steps:
    - uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install and start etcd
      run: |
        wget https://github.com/etcd-io/etcd/releases/download/v3.6.1/etcd-v3.6.1-linux-amd64.tar.gz
        tar xzf etcd-v3.6.1-linux-amd64.tar.gz
        sudo mv etcd-v3.6.1-linux-amd64/etcd* /usr/local/bin/
        etcd --advertise-client-urls http://127.0.0.1:2379 --listen-client-urls http://127.0.0.1:2379 &
        # Poll for readiness instead of relying on a fixed sleep, which is
        # flaky on a slow runner.
        etcd_ready=false
        for _ in $(seq 1 30); do
          if etcdctl --endpoints=http://127.0.0.1:2379 endpoint health; then
            etcd_ready=true
            break
          fi
          sleep 1
        done
        if [ "$etcd_ready" != true ]; then
          echo "etcd did not become healthy within 30 seconds"
          exit 1
        fi
      shell: bash
    - name: Free up disk space
      run: |
        sudo rm -rf /usr/share/dotnet
        sudo rm -rf /opt/ghc
        sudo rm -rf /opt/hostedtoolcache/CodeQL
    - name: Install CUDA Toolkit
      # NOTE(review): this ref was mangled to "[email protected]" in the source;
      # the action name is restored, the version tag is a best guess - confirm.
      uses: Jimver/cuda-toolkit@v0.2.24
      with:
        cuda: '12.8.1'
        linux-local-args: '["--toolkit"]'
        method: 'network'
        sub-packages: '["nvcc"]'
    - name: Run sccache-cache
      # NOTE(review): ref was mangled in the source; version tag is a best
      # guess - confirm.
      uses: mozilla-actions/sccache-action@v0.0.9
    - name: Configure sccache
      # Re-exports the runner's cache endpoint and token so later `run` steps
      # can see them as environment variables.
      uses: actions/github-script@v7
      with:
        script: |
          core.exportVariable('ACTIONS_RESULTS_URL', process.env.ACTIONS_RESULTS_URL || '');
          core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
    - name: Run sccache stat for check
      shell: bash
      run: ${SCCACHE_PATH} --show-stats
    - name: Configure project
      run: |
        sudo apt update -y
        sudo bash -x dependencies.sh -y
        mkdir build
        cd build
        cmake .. -DUSE_HTTP=ON -DUSE_ETCD=ON -DSTORE_USE_ETCD=ON -DENABLE_ASAN=ON -DENABLE_SCCACHE=ON
      shell: bash
    - name: Build project
      run: |
        cd build
        # A bare `make -j` places no limit on parallel jobs and can exhaust
        # runner memory; cap it at the number of available cores.
        make -j"$(nproc)"
        sudo make install
      shell: bash
    - name: Build nvlink_allocator.so
      run: |
        mkdir -p build/mooncake-transfer-engine/nvlink-allocator
        cd mooncake-transfer-engine/nvlink-allocator
        export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH
        bash build.sh ../../build/mooncake-transfer-engine/nvlink-allocator/
      shell: bash
    - name: Start Metadata Server
      run: |
        cd mooncake-transfer-engine/example/http-metadata-server-python
        pip install aiohttp
        python ./bootstrap_server.py &
      shell: bash
    - name: Test (in build env)
      run: |
        cd build
        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
        ldconfig -v || echo "always continue"
        MC_METADATA_SERVER=http://127.0.0.1:8080/metadata DEFAULT_KV_LEASE_TTL=500 make test -j ARGS="-V"
      shell: bash
    - name: Generate Python version tag
      id: generate_tag_build
      # Strips the dot from the matrix version, e.g. 3.10 -> 310.
      run: |
        echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT
      shell: bash
    - name: Build Python wheel
      run: |
        # Build wheel with specific Python version
        PYTHON_VERSION=${{ matrix.python-version }} OUTPUT_DIR=dist-py${{ steps.generate_tag_build.outputs.python_version_tag }} ./scripts/build_wheel.sh
      shell: bash
test-wheel-ubuntu:
  # Installs the wheel uploaded by the build-flags job on each
  # Ubuntu x Python combination and runs the script-based test suites.
  needs: build-flags
  strategy:
    matrix:
      ubuntu-version: [ubuntu-22.04, ubuntu-24.04]
      python-version: ['3.10', '3.12']
  runs-on: ${{ matrix.ubuntu-version }}
  steps:
    - uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Generate Python version tag
      id: generate_tag_test
      # Strips the dot from the matrix version, e.g. 3.10 -> 310, to match the
      # artifact name used at upload time.
      run: |
        echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT
      shell: bash
    - name: Download wheel artifact
      uses: actions/download-artifact@v4
      with:
        name: mooncake-wheel-ubuntu-py${{ steps.generate_tag_test.outputs.python_version_tag }}
        path: mooncake-wheel/dist
    - name: Verify wheel file exists
      run: |
        ls -la mooncake-wheel/dist/
        # A glob inside `[ -f ... ]` makes `test` error out ("too many
        # arguments") when more than one wheel matches, which would let this
        # check pass by accident. Expand the glob into an array and count it.
        shopt -s nullglob
        wheels=(mooncake-wheel/dist/*.whl)
        if [ "${#wheels[@]}" -eq 0 ]; then
          echo "ERROR: No wheel file found in mooncake-wheel/dist/"
          exit 1
        fi
      shell: bash
    - name: Run installation test script
      run: |
        bash scripts/test_installation.sh
      shell: bash
    - name: Start metadata server
      run: |
        source test_env/bin/activate
        mooncake_http_metadata_server --port 8080 &
      shell: bash
    - name: Run tests with ssd
      run: |
        source test_env/bin/activate
        MC_STORE_MEMCPY=false TEST_SSD_OFFLOAD_IN_EVICT=true ./scripts/run_tests.sh
        deactivate
      shell: bash
    - name: Start Mooncake Master
      run: |
        source test_env/bin/activate
        mkdir -p /tmp/mooncake_storage
        mooncake_master \
          --eviction_high_watermark_ratio=0.95 \
          --cluster_id=ci_test_cluster \
          --port 50051 &
        # Give the master a moment to start before the next step connects.
        sleep 3
      shell: bash
    - name: Run Python Tensor API Performance Test (CI check)
      env:
        MOONCAKE_MASTER: "127.0.0.1:50051"
        MOONCAKE_TE_META_DATA_SERVER: "http://127.0.0.1:8080/metadata"
        MOONCAKE_PROTOCOL: "tcp"
        LOCAL_HOSTNAME: "127.0.0.1"
      run: |
        source test_env/bin/activate
        python scripts/test_tensor_api.py -n 1
      shell: bash
test-sglang-integration:
  # Creates a job on the external T-one service (tone.openanolis.cn) that is
  # handed the id of this run's py312 artifact, then polls that job until it
  # passes, fails, or the poll budget runs out. Both steps are skipped when
  # the TONE_USER_NAME secret is empty.
  needs: build-flags
  runs-on: ubuntu-latest
  env:
    tone_user_name: ${{ secrets.TONE_USER_NAME }}
  steps:
    - name: trigger T-one test
      if: ${{ env.tone_user_name != '' }}
      # Secrets are passed through the environment rather than interpolated
      # into the script text, so quotes or shell metacharacters in a secret
      # cannot break or alter the generated script.
      env:
        TONE_USER_NAME: ${{ secrets.TONE_USER_NAME }}
        TONE_USER_TOKEN: ${{ secrets.TONE_USER_TOKEN }}
      run: |
        curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/artifacts > artifact.json
        cat artifact.json
        artifact_id=$(jq -r ".artifacts[] | select(.name | contains(\"py312\") ) | .id" artifact.json)
        # signature = base64("<user>|<token>|<unix timestamp>")
        signature="$(python3 -c 'import base64, os, time; raw = "|".join((os.environ["TONE_USER_NAME"], os.environ["TONE_USER_TOKEN"], str(time.time()))); print(base64.b64encode(raw.encode("utf-8")).decode("utf-8"))')"
        curl -s -H 'Content-Type: application/json' -X POST -d "{\"workspace\":\"mooncake_test\",\"project\":\"mooncake-ci\",\"template\":\"mooncake-ci-test\",\"name\":\"mooncake-ci-${{ github.sha }}\",\"username\":\"${TONE_USER_NAME}\",\"env_ifs\":\" \",\"env_info\":\"ARTIFACT_ID=${artifact_id} GIT_REPO=${{ github.repository }}\",\"signature\":\"$signature\"}" https://tone.openanolis.cn/api/job/create/ > job.json
        if [ "$(jq .code job.json)" == 200 ]; then
          echo "job created"
        else
          echo "job create failed"
          exit 1
        fi
        job_id=$(jq .data.id job.json)
        echo "check job status here and remember to cancel it before restart the job !"
        echo "job_url: https://tone.openanolis.cn/ws/gclfnh19/test_result/${job_id}"
        # Export the id so the polling step below can read it.
        echo "job_id=${job_id}" >> $GITHUB_ENV
      shell: bash
    - name: query job results
      if: ${{ env.tone_user_name != '' }}
      env:
        TONE_USER_NAME: ${{ secrets.TONE_USER_NAME }}
        TONE_USER_TOKEN: ${{ secrets.TONE_USER_TOKEN }}
      run: |
        # Up to 721 polls with a 10 s pause between them (about 2 hours).
        polls=0
        while true; do
          if [ "$polls" -gt 720 ]; then
            echo "timeout"
            exit 1
          fi
          # The signature embeds a timestamp, so rebuild it for every request.
          signature="$(python3 -c 'import base64, os, time; raw = "|".join((os.environ["TONE_USER_NAME"], os.environ["TONE_USER_TOKEN"], str(time.time()))); print(base64.b64encode(raw.encode("utf-8")).decode("utf-8"))')"
          curl -s -H 'Content-Type: application/json' -X POST -d "{\"username\":\"${TONE_USER_NAME}\", \"signature\":\"$signature\", \"job_id\": \"${job_id}\"}" https://tone.openanolis.cn/api/job/query/ > job_status.json
          if ! [ "$(jq .code job_status.json)" == 200 ]; then
            echo "job query failed"
            exit 1
          fi
          job_status=$(jq .data.job_second_state job_status.json)
          if [[ $job_status =~ "pass" ]]; then
            echo "job successful !"
            exit 0
          elif [[ $job_status =~ "fail" ]]; then
            echo "job failed or stopped !"
            exit 1
          fi
          polls=$(( polls + 1 ))
          sleep 10
        done
      shell: bash
build-flags:
  # Exercises several CMake flag combinations in sequence (transfer engine
  # only, everything ON, unit tests + examples, then tests/examples OFF),
  # then builds the Python wheel and uploads it as the
  # `mooncake-wheel-ubuntu-py<tag>` artifact.
  #
  # Gate: pushes, newly opened PRs, or PRs carrying the 'run-ci' label.
  # `github.event.action` and the PR label list are both empty on push events,
  # so without the explicit push clause this job would be skipped on every
  # push to main even though the workflow declares a push trigger.
  if: >-
    github.event_name == 'push' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci')
  runs-on: ubuntu-22.04
  strategy:
    matrix:
      python-version: ['3.10', '3.12']
  env:
    BUILD_WITH_EP: "1"
    SCCACHE_GHA_ENABLED: "true"
  steps:
    - uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Free up disk space
      run: |
        sudo rm -rf /usr/share/dotnet
        sudo rm -rf /opt/ghc
        sudo rm -rf /opt/hostedtoolcache/CodeQL
        sudo rm -rf /usr/local/lib/android
        df -h
    - name: Install CUDA Toolkit
      # NOTE(review): this ref was mangled to "[email protected]" in the source;
      # the action name is restored, the version tag is a best guess - confirm.
      uses: Jimver/cuda-toolkit@v0.2.24
      with:
        cuda: '12.8.1'
        linux-local-args: '["--toolkit"]'
        method: 'network'
        sub-packages: '["nvcc", "nvrtc-dev"]'
        non-cuda-sub-packages: '["libcusparse-dev", "libcublas-dev", "libcusolver-dev"]'
    - name: Run sccache-cache
      # NOTE(review): ref was mangled in the source; version tag is a best
      # guess - confirm.
      uses: mozilla-actions/sccache-action@v0.0.9
    - name: Configure sccache
      # Re-exports the runner's cache endpoint and token so later `run` steps
      # can see them as environment variables.
      uses: actions/github-script@v7
      with:
        script: |
          core.exportVariable('ACTIONS_RESULTS_URL', process.env.ACTIONS_RESULTS_URL || '');
          core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
    - name: Run sccache stat for check
      shell: bash
      run: ${SCCACHE_PATH} --show-stats
    - name: Install dependencies
      run: |
        sudo apt update -y
        sudo bash -x dependencies.sh -y
        pip install torch==2.8.0
        df -h
      shell: bash
    - name: Build transfer engine only
      run: |
        cd mooncake-transfer-engine
        mkdir build
        cd build
        export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
        export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
        cmake .. -DUSE_ETCD=OFF -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=OFF -DUSE_MNNVL=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
        make -j4
        sudo make install
        df -h
      shell: bash
    - name: Configure project with all settings are ON
      run: |
        mkdir build
        cd build
        cmake .. -DUSE_ETCD=ON -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_STORE=ON -DWITH_P2P_STORE=ON -DWITH_EP=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=OFF -DUSE_MNNVL=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
      shell: bash
    # TODO: lack USE_NVMEOF,USE_CUDA,USE_MNNVL
    - name: Build project with all settings are ON
      run: |
        cd build
        make -j4
        sudo make install
        df -h
      shell: bash
    - name: Configure project with unit tests and examples
      run: |
        cd build
        cmake .. -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON
      shell: bash
    # TODO: lack WITH_RUST_EXAMPLE
    - name: Build project with unit tests and examples
      run: |
        cd build
        make -j4
        sudo make install
      shell: bash
    - name: Configure project
      run: |
        cd build
        # Drop the test directories generated by the earlier configurations
        # before reconfiguring with unit tests switched off.
        rm -r */tests
        cmake .. -DBUILD_UNIT_TESTS=OFF -DBUILD_EXAMPLES=OFF -DUSE_HTTP=ON -DENABLE_SCCACHE=ON
      shell: bash
    - name: Build project
      run: |
        cd build
        make -j4
        sudo make install
      shell: bash
    - name: Build nvlink_allocator.so
      run: |
        mkdir -p build/mooncake-transfer-engine/nvlink-allocator
        cd mooncake-transfer-engine/nvlink-allocator
        export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
        export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
        export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH
        bash build.sh ../../build/mooncake-transfer-engine/nvlink-allocator/
      shell: bash
    - name: Generate Python version tag
      id: generate_tag_flags
      # Strips the dot from the matrix version, e.g. 3.10 -> 310.
      run: |
        echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT
      shell: bash
    - name: Build Python wheel
      run: |
        # Build wheel with specific Python version
        PYTHON_VERSION=${{ matrix.python-version }} OUTPUT_DIR=dist-py${{ steps.generate_tag_flags.outputs.python_version_tag }} ./scripts/build_wheel.sh
      shell: bash
    - name: Upload Python wheel artifact
      uses: actions/upload-artifact@v4
      with:
        name: mooncake-wheel-ubuntu-py${{ steps.generate_tag_flags.outputs.python_version_tag }}
        path: mooncake-wheel/dist-py${{ steps.generate_tag_flags.outputs.python_version_tag }}/*.whl
build-docker:
  # Verifies that the repository's Dockerfile still builds; the image is only
  # tagged locally and is not pushed.
  name: Build Docker Image
  # Gate: pushes, newly opened PRs, or PRs carrying the 'run-ci' label.
  # `github.event.action` and the PR label list are both empty on push events,
  # so without the explicit push clause this job would be skipped on every
  # push to main even though the workflow declares a push trigger.
  if: >-
    github.event_name == 'push' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci')
  runs-on: ubuntu-22.04
  steps:
    - uses: actions/checkout@v4
    - name: Set up Docker Buildx
      # Bumped from v2 to the current major, v3.
      uses: docker/setup-buildx-action@v3
    - name: Build Docker image
      run: docker build -t mooncake-app .
spell-check:
  # Runs the `typos` spell checker over the checked-out repository.
  name: Spell Check with Typos
  # Gate: pushes, newly opened PRs, or PRs carrying the 'run-ci' label.
  # `github.event.action` and the PR label list are both empty on push events,
  # so without the explicit push clause this job would be skipped on every
  # push to main even though the workflow declares a push trigger.
  if: >-
    github.event_name == 'push' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci')
  runs-on: ubuntu-22.04
  steps:
    - name: Checkout Actions Repository
      uses: actions/checkout@v4
    - name: Spell Check Repo
      # NOTE(review): this ref was mangled to "[email protected]" in the source;
      # the action name is restored, the version tag is a best guess - confirm.
      uses: crate-ci/typos@v1.30.2
clang-format:
  # Reformats every *.h / *.cpp file in place with clang-format-20 and fails
  # if that produces any diff, i.e. if the tree is not already formatted.
  name: Check code format
  # Gate: pushes, newly opened PRs, or PRs carrying the 'run-ci' label.
  # `github.event.action` and the PR label list are both empty on push events,
  # so without the explicit push clause this job would be skipped on every
  # push to main even though the workflow declares a push trigger.
  if: >-
    github.event_name == 'push' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci')
  runs-on: ubuntu-22.04
  steps:
    - name: Checkout Actions Repository
      uses: actions/checkout@v4
    - name: Install clang-format 20
      run: |
        wget https://apt.llvm.org/llvm.sh
        chmod +x llvm.sh
        sudo ./llvm.sh 20
        sudo apt-get install -y clang-format-20
    - name: run clang-format-20
      run: |
        # the old clang-format-14 which is the default version in ubuntu 22.04,
        # is inconsistent with clang-format-20.
        ls -lh /usr/bin/clang-format*
        clang-format --version
        clang-format-20 --version
        # skip cachelib_memory_allocator
        # NUL-delimited hand-off keeps paths containing whitespace intact, and
        # `-r` avoids invoking clang-format with no files at all.
        find . -type f \( -name "*.h" -o -name "*.cpp" \) -not -path "*cachelib_memory_allocator*" -print0 | xargs -0 -r clang-format-20 -style=file -i
        if ! git diff --exit-code; then
          echo "Please follow the .clang-format code style, try clang-format -i FILENAME"
          exit 1
        fi
      shell: bash