
Commit 00b1aaf
Merge branch 'intel:main' into integrate_dpo
2 parents: 8d4e480 + 320922f

23 files changed (+114, -80 lines)

.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml
Lines changed: 2 additions & 2 deletions

@@ -14,7 +14,7 @@ ipex:
   enabled: false
   precision: bf16
 model_description:
-  model_id_or_path: meta-llama/Llama-2-7b-chat-hf
-  tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf
+  model_id_or_path: NousResearch/Llama-2-7b-chat-hf
+  tokenizer_name_or_path: NousResearch/Llama-2-7b-chat-hf
 config:
   use_auth_token: ''
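Since the NousResearch mirror is not gated, this CI config can be served without a Hugging Face token. A minimal sketch, assuming the CI config path is also usable for a local run (the flags themselves appear elsewhere in this commit):

```bash
# Sketch only: serve the CI vLLM FP32 config locally.
# No `huggingface-cli login` is needed for the ungated NousResearch checkpoint.
llm_on_ray-serve --config_file .github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml --keep_serve_terminal
```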

.github/workflows/workflow_finetune.yml
Lines changed: 5 additions & 8 deletions

@@ -11,10 +11,10 @@ on:
         default: '10.1.2.13:5000/llmray-build'
       http_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       https_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       runner_config_path:
         type: string
         default: '/home/ci/llm-ray-actions-runner'
@@ -34,15 +34,15 @@ jobs:
     name: finetune
     strategy:
       matrix:
-        model: [ EleutherAI/gpt-j-6b, meta-llama/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, meta-llama/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1, google/gemma-2b]
+        model: [ EleutherAI/gpt-j-6b, NousResearch/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, NousResearch/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1, google/gemma-2b]
         isPR:
           - ${{inputs.ci_type == 'pr'}}

         exclude:
           - { isPR: true }
         include:
           - { model: "EleutherAI/gpt-j-6b"}
-          - { model: "meta-llama/Llama-2-7b-chat-hf"}
+          - { model: "NousResearch/Llama-2-7b-chat-hf"}
           - { model: "mistralai/Mistral-7B-v0.1"}
           - { model: "google/gemma-2b"}

@@ -65,9 +65,6 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4

-      - name: Load environment variables
-        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV
-
       - name: Build Docker Image
         run: |
           DF_SUFFIX=".cpu_and_deepspeed"
@@ -83,7 +80,7 @@ jobs:
           model_cache_path=${{ inputs.model_cache_path }}
           USE_PROXY="1"
           source dev/scripts/ci-functions.sh
-          start_docker ${TARGET} ${code_checkout_path} ${model_cache_path} ${USE_PROXY} ${{env.HF_ACCESS_TOKEN}}
+          start_docker ${TARGET} ${code_checkout_path} ${model_cache_path} ${USE_PROXY}

       - name: Run Finetune Test
         run: |

.github/workflows/workflow_finetune_gpu.yml
Lines changed: 3 additions & 3 deletions

@@ -8,17 +8,17 @@ on:
         default: '10.1.2.13:5000/llmray-build'
       http_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       https_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'

 jobs:
   finetune-gpu:
     name: finetune-gpu
     strategy:
       matrix:
-        model: [ meta-llama/Llama-2-7b-chat-hf ]
+        model: [ NousResearch/Llama-2-7b-chat-hf ]
     runs-on: self-hosted

     defaults:

.github/workflows/workflow_inference.yml
Lines changed: 3 additions & 6 deletions

@@ -11,10 +11,10 @@ on:
         default: '10.1.2.13:5000/llmray-build'
       http_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       https_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       runner_config_path:
         type: string
         default: '/home/ci/llm-ray-actions-runner'
@@ -67,9 +67,6 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4

-      - name: Load environment variables
-        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV
-
       - name: Determine Target
         id: "target"
         run: |
@@ -94,7 +91,7 @@ jobs:
           model_cache_path=${{ inputs.model_cache_path }}
           USE_PROXY="1"
           source dev/scripts/ci-functions.sh
-          start_docker ${TARGET} ${code_checkout_path} ${model_cache_path} ${USE_PROXY} ${{env.HF_ACCESS_TOKEN}}
+          start_docker ${TARGET} ${code_checkout_path} ${model_cache_path} ${USE_PROXY}

       - name: Start Ray Cluster
         run: |

.github/workflows/workflow_inference_gaudi2.yml
Lines changed: 0 additions & 6 deletions

@@ -73,9 +73,6 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4

-      - name: Load environment variables
-        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV
-
       - name: Build Docker Image
         run: |
           DF_SUFFIX=".gaudi2"
@@ -98,7 +95,6 @@
           cid=$(docker ps -a -q --filter "name=${TARGET}")
           if [[ ! -z "$cid" ]]; then docker rm $cid; fi
           docker run -tid --name="${TARGET}" --hostname="${TARGET}-container" --runtime=habana -v /home/yizhong/Model-References:/root/Model-References -v ${{ inputs.code_checkout_path }}:/root/llm-on-ray -v ${{ inputs.model_cache_path }}:/root/.cache/huggingface/hub/ -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --cap-add sys_ptrace --net=host --ipc=host ${TARGET}:habana
-
       - name: Start Ray Cluster
         run: |
           TARGET=${{steps.target.outputs.target}}
@@ -117,7 +113,6 @@
           conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml"
           with open(conf_path, encoding="utf-8") as reader:
               result = yaml.load(reader, Loader=yaml.FullLoader)
-              result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
           with open(conf_path, 'w') as output:
               yaml.dump(result, output, sort_keys=False)
           EOF
@@ -128,7 +123,6 @@
           elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then
             docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml --keep_serve_terminal"
           elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
-            docker exec "${TARGET}" bash -c "huggingface-cli login --token ${{ env.HF_ACCESS_TOKEN }}"
             docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml --keep_serve_terminal"
           fi
           echo Streaming query:

.github/workflows/workflow_test_benchmark.yml
Lines changed: 2 additions & 2 deletions

@@ -11,10 +11,10 @@ on:
         default: '10.1.2.13:5000/llmray-build'
       http_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       https_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       runner_config_path:
         type: string
         default: '/home/ci/llm-ray-actions-runner'

README.md
Lines changed: 8 additions & 1 deletion

@@ -71,7 +71,14 @@ Deploy a model on Ray and expose an endpoint for serving. This command uses GPT2
 ```bash
 llm_on_ray-serve --config_file llm_on_ray/inference/models/gpt2.yaml
 ```
-
+You can also use model_ids to serve directly through:
+```bash
+llm_on_ray-serve --models gpt2
+```
+List all support model_ids with config file path:
+```bash
+llm_on_ray-serve --list_model_ids
+```
 The default served method is to provide an OpenAI-compatible API server ([OpenAI API Reference](https://platform.openai.com/docs/api-reference/chat)), you can access and test it in many ways:
 ```bash
 # using curl
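Tying the new flags together, here is a hedged end-to-end sketch. The host, port, and endpoint path are assumptions based on the OpenAI-compatible server the README mentions; they are not confirmed by this diff:

```bash
# List the supported model_ids and their config file paths.
llm_on_ray-serve --list_model_ids

# Serve one of them directly by model_id.
llm_on_ray-serve --models gpt2

# Query the OpenAI-compatible endpoint (assumed default: localhost:8000).
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt2", "messages": [{"role": "user", "content": "Hello!"}]}'
```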

benchmarks/run_benchmark.sh
Lines changed: 1 addition & 1 deletion

@@ -229,4 +229,4 @@ then
 fi
 output_tokens_length=32
 get_best_latency $iter "${input_tokens_length[*]}" $output_tokens_length $benchmark_dir
-fi
\ No newline at end of file
+fi

dev/scripts/ci-functions.sh
Lines changed: 3 additions & 9 deletions

@@ -1,8 +1,8 @@
 #!/usr/bin/env bash
 set -eo pipefail

-HTTP_PROXY='http://10.24.221.169:911'
-HTTPS_PROXY='http://10.24.221.169:911'
+HTTP_PROXY='http://10.24.221.169:912'
+HTTPS_PROXY='http://10.24.221.169:912'
 MODEL_CACHE_PATH_LOACL='/root/.cache/huggingface/hub'
 CODE_CHECKOUT_PATH_LOCAL='/root/llm-on-ray'

@@ -39,7 +39,6 @@ start_docker() {
     local code_checkout_path=$2
     local model_cache_path=$3
     local USE_PROXY=$4
-    local HF_TOKEN=$5

     cid=$(docker ps -q --filter "name=${TARGET}")
     if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
@@ -66,12 +65,7 @@ start_docker() {
     fi

     echo "docker run -tid "${docker_args[@]}" "${TARGET}:latest""
-    docker run -tid "${docker_args[@]}" "${TARGET}:latest"
-    if [ -z "$HF_TOKEN" ]; then
-        echo "no hf token"
-    else
-        docker exec "${TARGET}" bash -c "huggingface-cli login --token ${HF_TOKEN}"
-    fi
+    docker run -tid "${docker_args[@]}" "${TARGET}:latest"
 }

 install_dependencies(){
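With the fifth HF_TOKEN argument removed from start_docker, the workflow call sites simplify accordingly. A minimal sketch of the updated invocation, mirroring the workflow diffs above:

```bash
# Updated call site (as in the workflow changes above): no HF token argument,
# since the ungated NousResearch checkpoints no longer need a
# `huggingface-cli login` inside the container.
source dev/scripts/ci-functions.sh
start_docker ${TARGET} ${code_checkout_path} ${model_cache_path} ${USE_PROXY}
```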

llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml
Lines changed: 2 additions & 2 deletions

@@ -8,7 +8,7 @@ deepspeed: true
 workers_per_group: 8
 device: hpu
 model_description:
-  model_id_or_path: meta-llama/Llama-2-70b-chat-hf
-  tokenizer_name_or_path: meta-llama/Llama-2-70b-chat-hf
+  model_id_or_path: NousResearch/Llama-2-70b-chat-hf
+  tokenizer_name_or_path: NousResearch/Llama-2-70b-chat-hf
 config:
   use_auth_token: ''
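With the ungated mirror and an empty use_auth_token, this config can be served as-is; the command below mirrors the one used in the Gaudi2 workflow above:

```bash
# Serve the 70B chat config on HPU; no Hugging Face login is required
# because the NousResearch checkpoint is not gated (hence the empty
# use_auth_token in the config).
llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml --keep_serve_terminal
```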
