
Commit 00b1aaf
Merge branch 'intel:main' into integrate_dpo
2 parents: 8d4e480 + 320922f

23 files changed (+114, -80 lines)

.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml
Lines changed: 2 additions & 2 deletions

@@ -14,7 +14,7 @@ ipex:
   enabled: false
   precision: bf16
 model_description:
-  model_id_or_path: meta-llama/Llama-2-7b-chat-hf
-  tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf
+  model_id_or_path: NousResearch/Llama-2-7b-chat-hf
+  tokenizer_name_or_path: NousResearch/Llama-2-7b-chat-hf
 config:
   use_auth_token: ''
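Since the NousResearch mirror is not gated, this CI config can be served without a Hugging Face token. A minimal sketch, assuming the CI config path is also usable for a local run (the flags themselves appear elsewhere in this commit):

```bash
# Sketch only: serve the CI vLLM FP32 config locally.
# No `huggingface-cli login` is needed for the ungated NousResearch checkpoint.
llm_on_ray-serve --config_file .github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml --keep_serve_terminal
```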

.github/workflows/workflow_finetune.yml
Lines changed: 5 additions & 8 deletions

@@ -11,10 +11,10 @@ on:
         default: '10.1.2.13:5000/llmray-build'
       http_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       https_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       runner_config_path:
         type: string
         default: '/home/ci/llm-ray-actions-runner'
@@ -34,15 +34,15 @@ jobs:
     name: finetune
     strategy:
       matrix:
-        model: [ EleutherAI/gpt-j-6b, meta-llama/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, meta-llama/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1, google/gemma-2b]
+        model: [ EleutherAI/gpt-j-6b, NousResearch/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, NousResearch/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1, google/gemma-2b]
         isPR:
           - ${{inputs.ci_type == 'pr'}}

         exclude:
           - { isPR: true }
         include:
           - { model: "EleutherAI/gpt-j-6b"}
-          - { model: "meta-llama/Llama-2-7b-chat-hf"}
+          - { model: "NousResearch/Llama-2-7b-chat-hf"}
           - { model: "mistralai/Mistral-7B-v0.1"}
           - { model: "google/gemma-2b"}

@@ -65,9 +65,6 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4

-      - name: Load environment variables
-        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV
-
       - name: Build Docker Image
         run: |
           DF_SUFFIX=".cpu_and_deepspeed"
@@ -83,7 +80,7 @@ jobs:
           model_cache_path=${{ inputs.model_cache_path }}
           USE_PROXY="1"
           source dev/scripts/ci-functions.sh
-          start_docker ${TARGET} ${code_checkout_path} ${model_cache_path} ${USE_PROXY} ${{env.HF_ACCESS_TOKEN}}
+          start_docker ${TARGET} ${code_checkout_path} ${model_cache_path} ${USE_PROXY}

       - name: Run Finetune Test
         run: |

.github/workflows/workflow_finetune_gpu.yml
Lines changed: 3 additions & 3 deletions

@@ -8,17 +8,17 @@ on:
         default: '10.1.2.13:5000/llmray-build'
       http_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       https_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'

 jobs:
   finetune-gpu:
     name: finetune-gpu
     strategy:
       matrix:
-        model: [ meta-llama/Llama-2-7b-chat-hf ]
+        model: [ NousResearch/Llama-2-7b-chat-hf ]
     runs-on: self-hosted

     defaults:

.github/workflows/workflow_inference.yml
Lines changed: 3 additions & 6 deletions

@@ -11,10 +11,10 @@ on:
         default: '10.1.2.13:5000/llmray-build'
       http_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       https_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       runner_config_path:
         type: string
         default: '/home/ci/llm-ray-actions-runner'
@@ -67,9 +67,6 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4

-      - name: Load environment variables
-        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV
-
       - name: Determine Target
         id: "target"
         run: |
@@ -94,7 +91,7 @@ jobs:
           model_cache_path=${{ inputs.model_cache_path }}
           USE_PROXY="1"
           source dev/scripts/ci-functions.sh
-          start_docker ${TARGET} ${code_checkout_path} ${model_cache_path} ${USE_PROXY} ${{env.HF_ACCESS_TOKEN}}
+          start_docker ${TARGET} ${code_checkout_path} ${model_cache_path} ${USE_PROXY}

       - name: Start Ray Cluster
         run: |

.github/workflows/workflow_inference_gaudi2.yml
Lines changed: 0 additions & 6 deletions

@@ -73,9 +73,6 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4

-      - name: Load environment variables
-        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV
-
       - name: Build Docker Image
         run: |
           DF_SUFFIX=".gaudi2"
@@ -98,7 +95,6 @@
           cid=$(docker ps -a -q --filter "name=${TARGET}")
           if [[ ! -z "$cid" ]]; then docker rm $cid; fi
           docker run -tid --name="${TARGET}" --hostname="${TARGET}-container" --runtime=habana -v /home/yizhong/Model-References:/root/Model-References -v ${{ inputs.code_checkout_path }}:/root/llm-on-ray -v ${{ inputs.model_cache_path }}:/root/.cache/huggingface/hub/ -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --cap-add sys_ptrace --net=host --ipc=host ${TARGET}:habana
-
       - name: Start Ray Cluster
         run: |
           TARGET=${{steps.target.outputs.target}}
@@ -117,7 +113,6 @@
           conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml"
           with open(conf_path, encoding="utf-8") as reader:
               result = yaml.load(reader, Loader=yaml.FullLoader)
-              result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
           with open(conf_path, 'w') as output:
               yaml.dump(result, output, sort_keys=False)
           EOF
@@ -128,7 +123,6 @@
           elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then
             docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml --keep_serve_terminal"
           elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
-            docker exec "${TARGET}" bash -c "huggingface-cli login --token ${{ env.HF_ACCESS_TOKEN }}"
             docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml --keep_serve_terminal"
           fi
           echo Streaming query:

.github/workflows/workflow_test_benchmark.yml
Lines changed: 2 additions & 2 deletions

@@ -11,10 +11,10 @@ on:
         default: '10.1.2.13:5000/llmray-build'
       http_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       https_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       runner_config_path:
         type: string
         default: '/home/ci/llm-ray-actions-runner'

README.md
Lines changed: 8 additions & 1 deletion

@@ -71,7 +71,14 @@ Deploy a model on Ray and expose an endpoint for serving. This command uses GPT2
 ```bash
 llm_on_ray-serve --config_file llm_on_ray/inference/models/gpt2.yaml
 ```
-
+You can also use model_ids to serve directly through:
+```bash
+llm_on_ray-serve --models gpt2
+```
+List all support model_ids with config file path:
+```bash
+llm_on_ray-serve --list_model_ids
+```
 The default served method is to provide an OpenAI-compatible API server ([OpenAI API Reference](https://platform.openai.com/docs/api-reference/chat)), you can access and test it in many ways:
 ```bash
 # using curl
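Tying the new flags together, here is a hedged end-to-end sketch. The host, port, and endpoint path are assumptions based on the OpenAI-compatible server the README mentions; they are not confirmed by this diff:

```bash
# List the supported model_ids and their config file paths.
llm_on_ray-serve --list_model_ids

# Serve one of them directly by model_id.
llm_on_ray-serve --models gpt2

# Query the OpenAI-compatible endpoint (assumed default: localhost:8000).
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt2", "messages": [{"role": "user", "content": "Hello!"}]}'
```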

benchmarks/run_benchmark.sh
Lines changed: 1 addition & 1 deletion

@@ -229,4 +229,4 @@ then
 fi
 output_tokens_length=32
 get_best_latency $iter "${input_tokens_length[*]}" $output_tokens_length $benchmark_dir
-fi
\ No newline at end of file
+fi

dev/scripts/ci-functions.sh
Lines changed: 3 additions & 9 deletions

@@ -1,8 +1,8 @@
 #!/usr/bin/env bash
 set -eo pipefail

-HTTP_PROXY='http://10.24.221.169:911'
-HTTPS_PROXY='http://10.24.221.169:911'
+HTTP_PROXY='http://10.24.221.169:912'
+HTTPS_PROXY='http://10.24.221.169:912'
 MODEL_CACHE_PATH_LOACL='/root/.cache/huggingface/hub'
 CODE_CHECKOUT_PATH_LOCAL='/root/llm-on-ray'

@@ -39,7 +39,6 @@ start_docker() {
     local code_checkout_path=$2
     local model_cache_path=$3
     local USE_PROXY=$4
-    local HF_TOKEN=$5

     cid=$(docker ps -q --filter "name=${TARGET}")
     if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
@@ -66,12 +65,7 @@ start_docker() {
     fi

     echo "docker run -tid "${docker_args[@]}" "${TARGET}:latest""
-    docker run -tid "${docker_args[@]}" "${TARGET}:latest"
-    if [ -z "$HF_TOKEN" ]; then
-        echo "no hf token"
-    else
-        docker exec "${TARGET}" bash -c "huggingface-cli login --token ${HF_TOKEN}"
-    fi
+    docker run -tid "${docker_args[@]}" "${TARGET}:latest"
 }

 install_dependencies(){
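With the fifth HF_TOKEN argument removed from start_docker, the workflow call sites simplify accordingly. A minimal sketch of the updated invocation, mirroring the workflow diffs above:

```bash
# Updated call site (as in the workflow changes above): no HF token argument,
# since the ungated NousResearch checkpoints no longer need a
# `huggingface-cli login` inside the container.
source dev/scripts/ci-functions.sh
start_docker ${TARGET} ${code_checkout_path} ${model_cache_path} ${USE_PROXY}
```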

llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml
Lines changed: 2 additions & 2 deletions

@@ -8,7 +8,7 @@ deepspeed: true
 workers_per_group: 8
 device: hpu
 model_description:
-  model_id_or_path: meta-llama/Llama-2-70b-chat-hf
-  tokenizer_name_or_path: meta-llama/Llama-2-70b-chat-hf
+  model_id_or_path: NousResearch/Llama-2-70b-chat-hf
+  tokenizer_name_or_path: NousResearch/Llama-2-70b-chat-hf
 config:
   use_auth_token: ''
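With the ungated mirror and an empty use_auth_token, this config can be served as-is; the command below mirrors the one used in the Gaudi2 workflow above:

```bash
# Serve the 70B chat config on HPU; no Hugging Face login is required
# because the NousResearch checkpoint is not gated (hence the empty
# use_auth_token in the config).
llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml --keep_serve_terminal
```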
