
Commit 6d33b49

Authored by xuechendi, jiafuzha, and carsonwang

Agent tool support (#134)

* add test files for openai_tools_agent
* complete to add for tool
* Delete my_app directory to bring ci back
* Add http based test for agent tool
* Update llm_on_ray/inference/api_openai_backend/router_app.py
* remove ref app
* update UT

Signed-off-by: Xue, Chendi <[email protected]>
Signed-off-by: jiafu zhang <[email protected]>
Signed-off-by: Chendi.Xue <[email protected]>
Co-authored-by: jiafu zhang <[email protected]>
Co-authored-by: Carson Wang <[email protected]>

1 parent aa2d08e commit 6d33b49

File tree

14 files changed (+795, -41 lines)

.github/workflows/workflow_inference.yml

Lines changed: 8 additions & 0 deletions
@@ -189,6 +189,14 @@ jobs:
           docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_name ${{ matrix.model }}"
         fi
 
+    - name: Run Agent tool Inference Test with REST API
+      run: |
+        TARGET=${{steps.target.outputs.target}}
+        if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf" ]]; then
+          docker exec "${TARGET}" bash -c "llm_on_ray-serve --models ${{ matrix.model }}"
+          docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests_tool.py --model_name ${{ matrix.model }}"
+        fi
+
     - name: Stop Ray
       run: |
         TARGET=${{steps.target.outputs.target}}

MANIFEST.in

Lines changed: 1 addition & 0 deletions
@@ -1,2 +1,3 @@
 # with [tools.setuptools] in pyproject.toml, the configs below work in both baremetal and container
 include inference/**/*.yaml
+include inference/**/*.jinja
Lines changed: 129 additions & 0 deletions

#
# Copyright 2023 The LLM-on-Ray Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import os
from typing import Optional, Type

from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain.callbacks import StreamingStdOutCallbackHandler, StdOutCallbackHandler
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain.tools import BaseTool
from langchain import hub

parser = argparse.ArgumentParser(
    description="Example script to enable a LangChain agent", add_help=True
)
parser.add_argument(
    "--model_name",
    default="mistral-7b-instruct-v0.2",
    type=str,
    help="The name of the model to request",
)
parser.add_argument(
    "--streaming_response",
    default=False,
    action="store_true",
    help="Whether to enable streaming response",
)
parser.add_argument(
    "--prompt_template",
    default="hwchase17/openai-tools-agent",
    type=str,
    help="Prompt template for the OpenAI tools agent",
)
parser.add_argument(
    "--max_tokens",
    default=512,
    type=int,
    help="Max number of tokens to generate in this example",
)

args = parser.parse_args()

if "OPENAI_API_KEY" in os.environ:
    openai_api_key = os.environ["OPENAI_API_KEY"]
else:
    openai_api_key = "not_needed"

if "OPENAI_BASE_URL" in os.environ:
    openai_base_url = os.environ["OPENAI_BASE_URL"]
elif openai_api_key == "not_needed":
    openai_base_url = "http://localhost:8000/v1"
else:
    openai_base_url = "https://api.openai.com/v1"

# ================================================ #
# Let's define a function/tool for getting the weather. In this demo we mock the output.
# In real life, you would call a library/API such as PyOWM (OpenWeatherMap).
# Depending on your app's functionality, you may also call vendor/external or internal custom APIs.


def get_current_weather(location, unit):
    # Call an external API to get relevant information (like serpapi, etc.).
    # Here, for the demo, we return a mock response.
    weather_info = {
        "location": location,
        "temperature": "78",
        "unit": unit,
        "forecast": ["sunny", "with a chance of rain"],
    }
    return weather_info


class GetCurrentWeatherCheckInput(BaseModel):
    # Input schema for get_current_weather
    location: str = Field(
        ..., description="The name of the location for which we need to find the weather"
    )
    unit: str = Field(..., description="The unit for the temperature value")


class GetCurrentWeatherTool(BaseTool):
    name = "get_current_weather"
    description = "Used to find the weather for a given location in said unit"
    args_schema: Optional[Type[BaseModel]] = GetCurrentWeatherCheckInput

    def _run(self, location: str, unit: str):
        return get_current_weather(location, unit)

    def _arun(self, location: str, unit: str):
        raise NotImplementedError("This tool does not support async")


# ================================================ #

tools = [GetCurrentWeatherTool()]
prompt = hub.pull(args.prompt_template)
llm = ChatOpenAI(
    openai_api_base=openai_base_url,
    model_name=args.model_name,
    openai_api_key=openai_api_key,
    max_tokens=args.max_tokens,
    callbacks=[
        StreamingStdOutCallbackHandler() if args.streaming_response else StdOutCallbackHandler()
    ],
    streaming=args.streaming_response,
)
agent = create_openai_tools_agent(tools=tools, llm=llm, prompt=prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
agent_executor.invoke({"input": "what is the weather today in Boston?"})
agent_executor.invoke({"input": "tell me a short joke?"})
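
For comparison, the same mock weather tool can be declared more compactly with LangChain's @tool decorator, which derives the tool name, description, and argument schema from the function signature and docstring. This is a minimal sketch, not part of the commit; it assumes langchain-core is installed and would stand in for the BaseTool subclass above.

from langchain_core.tools import tool


@tool
def get_current_weather(location: str, unit: str) -> dict:
    """Find the weather for a given location in the given unit."""
    # Mock response, mirroring the example above.
    return {
        "location": location,
        "temperature": "78",
        "unit": unit,
        "forecast": ["sunny", "with a chance of rain"],
    }


tools = [get_current_weather]

Either form yields an OpenAI-style function schema for create_openai_tools_agent; the subclass form used in the commit is more verbose but gives explicit control over args_schema.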

examples/inference/api_server_langchain/query_langchain_sdk.py

Lines changed: 3 additions & 0 deletions
@@ -29,6 +29,8 @@
     action="store_true",
     help="Whether to enable streaming response",
 )
+parser.add_argument("--max_tokens", default=256, type=int, help="The maximum number of tokens to generate")
+
 
 args = parser.parse_args()
 
@@ -52,6 +54,7 @@
     model_name=args.model_name,
     openai_api_key=openai_api_key,
     streaming=args.streaming_response,
+    max_tokens=args.max_tokens,
 )
 
 prompt = PromptTemplate(template="list 3 {things}", input_variables=["things"])
Lines changed: 108 additions & 0 deletions

#
# Copyright 2023 The LLM-on-Ray Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import os

from openai import OpenAI

parser = argparse.ArgumentParser(
    description="Example script to query with the OpenAI SDK", add_help=True
)
parser.add_argument(
    "--model_name",
    default="mistral-7b-instruct-v0.2",
    type=str,
    help="The name of the model to request",
)
parser.add_argument(
    "--streaming_response",
    default=False,
    action="store_true",
    help="Whether to enable streaming response",
)
parser.add_argument(
    "--max_new_tokens", default=512, type=int, help="The maximum number of tokens to generate"
)
args = parser.parse_args()

if "OPENAI_API_KEY" in os.environ:
    openai_api_key = os.environ["OPENAI_API_KEY"]
else:
    openai_api_key = "not_needed"

if "OPENAI_BASE_URL" in os.environ:
    openai_base_url = os.environ["OPENAI_BASE_URL"]
elif openai_api_key == "not_needed":
    openai_base_url = "http://localhost:8000/v1"
else:
    openai_base_url = "https://api.openai.com/v1"


client = OpenAI(base_url=openai_base_url, api_key=openai_api_key)

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location"],
            },
        },
    }
]
messages = [
    [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "What's the weather like in Boston today?"},
    ],
    [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Tell me a short joke?"},
    ],
]
for message in messages:
    print(f"User: {message[1]['content']}")
    print("Assistant:", end=" ", flush=True)
    chat_completion = client.chat.completions.create(
        model=args.model_name,
        messages=message,
        max_tokens=args.max_new_tokens,
        tools=tools,
        tool_choice="auto",
        stream=args.streaming_response,
    )

    if args.streaming_response:
        # Print plain content deltas or tool-call deltas as they arrive.
        for chunk in chat_completion:
            content = chunk.choices[0].delta.content
            if content is not None:
                print(content, end="", flush=True)
            tool_calls = chunk.choices[0].delta.tool_calls
            if tool_calls is not None:
                print(tool_calls, end="", flush=True)
        print("")
    else:
        print(repr(chat_completion.choices[0].message.model_dump()))
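
The script above prints the model's tool call but never executes it. The sketch below shows one way to complete the round trip with the same client: run the requested function locally, append the result as a tool message, and ask the model for a final answer. It is illustrative only; run_tool_round_trip and the inline mock are hypothetical names, and it assumes the server returns OpenAI-style tool_calls in non-streaming responses.

import json


def get_current_weather(location, unit="fahrenheit"):
    # Same mock response as the agent example in this commit.
    return {"location": location, "temperature": "78", "unit": unit}


def run_tool_round_trip(client, model_name, message, tools):
    # First request: the model decides whether to call the tool.
    first = client.chat.completions.create(
        model=model_name, messages=message, tools=tools, tool_choice="auto"
    )
    reply = first.choices[0].message
    if not reply.tool_calls:
        return reply.content
    # Execute each requested call locally and append the results as tool messages.
    followup = message + [reply.model_dump(exclude_none=True)]
    for call in reply.tool_calls:
        fn_args = json.loads(call.function.arguments)
        result = get_current_weather(fn_args["location"], fn_args.get("unit", "fahrenheit"))
        followup.append({"role": "tool", "tool_call_id": call.id, "content": json.dumps(result)})
    # Second request: the model turns the tool output into a user-facing answer.
    second = client.chat.completions.create(model=model_name, messages=followup)
    return second.choices[0].message.content

For example, run_tool_round_trip(client, args.model_name, messages[0], tools) would answer the Boston weather question end to end.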
