
Commit 26e2d45

init directml chat server
0 parents · commit 26e2d45

14 files changed: +2164 −0 lines changed

.env.example

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
PORT=6979
HOST="0.0.0.0"
UVICORN_LOG_LEVEL="debug"
MODEL_PATH=''
SERVED_MODEL_NAME='phi3-mini-int4'
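The settings class that consumes these variables is not shown in this excerpt. As a minimal sketch, assuming the server loads them with pydantic-settings (a declared dependency), it could look like the following; the class name and defaults here are assumptions, not the project's actual code.

from pydantic_settings import BaseSettings, SettingsConfigDict


class ServerSettings(BaseSettings):
    """Hypothetical settings class; the real one is not shown in this excerpt."""

    # Values are read from the environment or a .env file (see .env.example above).
    model_config = SettingsConfigDict(env_file=".env")

    port: int = 6979
    host: str = "0.0.0.0"
    uvicorn_log_level: str = "debug"
    model_path: str = ""
    served_model_name: str = "phi3-mini-int4"


settings = ServerSettings()
print(settings.served_model_name)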

.gitignore

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
# Environment
.env
phi3*.py
test_phi3*

# Python
**/__pycache__
**.egg-info

scripts/*.ps1

pyproject.toml

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
# See https://gitlab.liris.cnrs.fr/pagoda/tools/mkdocs_template/-/blob/master/user_config/pyproject.toml

# -----------------------------------------------------------------------------
# Pytest configuration
# https://docs.pytest.org/en/latest/customize.html?highlight=pyproject#pyproject-toml

[tool.pytest.ini_options]
log_cli = true
asyncio_mode = "auto"
# log_cli_level = "DEBUG"
addopts = "--cov=embeddedllm --doctest-modules"
testpaths = ["tests"]
filterwarnings = [
    "ignore::DeprecationWarning:tensorflow.*",
    "ignore::DeprecationWarning:tensorboard.*",
    "ignore::DeprecationWarning:matplotlib.*",
    "ignore::DeprecationWarning:flatbuffers.*",
]


# -----------------------------------------------------------------------------
# Black (option-less formatter) configuration
# https://black.readthedocs.io/en/stable/index.html

[tool.black]
line-length = 99
target-version = ["py310"]
include = '\.pyi?$|\.ipynb'

# -----------------------------------------------------------------------------
# For sorting imports
# This is used by VS Code to sort imports
# https://code.visualstudio.com/docs/python/editing#_sort-imports
# https://timothycrosley.github.io/isort/

[tool.isort]
# Profile
# Base profile type to use for configuration. Profiles include: black, django,
# pycharm, google, open_stack, plone, attrs, hug, as well as any shared profiles.
# Default: ``
profile = "black"
# Treat project as a git repository and ignore files listed in .gitignore
# Default: `False`
skip_gitignore = true
# The max length of an import line (used for wrapping long imports).
# Default: `79`
line_length = 99
known_first_party = []

# -----------------------------------------------------------------------------
# setuptools
# https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html

[build-system]
# setuptools-scm considers all files tracked by git to be data files
requires = ["setuptools>=62.0", "setuptools-scm"]
build-backend = "setuptools.build_meta"

[project]
name = "embeddedllm"
description = "EmbeddedLLM: API server for embedded-device deployment. Currently supports ONNX-DirectML."
readme = "README.md"
requires-python = "~=3.10"
# keywords = ["one", "two"]
license = { text = "Proprietary" }
classifiers = [ # https://pypi.org/classifiers/
    "Development Status :: 3 - Alpha",
    "Programming Language :: Python :: 3 :: Only",
    "Intended Audience :: Information Technology",
    "Operating System :: Unix",
]
dependencies = [
    "fastapi~=0.110.0",
    "gunicorn~=21.2.0",
    "huggingface-hub[cli]",
    "loguru~=0.7.2",
    "numpy~=1.26.4",
    "onnxruntime-directml",
    "onnxruntime-genai-directml",
    "openai",
    "pydantic~=2.6.3",
    "pydantic-settings>=2.2.1",
    "torch",
    "transformers",
    "uvicorn",
] # Sort your dependencies https://sortmylist.com/
dynamic = ["version"]

[project.optional-dependencies]
lint = ["black~=24.4.2", "flake8~=7.0.0"]
test = [
    "flaky~=3.7.0",
    "locust~=2.24.1",
    "mypy~=1.5.1",
    "pytest-cov~=4.1.0",
    "pytest~=7.4.2",
]
docs = [
    "furo~=2023.9.10", # Sphinx theme (nice looking, with dark mode)
    "myst-parser~=2.0.0",
    "sphinx-autobuild~=2021.3.14",
    "sphinx-copybutton~=0.5.2",
    "sphinx~=7.2.6",
    "sphinx_rtd_theme~=1.3.0", # Sphinx theme
]
build = [
    "build",
    "twine",
] # https://realpython.com/pypi-publish-python-package/#build-your-package
all = [
    "embeddedllm[lint,test,docs,build]", # https://hynek.me/articles/python-recursive-optional-dependencies/
]

# [project.scripts]
# embeddedllm = "embeddedllm.scripts.example:main_cli"

[tool.setuptools.dynamic]
version = { attr = "embeddedllm.version.__version__" }

[tool.setuptools.packages.find]
where = ["src"]

[tool.setuptools.package-data]
owl = ["**/*.json"]
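The dynamic version above points at embeddedllm.version.__version__. That module is presumably added elsewhere in this commit and is not shown in this excerpt; a minimal sketch of what setuptools expects there (the path and version string are assumed placeholders) would be:

# src/embeddedllm/version.py  (assumed path; the real module is not shown in this excerpt)
# setuptools reads this attribute via [tool.setuptools.dynamic] above.
__version__ = "0.1.0"  # placeholder value, not the project's actual version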

scripts/python/get_model.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
import httpx
import asyncio

async def fetch_models():
    url = "http://localhost:6979/v1/models"
    async with httpx.AsyncClient() as client:
        response = await client.get(url)
        return response.json()

async def main():
    models = await fetch_models()
    print(models)

if __name__ == "__main__":
    asyncio.run(main())
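The server's /v1/models implementation is not shown in this commit; assuming it follows the OpenAI models-list schema (as the client scripts below suggest), fetch_models() would return a payload shaped roughly like this:

# Assumed OpenAI-style response shape; the exact fields this server returns are not shown here.
expected = {
    "object": "list",
    "data": [
        {
            "id": "phi3-mini-int4",    # SERVED_MODEL_NAME from .env.example
            "object": "model",
            "created": 1714000000,     # illustrative timestamp
            "owned_by": "embeddedllm", # assumed owner string
        }
    ],
}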

scripts/python/httpx_client.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
import httpx
import asyncio

async def stream_chat_completion(url: str, payload: dict):
    async with httpx.AsyncClient() as client:
        async with client.stream("POST", url, json=payload) as response:
            if response.status_code == 200:
                async for data in response.aiter_bytes():
                    if data:
                        print(data.decode('utf-8'))
            else:
                print(f"Error: {response.status_code}")
                # A streamed body must be read explicitly before it can be inspected;
                # response.text is a property and is not awaitable.
                print((await response.aread()).decode('utf-8'))

# Example usage
if __name__ == "__main__":
    url = "http://localhost:6979/v1/chat/completions"
    payload = {
        "messages": [{"role": "user", "content": "Hello!"}],
        "model": "phi3-mini-int4",
        "max_tokens": 80,
        "temperature": 0.0,
        "stream": True
    }
    asyncio.run(stream_chat_completion(url, payload))
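The script above prints the raw response bytes. If the endpoint emits OpenAI-style server-sent events (the openai_client.py script below suggests it does, but the server side is not shown in this excerpt), the chunks can be decoded with a hypothetical variant like this:

import asyncio
import json

import httpx

async def stream_chat_text(url: str, payload: dict):
    # Hypothetical helper: assumes OpenAI-style SSE lines of the form "data: {...}",
    # terminated by "data: [DONE]". The actual wire format is not shown in this commit.
    async with httpx.AsyncClient() as client:
        async with client.stream("POST", url, json=payload) as response:
            async for line in response.aiter_lines():
                if not line.startswith("data: "):
                    continue
                chunk = line[len("data: "):]
                if chunk.strip() == "[DONE]":
                    break
                delta = json.loads(chunk)["choices"][0]["delta"]
                print(delta.get("content", ""), end="", flush=True)

# Example: asyncio.run(stream_chat_text(url, payload)) with the same url/payload as above.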

scripts/python/litellm_client.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
import litellm

messages = [{"role": "user", "content": "Hey, how's it going?"}]

response = litellm.completion(
    model="phi3-mini-int4",  # the served model name (SERVED_MODEL_NAME in .env.example)
    messages=messages,
    api_base="http://localhost:6979/v1",
    api_key="EMPTY",
    temperature=0,
    max_tokens=80,
    stream=True,
    custom_llm_provider="openai",  # the server exposes an OpenAI-compatible API
)

for part in response:
    print(part.choices[0].delta.content or "")
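For completeness, a non-streaming call against the same endpoint (same assumptions as the script above) returns a single response object whose text lives under choices[0].message.content:

import litellm

# Hypothetical non-streaming variant of the script above; endpoint and model name unchanged.
reply = litellm.completion(
    model="phi3-mini-int4",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    api_base="http://localhost:6979/v1",
    api_key="EMPTY",
    temperature=0,
    max_tokens=80,
    custom_llm_provider="openai",
)
print(reply.choices[0].message.content)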

scripts/python/openai_client.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
# from openai import AsyncOpenAI
# import asyncio
# import time

# client = AsyncOpenAI(
#     base_url="http://localhost:6979/v1",
#     api_key='ellm'
# )


# async def main():
#     stream = await client.chat.completions.create(
#         model="phi3-mini-int4",
#         messages=[{"role": "user", "content": "Say this is a test"}],
#         max_tokens=80,
#         temperature=0,
#         stream=True,
#     )
#     print(stream)
#     async for chunk in stream:
#         print(chunk.choices[0].delta.content or "", end="", flush=True)


# asyncio.run(main())

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:6979/v1",
    api_key='ellm')

stream = client.chat.completions.create(
    model="phi3-mini-int4",
    messages=[{"role": "user", "content": "Say this is a test"}],
    max_tokens=80,
    temperature=0,
    stream=True,
)
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="")
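To keep the full reply instead of printing deltas as they arrive, the same stream can be joined into one string (same server and model-name assumptions as above):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:6979/v1", api_key="ellm")

stream = client.chat.completions.create(
    model="phi3-mini-int4",
    messages=[{"role": "user", "content": "Say this is a test"}],
    max_tokens=80,
    temperature=0,
    stream=True,
)

# Join the per-chunk deltas into the complete assistant message.
reply = "".join(chunk.choices[0].delta.content or "" for chunk in stream)
print(reply)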
