
Commit 26e2d45

init directml chat server
0 parents · commit 26e2d45

14 files changed: +2164 −0 lines changed

.env.example

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
PORT=6979
HOST="0.0.0.0"
UVICORN_LOG_LEVEL="debug"
MODEL_PATH=''
SERVED_MODEL_NAME='phi3-mini-int4'
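The settings class that consumes these variables is not shown in this excerpt. As a minimal sketch, assuming the server loads them with pydantic-settings (a declared dependency), it could look like the following; the class name and defaults here are assumptions, not the project's actual code.

from pydantic_settings import BaseSettings, SettingsConfigDict


class ServerSettings(BaseSettings):
    """Hypothetical settings class; the real one is not shown in this excerpt."""

    # Values are read from the environment or a .env file (see .env.example above).
    model_config = SettingsConfigDict(env_file=".env")

    port: int = 6979
    host: str = "0.0.0.0"
    uvicorn_log_level: str = "debug"
    model_path: str = ""
    served_model_name: str = "phi3-mini-int4"


settings = ServerSettings()
print(settings.served_model_name)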

.gitignore

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
# Environment
.env
phi3*.py
test_phi3*

# Python
**/__pycache__
**.egg-info

scripts/*.ps1

pyproject.toml

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
# See https://gitlab.liris.cnrs.fr/pagoda/tools/mkdocs_template/-/blob/master/user_config/pyproject.toml

# -----------------------------------------------------------------------------
# Pytest configuration
# https://docs.pytest.org/en/latest/customize.html?highlight=pyproject#pyproject-toml

[tool.pytest.ini_options]
log_cli = true
asyncio_mode = "auto"
# log_cli_level = "DEBUG"
addopts = "--cov=embeddedllm --doctest-modules"
testpaths = ["tests"]
filterwarnings = [
    "ignore::DeprecationWarning:tensorflow.*",
    "ignore::DeprecationWarning:tensorboard.*",
    "ignore::DeprecationWarning:matplotlib.*",
    "ignore::DeprecationWarning:flatbuffers.*",
]


# -----------------------------------------------------------------------------
# Black (option-less formatter) configuration
# https://black.readthedocs.io/en/stable/index.html

[tool.black]
line-length = 99
target-version = ["py310"]
include = '\.pyi?$|\.ipynb'

# -----------------------------------------------------------------------------
# For sorting imports
# This is used by VS Code to sort imports
# https://code.visualstudio.com/docs/python/editing#_sort-imports
# https://timothycrosley.github.io/isort/

[tool.isort]
# Profile
# Base profile type to use for configuration. Profiles include: black, django,
# pycharm, google, open_stack, plone, attrs, hug, as well as any shared profiles.
# Default: ``
profile = "black"
# Treat project as a git repository and ignore files listed in .gitignore
# Default: `False`
skip_gitignore = true
# The max length of an import line (used for wrapping long imports).
# Default: `79`
line_length = 99
known_first_party = []

# -----------------------------------------------------------------------------
# setuptools
# https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html

[build-system]
# setuptools-scm considers all files tracked by git to be data files
requires = ["setuptools>=62.0", "setuptools-scm"]
build-backend = "setuptools.build_meta"

[project]
name = "embeddedllm"
description = "EmbeddedLLM: API server for embedded-device deployment. Currently supports ONNX-DirectML."
readme = "README.md"
requires-python = "~=3.10"
# keywords = ["one", "two"]
license = { text = "Proprietary" }
classifiers = [ # https://pypi.org/classifiers/
    "Development Status :: 3 - Alpha",
    "Programming Language :: Python :: 3 :: Only",
    "Intended Audience :: Information Technology",
    "Operating System :: Unix",
]
dependencies = [
    "fastapi~=0.110.0",
    "gunicorn~=21.2.0",
    "huggingface-hub[cli]",
    "loguru~=0.7.2",
    "numpy~=1.26.4",
    "onnxruntime-directml",
    "onnxruntime-genai-directml",
    "openai",
    "pydantic~=2.6.3",
    "pydantic-settings>=2.2.1",
    "torch",
    "transformers",
    "uvicorn",
] # Sort your dependencies https://sortmylist.com/
dynamic = ["version"]

[project.optional-dependencies]
lint = ["black~=24.4.2", "flake8~=7.0.0"]
test = [
    "flaky~=3.7.0",
    "locust~=2.24.1",
    "mypy~=1.5.1",
    "pytest-cov~=4.1.0",
    "pytest~=7.4.2",
]
docs = [
    "furo~=2023.9.10", # Sphinx theme (nice looking, with dark mode)
    "myst-parser~=2.0.0",
    "sphinx-autobuild~=2021.3.14",
    "sphinx-copybutton~=0.5.2",
    "sphinx~=7.2.6",
    "sphinx_rtd_theme~=1.3.0", # Sphinx theme
]
build = [
    "build",
    "twine",
] # https://realpython.com/pypi-publish-python-package/#build-your-package
all = [
    "embeddedllm[lint,test,docs,build]", # https://hynek.me/articles/python-recursive-optional-dependencies/
]

# [project.scripts]
# embeddedllm = "embeddedllm.scripts.example:main_cli"

[tool.setuptools.dynamic]
version = { attr = "embeddedllm.version.__version__" }

[tool.setuptools.packages.find]
where = ["src"]

[tool.setuptools.package-data]
owl = ["**/*.json"]
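The dynamic version above points at embeddedllm.version.__version__. That module is presumably added elsewhere in this commit and is not shown in this excerpt; a minimal sketch of what setuptools expects there (the path and version string are assumed placeholders) would be:

# src/embeddedllm/version.py  (assumed path; the real module is not shown in this excerpt)
# setuptools reads this attribute via [tool.setuptools.dynamic] above.
__version__ = "0.1.0"  # placeholder value, not the project's actual version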

scripts/python/get_model.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
import httpx
import asyncio

async def fetch_models():
    url = "http://localhost:6979/v1/models"
    async with httpx.AsyncClient() as client:
        response = await client.get(url)
        return response.json()

async def main():
    models = await fetch_models()
    print(models)

if __name__ == "__main__":
    asyncio.run(main())
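The server's /v1/models implementation is not shown in this commit; assuming it follows the OpenAI models-list schema (as the client scripts below suggest), fetch_models() would return a payload shaped roughly like this:

# Assumed OpenAI-style response shape; the exact fields this server returns are not shown here.
expected = {
    "object": "list",
    "data": [
        {
            "id": "phi3-mini-int4",    # SERVED_MODEL_NAME from .env.example
            "object": "model",
            "created": 1714000000,     # illustrative timestamp
            "owned_by": "embeddedllm", # assumed owner string
        }
    ],
}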

scripts/python/httpx_client.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
import httpx
import asyncio

async def stream_chat_completion(url: str, payload: dict):
    async with httpx.AsyncClient() as client:
        async with client.stream("POST", url, json=payload) as response:
            if response.status_code == 200:
                async for data in response.aiter_bytes():
                    if data:
                        print(data.decode('utf-8'))
            else:
                print(f"Error: {response.status_code}")
                # A streamed body must be read explicitly before it can be inspected;
                # response.text is a property and is not awaitable.
                print((await response.aread()).decode('utf-8'))

# Example usage
if __name__ == "__main__":
    url = "http://localhost:6979/v1/chat/completions"
    payload = {
        "messages": [{"role": "user", "content": "Hello!"}],
        "model": "phi3-mini-int4",
        "max_tokens": 80,
        "temperature": 0.0,
        "stream": True
    }
    asyncio.run(stream_chat_completion(url, payload))
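The script above prints the raw response bytes. If the endpoint emits OpenAI-style server-sent events (the openai_client.py script below suggests it does, but the server side is not shown in this excerpt), the chunks can be decoded with a hypothetical variant like this:

import asyncio
import json

import httpx

async def stream_chat_text(url: str, payload: dict):
    # Hypothetical helper: assumes OpenAI-style SSE lines of the form "data: {...}",
    # terminated by "data: [DONE]". The actual wire format is not shown in this commit.
    async with httpx.AsyncClient() as client:
        async with client.stream("POST", url, json=payload) as response:
            async for line in response.aiter_lines():
                if not line.startswith("data: "):
                    continue
                chunk = line[len("data: "):]
                if chunk.strip() == "[DONE]":
                    break
                delta = json.loads(chunk)["choices"][0]["delta"]
                print(delta.get("content", ""), end="", flush=True)

# Example: asyncio.run(stream_chat_text(url, payload)) with the same url/payload as above.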

scripts/python/litellm_client.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
import litellm

messages = [{"role": "user", "content": "Hey, how's it going?"}]

response = litellm.completion(
    model="phi3-mini-int4",  # the served model name (SERVED_MODEL_NAME in .env.example)
    messages=messages,
    api_base="http://localhost:6979/v1",
    api_key="EMPTY",
    temperature=0,
    max_tokens=80,
    stream=True,
    custom_llm_provider="openai",  # the server exposes an OpenAI-compatible API
)

for part in response:
    print(part.choices[0].delta.content or "")
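For completeness, a non-streaming call against the same endpoint (same assumptions as the script above) returns a single response object whose text lives under choices[0].message.content:

import litellm

# Hypothetical non-streaming variant of the script above; endpoint and model name unchanged.
reply = litellm.completion(
    model="phi3-mini-int4",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    api_base="http://localhost:6979/v1",
    api_key="EMPTY",
    temperature=0,
    max_tokens=80,
    custom_llm_provider="openai",
)
print(reply.choices[0].message.content)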

scripts/python/openai_client.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
# from openai import AsyncOpenAI
# import asyncio
# import time

# client = AsyncOpenAI(
#     base_url="http://localhost:6979/v1",
#     api_key='ellm'
# )


# async def main():
#     stream = await client.chat.completions.create(
#         model="phi3-mini-int4",
#         messages=[{"role": "user", "content": "Say this is a test"}],
#         max_tokens=80,
#         temperature=0,
#         stream=True,
#     )
#     print(stream)
#     async for chunk in stream:
#         print(chunk.choices[0].delta.content or "", end="", flush=True)


# asyncio.run(main())

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:6979/v1",
    api_key='ellm')

stream = client.chat.completions.create(
    model="phi3-mini-int4",
    messages=[{"role": "user", "content": "Say this is a test"}],
    max_tokens=80,
    temperature=0,
    stream=True,
)
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="")
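To keep the full reply instead of printing deltas as they arrive, the same stream can be joined into one string (same server and model-name assumptions as above):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:6979/v1", api_key="ellm")

stream = client.chat.completions.create(
    model="phi3-mini-int4",
    messages=[{"role": "user", "content": "Say this is a test"}],
    max_tokens=80,
    temperature=0,
    stream=True,
)

# Join the per-chunk deltas into the complete assistant message.
reply = "".join(chunk.choices[0].delta.content or "" for chunk in stream)
print(reply)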
