myshell-ai · jordan-barrett-jm · Mar 8, 2024 · Mar 8, 2024 · Mar 8, 2024 · Mar 13, 2024
diff --git a/Dockerfile b/Dockerfile
@@ -1,13 +1,18 @@
 FROM python:3.9-slim
-WORKDIR /app
-COPY . /app
-
 RUN apt-get update && apt-get install -y \
     build-essential libsndfile1 \
     && rm -rf /var/lib/apt/lists/*
 
+WORKDIR /app
+COPY . /app
+
 RUN pip install -e .
 RUN python -m unidic download
 RUN python melo/init_downloads.py
 
-CMD ["python", "./melo/app.py", "--host", "0.0.0.0", "--port", "8888"]
+# Copy the entrypoint script to /usr/local/bin and make it executable
+COPY entrypoint.sh /usr/local/bin/entrypoint.sh
+RUN chmod +x /usr/local/bin/entrypoint.sh
+
+# Start via entrypoint.sh, which launches the FastAPI server when APP_MODE=api and the Gradio app otherwise
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
diff --git a/docs/install.md b/docs/install.md
@@ -31,13 +31,19 @@ docker build -t melotts .
 ```
 
 **Run Docker**
+Run as a default Gradio app:
 ```bash
 docker run -it -p 8888:8888 melotts
 ```
 If your local machine has GPU, then you can choose to run:
 ```bash
 docker run --gpus all -it -p 8888:8888 melotts
 ```
+
+Run as a FastAPI streaming server (serves `POST /stream` instead of the Gradio UI):
+```bash
+docker run --gpus all -it -p 8888:8888 -e APP_MODE=api melotts
+```
 Then open [http://localhost:8888](http://localhost:8888) in your browser to use the app.
 
 ## Usage
@@ -51,6 +57,44 @@ melo-ui
 # Or: python melo/app.py
 ```
 
+### Streaming API
+One use case for the streaming API is an AI assistant. With the server started in API mode (`APP_MODE=api`), the following code shows how to read the audio stream and play it with `ffplay`:
+```python
+import requests
+import subprocess
+
+def stream_ffplay(audio_stream):
+  ffplay_cmd = ["ffplay", "-nodisp", "-probesize", "2048", "-autoexit", "-"]
+  ffplay_proc = subprocess.Popen(ffplay_cmd, stdin=subprocess.PIPE)
+
+  for chunk in audio_stream:
+      if chunk is not None:
+          ffplay_proc.stdin.write(chunk)
+
+  # close on finish
+  ffplay_proc.stdin.close()
+  ffplay_proc.wait()
+
+def tts(text, speaker='EN-US', language='EN', speed=1):
+  res = requests.post(
+          "http://localhost:8888/stream",
+          json={
+            "text": text,
+            "language": language,
+            "speed": speed,
+            "speaker": speaker
+          },
+          stream=True,
+      )
+  for chunk in res.iter_content(chunk_size=512):
+        if chunk:
+            yield chunk
+
+stream_ffplay(
+  tts("Ahoy there matey! How goes it?")
+)
+```
+
 ### CLI
 
 You may use the MeloTTS CLI to interact with MeloTTS. The CLI may be invoked using either `melotts` or `melo`. Here are some examples:

diff --git a/entrypoint.sh b/entrypoint.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Container entrypoint: picks which server to start from APP_MODE.
+# Default to the Gradio app if no APP_MODE is specified; APP_MODE=api selects the FastAPI server
+APP_MODE=${APP_MODE:-gradio}
+
+if [ "$APP_MODE" = "api" ]; then
+    exec uvicorn melo.fastapi_server:app --host "0.0.0.0" --port "8888"
+else
+    exec python ./melo/app.py --host "0.0.0.0" --port "8888"
+fi
diff --git a/melo/fastapi_server.py b/melo/fastapi_server.py
@@ -0,0 +1,39 @@
+from fastapi import FastAPI, File, UploadFile
+from pydantic import BaseModel
+import io
+from melo.api import TTS
+from fastapi.responses import StreamingResponse
+
+app = FastAPI()
+
+# Initialize the TTS models as before
+device = 'auto'
+models = {
+    'EN': TTS(language='EN', device=device),
+    'ES': TTS(language='ES', device=device),
+    'FR': TTS(language='FR', device=device),
+    'ZH': TTS(language='ZH', device=device),
+    'JP': TTS(language='JP', device=device),
+    'KR': TTS(language='KR', device=device),
+}
+
+class SynthesizePayload(BaseModel):
+    text: str = 'Ahoy there matey! There she blows!'
+    language: str = 'EN'
+    speaker: str = 'EN-US'
+    speed: float = 1.0
+
+@app.post("/stream")
+async def synthesize_stream(payload: SynthesizePayload):
+    language = payload.language
+    text = payload.text
+    speaker = payload.speaker or list(models[language].hps.data.spk2id.keys())[0]
+    speed = payload.speed
+
+    def audio_stream():
+        bio = io.BytesIO()
+        models[language].tts_to_file(text, models[language].hps.data.spk2id[speaker], bio, speed=speed, format='wav')
+        audio_data = bio.getvalue()
+        yield audio_data
+
+    return StreamingResponse(audio_stream(), media_type="audio/wav")
diff --git a/requirements.txt b/requirements.txt
@@ -28,3 +28,6 @@ langid==1.1.6
 tqdm
 tensorboard==2.16.2
 loguru==0.7.2
+fastapi
+uvicorn
+pydantic