From 8c9408f825ccce33695e21441b21488903b0bdee Mon Sep 17 00:00:00 2001
From: ZKA SUPER <zeroknowledge0x@users.noreply.github.com>
Date: Thu, 18 Jun 2026 17:55:20 +0000
Subject: [PATCH] feat: add Google Gemini AI transcription guide for Sapat

- Add guide for running Sapat with Google Gemini in a Daytona workspace
- Add multimodal transcription definition
- Add author profile for zeroknowledge0x
- Add SVG workflow diagram

Signed-off-by: zeroknowledge0x <zeroknowledge0x@users.noreply.github.com>
---
 authors/zeroknowledge0x.md                    |   1 +
 ...618_definition_multimodal_transcription.md |  41 ++
 ...ide_ai_transcription_with_google_gemini.md | 380 ++++++++++++++++++
 ...nscription_with_google_gemini_workflow.svg |  70 ++++
 4 files changed, 492 insertions(+)
 create mode 100644 authors/zeroknowledge0x.md
 create mode 100644 definitions/20260618_definition_multimodal_transcription.md
 create mode 100644 guides/20260618_guide_ai_transcription_with_google_gemini.md
 create mode 100644 guides/assets/20260618_guide_ai_transcription_with_google_gemini_workflow.svg

diff --git a/authors/zeroknowledge0x.md b/authors/zeroknowledge0x.md
new file mode 100644
index 00000000..796d9c74
--- /dev/null
+++ b/authors/zeroknowledge0x.md
@@ -0,0 +1 @@
+Author: zeroknowledge0x Title: AI Engineer Description: Open-source contributor focused on AI tooling, developer experience, and reproducible workflows. Interested in speech-to-text pipelines, LLM orchestration, and containerized development environments. Author Image: Author LinkedIn: Author Twitter: Company Name: Independent Company Description: Independent open-source contributor. Company Logo Dark: Company Logo White:
diff --git a/definitions/20260618_definition_multimodal_transcription.md b/definitions/20260618_definition_multimodal_transcription.md
new file mode 100644
index 00000000..2fa6d5f4
--- /dev/null
+++ b/definitions/20260618_definition_multimodal_transcription.md
@@ -0,0 +1,41 @@
+---
+title: 'Multimodal Transcription'
+description: 'Using multimodal AI models to transcribe audio by combining speech recognition with natural language understanding.'
+date: 2026-06-18
+author: 'zeroknowledge0x'
+---
+
+# Multimodal Transcription
+
+## Definition
+
+Multimodal transcription is the process of converting audio to text using
+general-purpose multimodal AI models rather than dedicated speech-to-text
+systems. Instead of relying solely on acoustic models trained on speech data,
+multimodal transcription sends audio alongside text instructions to a large
+language model capable of processing both modalities. This approach enables
+combined workflows where transcription, summarization, translation, or
+structured extraction happen in a single API call.
+
+## Context and Usage
+
+Traditional speech-to-text services like Whisper or Deepgram use encoder-decoder
+architectures specifically trained on audio data. They excel at accurate
+word-level transcription but operate as single-purpose tools: audio in, text
+out.
+
+Multimodal models like Google Gemini, GPT-4o, and Claude accept audio as part
+of a broader conversation context. An AI engineer can attach an audio file and
+a prompt such as "Transcribe this meeting and extract action items," receiving
+both the transcript and structured output in one response.
+
+This paradigm is gaining adoption in AI engineering pipelines because it
+reduces the number of API calls, simplifies orchestration, and allows
+contextual instructions to improve transcription quality for domain-specific
+content. Tools like [Sapat](https://github.com/nibzard/sapat) support
+multimodal providers (e.g., `--provider gemini`) alongside traditional ones,
+letting teams choose the right approach for each use case.
+
+Trade-offs include higher per-token costs, larger payloads (inline audio is
+base64-encoded, adding about a third to its size), and less fine-grained control
+over acoustic model parameters compared to dedicated speech-to-text APIs.
diff --git a/guides/20260618_guide_ai_transcription_with_google_gemini.md b/guides/20260618_guide_ai_transcription_with_google_gemini.md
new file mode 100644
index 00000000..e1f75117
--- /dev/null
+++ b/guides/20260618_guide_ai_transcription_with_google_gemini.md
@@ -0,0 +1,380 @@
+---
+title: "AI Transcription with Google Gemini in Daytona"
+description: "Build a reproducible AI transcription pipeline using Sapat and Google Gemini inside a Daytona workspace."
+date: 2026-06-18
+author: "zeroknowledge0x"
+tags: ["transcription", "gemini", "ai", "daytona", "sapat", "speech-to-text"]
+---
+
+# AI Transcription with Google Gemini in Daytona
+
+# Introduction
+
+Audio transcription has become a critical workflow for AI engineers building
+meeting summarizers, podcast indexers, accessibility tools, and knowledge-base
+pipelines. Google Gemini brings a multimodal approach to transcription: instead
+of a dedicated speech-to-text model, Gemini's `generateContent` endpoint
+accepts inline base64 audio alongside a text prompt, letting you combine
+transcription with summarization, translation, or structured extraction in a
+single API call.
+
+[Sapat](https://github.com/nibzard/sapat) is an open-source transcription CLI
+that converts media files with ffmpeg and routes the resulting audio to a
+pluggable provider registry. It already ships a Gemini provider that wraps the
+`generateContent` REST flow, so you can run `sapat --provider gemini` without
+writing custom HTTP code.
+
+This guide walks you through setting up a fully reproducible [Daytona](https://www.daytona.io/)
+workspace that installs Sapat, configures a Google API key as an environment
+variable, transcribes audio files with Gemini, and validates the output. By the
+end you will have a containerized environment that any teammate can clone and
+run in under five minutes.
+
+## TL;DR
+
+- **Install Daytona and create a workspace** from the Sapat repository.
+- **Set `GOOGLE_API_KEY`** as an environment variable inside the workspace.
+- **Run `sapat <file> --provider gemini`** to transcribe audio with Google Gemini.
+- **Verify** the `.txt` transcript output alongside the original media file.
+
+## Prerequisites
+
+To follow this guide you will need:
+
+- A Google Cloud account with the [Generative Language API](https://ai.google.dev/docs) enabled.
+- A `GOOGLE_API_KEY` (create one in the [Google AI Studio](https://aistudio.google.com/apikey)).
+- [Docker](https://www.docker.com/) installed on your machine.
+- [Daytona](https://www.daytona.io/docs/installation/installation/) v0.30+ installed.
+- Basic familiarity with [Python](../definitions/20240820_defintion_python.md) and the command line.
+
+## Step 1: Create a Daytona Workspace
+
+Daytona provisions [Development Environments](../definitions/20240819_definition_development%20environment.md)
+from Git repositories. Start the Daytona server and create a workspace from the
+Sapat repository.
+
+### Step 1.1: Start the Daytona Server
+
+```bash
+daytona server
+```
+
+The server runs in the foreground. Open a second terminal for the remaining
+steps.
+
+### Step 1.2: Add a Git Provider
+
+If you have not already added GitHub as a Git provider, run:
+
+```bash
+daytona git-provider add
+```
+
+Select **GitHub** and paste a personal access token with `repo` scope.
+
+### Step 1.3: Create the Workspace
+
+```bash
+daytona create https://github.com/nibzard/sapat --code
+```
+
+Daytona will pull the repository, build the container image defined in
+`.devcontainer/`, and open the workspace in your preferred IDE. If you have not
+set an IDE preference yet, run `daytona ide` first.
+
+## Step 2: Configure the Google API Key
+
+Sapat's Gemini provider reads `GOOGLE_API_KEY` (or the alias `GEMINI_API_KEY`)
+from the environment. You must set this variable inside the workspace without
+committing it to version control.
+
+### Step 2.1: Export the Key in the Workspace Shell
+
+Open the workspace terminal and run:
+
+```bash
+export GOOGLE_API_KEY="your-api-key-here"
+```
+
+For persistence across sessions, add the export to `~/.bashrc` or
+`~/.profile` inside the container:
+
+```bash
+echo 'export GOOGLE_API_KEY="your-api-key-here"' >> ~/.bashrc
+source ~/.bashrc
+```
+
+### Step 2.2: Verify the Variable Is Set
+
+```bash
+echo "${GOOGLE_API_KEY:0:6}..."
+```
+
+You should see the first six characters of your key followed by `...`. Never
+print the full key in logs or shared terminals.
+
+## Step 3: Install Sapat
+
+Sapat can be installed from the cloned repository using pip.
+
+```bash
+cd /workspaces/sapat
+pip install -e .
+```
+
+This installs the `sapat` CLI and all dependencies listed in
+`requirements.txt`, including `requests` for HTTP calls to the Gemini API.
+
+Verify the installation:
+
+```bash
+sapat --help
+```
+
+You should see usage information including the `--provider` flag.
+
+## Step 4: Transcribe Audio with Gemini
+
+Sapat accepts individual files or directories. It automatically converts
+non-MP3 media to MP3 using ffmpeg before sending the audio to the selected
+provider.
+
+### Step 4.1: Transcribe a Single File
+
+Place an audio or video file in your workspace (or copy one in), then run:
+
+```bash
+sapat recording.mp4 --provider gemini
+```
+
+Sapat will:
+
+1. Convert `recording.mp4` to `recording.mp3` using ffmpeg.
+2. Base64-encode the MP3 data.
+3. Send a `POST` request to `https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent`
+   with the audio as inline data and a transcription prompt.
+4. Extract the text from the Gemini response and write it to `recording.txt`.
+
+### Step 4.2: Specify a Language
+
+For non-English audio, pass the `--language` flag:
+
+```bash
+sapat interview_es.mp4 --provider gemini --language es
+```
+
+Sapat adds `Language: es` to the prompt it sends to Gemini, guiding the model
+toward Spanish transcription.
+
+### Step 4.3: Add a Custom Prompt
+
+Use `--prompt` to give Gemini additional context about the audio content. This
+is especially useful for domain-specific vocabulary:
+
+```bash
+sapat medical_lecture.mp4 --provider gemini \
+  --prompt "This is a cardiology lecture. Use standard medical terminology."
+```
+
+### Step 4.4: Adjust Quality and Temperature
+
+- `--quality` controls MP3 bitrate: `L` (low), `M` (medium, default), `H`
+  (high). Higher quality increases file size but may improve accuracy.
+- `--temperature` (0 to 1) controls generation randomness. Use `0` for
+  deterministic output.
+
+```bash
+sapat keynote.mp4 --provider gemini --quality H --temperature 0
+```
+
+### Step 4.5: Batch-Process a Directory
+
+Point Sapat at a directory to transcribe every `.mp4` file:
+
+```bash
+sapat ./recordings/ --provider gemini
+```
+
+Each file produces a corresponding `.txt` transcript in the same directory.
+
+## Step 5: Review the Output
+
+After transcription, Sapat creates a text file alongside the original media.
+Open it in your IDE or cat it from the terminal:
+
+```bash
+cat recording.txt
+```
+
+The transcript contains only the spoken text. If you passed a custom prompt,
+Gemini may also include speaker labels or section headers depending on your
+instructions.
+
+## How the Gemini Provider Works
+
+Understanding the provider internals helps with troubleshooting and
+customization.
+
+### Request Flow
+
+The Gemini provider in `sapat/providers/gemini.py` follows this sequence:
+
+1. **Audio encoding**: The provider reads the MP3 file and base64-encodes it.
+2. **Prompt construction**: A system-level prompt instructs Gemini to
+   transcribe the audio and return only the text. Language and custom prompt
+   are appended.
+3. **API call**: The provider sends a `POST` request to the Gemini
+   `generateContent` endpoint with `x-goog-api-key` authentication.
+4. **Response parsing**: The provider extracts text from the first candidate's
+   `content.parts[].text` field. If no candidates are returned, it raises a
+   `RuntimeError` with the `promptFeedback` for debugging.
+
+### Supported Models
+
+The Gemini provider supports model aliases:
+
+| Alias | Model |
+|---|---|
+| `gemini-2.0-flash` (default) | `gemini-2.0-flash` |
+| `flash` | `gemini-2.0-flash` |
+| `pro` | `gemini-2.0-pro` |
+| `flash15` | `gemini-1.5-flash` |
+
+To use a specific model:
+
+```bash
+sapat recording.mp4 --provider gemini --model pro
+```
+
+### Supported Audio Formats
+
+The Gemini provider accepts these formats for inline upload:
+
+- `.mp3` (`audio/mp3`)
+- `.wav` (`audio/wav`)
+- `.flac` (`audio/flac`)
+
+Other formats (`.mp4`, `.mkv`, `.webm`) are automatically converted to MP3
+by ffmpeg before reaching the provider.
+
+### File Size Limit
+
+The Gemini provider has a 14 MB file size limit for inline base64 audio. For
+longer recordings, consider splitting the audio with ffmpeg or using a
+different provider that supports streaming uploads.
+
+## Common Issues and Troubleshooting
+
+**Problem:** `RuntimeError: Gemini transcription failed with status 400`
+
+**Solution:** This usually means the audio file is too large for inline upload
+or the format is unsupported. Convert to MP3 manually and ensure the file is
+under 14 MB:
+
+```bash
+ffmpeg -i large_file.wav -b:a 64k small_file.mp3
+sapat small_file.mp3 --provider gemini
+```
+
+**Problem:** `RuntimeError: Gemini transcription returned no candidates`
+
+**Solution:** Gemini's safety filters may have blocked the request. Check the
+`promptFeedback` in the error message. If the audio contains content that
+triggers safety filters, try a different model (`--model flash15`) or adjust
+the prompt to be more neutral.
+
+**Problem:** `RuntimeError: Gemini transcription request failed: ConnectionError`
+
+**Solution:** Verify network connectivity and that the Generative Language API
+is enabled in your Google Cloud project. Test with curl:
+
+```bash
+curl -s -H "x-goog-api-key: $GOOGLE_API_KEY" "https://generativelanguage.googleapis.com/v1beta/models" | head
+```
+
+If you see a list of models, the API is enabled. If you see an error, enable
+the API in the [Google Cloud Console](https://console.cloud.google.com/apis/library/generativelanguage.googleapis.com).
+
+**Problem:** The transcript is in the wrong language
+
+**Solution:** Explicitly set the `--language` flag to match the audio:
+
+```bash
+sapat japanese_meeting.mp4 --provider gemini --language ja
+```
+
+**Problem:** `GOOGLE_API_KEY` is not recognized
+
+**Solution:** Ensure the environment variable is exported in the current shell
+session. Run `echo "${GOOGLE_API_KEY:0:6}..."` to verify without printing the
+full key. If only `...` appears, re-export the key or source your shell profile.
+
+## Using Gemini for Advanced Workflows
+
+Gemini's multimodal nature lets you go beyond simple transcription.
+
+### Transcription + Summarization
+
+Use a custom prompt to ask Gemini to both transcribe and summarize:
+
+```bash
+sapat meeting.mp4 --provider gemini \
+  --prompt "Transcribe this meeting. After the transcript, provide a 3-bullet summary of key decisions."
+```
+
+### Transcription + Translation
+
+Ask Gemini to transcribe and translate in one call:
+
+```bash
+sapat spanish_talk.mp4 --provider gemini \
+  --prompt "Transcribe this audio in Spanish, then provide an English translation below."
+```
+
+### Speaker Diarization
+
+While Gemini does not natively perform speaker diarization, you can prompt it
+to attempt speaker identification:
+
+```bash
+sapat podcast.mp4 --provider gemini \
+  --prompt "Identify different speakers. Label each segment with Speaker A, Speaker B, etc."
+```
+
+Results vary depending on audio quality and speaker distinctiveness.
+
+## Comparison with Other Providers
+
+Gemini differs from dedicated speech-to-text providers in several ways:
+
+- **Multimodal prompts**: Unlike Whisper-based providers, Gemini accepts
+  instructions alongside the audio, enabling combined tasks.
+- **No fine-tuning**: Gemini is a general-purpose model. For specialized
+  vocabulary, use the `--prompt` flag rather than fine-tuning.
+- **Rate limits**: Google enforces per-minute request quotas. Check your
+  [quota page](https://console.cloud.google.com/apis/api/generativelanguage.googleapis.com/quotas)
+  if you hit rate limits during batch processing.
+- **Cost**: Gemini pricing is per-token (input + output). Long audio files
+  generate many input tokens. Monitor usage in the
+  [Google AI Studio dashboard](https://aistudio.google.com/).
+
+## Conclusion
+
+You have set up a reproducible Daytona workspace that transcribes audio files
+using Google Gemini through Sapat. The workspace configuration can be committed
+to version control (without secrets), shared with teammates, and rebuilt
+identically on any machine with Docker.
+
+The Gemini provider's multimodal approach lets you combine transcription with
+summarization, translation, or structured extraction in a single API call,
+making it a versatile choice for AI engineers building audio processing
+pipelines.
+
+## References
+
+- [Sapat GitHub Repository](https://github.com/nibzard/sapat)
+- [Google Gemini API Documentation](https://ai.google.dev/docs)
+- [Google AI Studio](https://aistudio.google.com/)
+- [Daytona Documentation](https://www.daytona.io/docs/)
+- [Daytona Installation Guide](https://www.daytona.io/docs/installation/installation/)
+- [ffmpeg Documentation](https://ffmpeg.org/documentation.html)
diff --git a/guides/assets/20260618_guide_ai_transcription_with_google_gemini_workflow.svg b/guides/assets/20260618_guide_ai_transcription_with_google_gemini_workflow.svg
new file mode 100644
index 00000000..d660fcaa
--- /dev/null
+++ b/guides/assets/20260618_guide_ai_transcription_with_google_gemini_workflow.svg
@@ -0,0 +1,70 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 900 320" fill="none">
+  <defs>
+    <marker id="arrow" markerWidth="10" markerHeight="7" refX="10" refY="3.5" orient="auto">
+      <polygon points="0 0, 10 3.5, 0 7" fill="#4A90D9"/>
+    </marker>
+    <linearGradient id="headerGrad" x1="0%" y1="0%" x2="100%" y2="0%">
+      <stop offset="0%" style="stop-color:#4285F4;stop-opacity:1"/>
+      <stop offset="100%" style="stop-color:#34A853;stop-opacity:1"/>
+    </linearGradient>
+  </defs>
+
+  <!-- Background -->
+  <rect width="900" height="320" rx="12" fill="#F8F9FA" stroke="#E0E0E0" stroke-width="1"/>
+
+  <!-- Header -->
+  <rect x="0" y="0" width="900" height="48" rx="12" fill="url(#headerGrad)"/>
+  <rect x="0" y="24" width="900" height="24" fill="url(#headerGrad)"/>
+  <text x="450" y="32" text-anchor="middle" fill="white" font-family="Arial, sans-serif" font-size="16" font-weight="bold">Sapat + Google Gemini Transcription Pipeline</text>
+
+  <!-- Step 1: Media File -->
+  <rect x="30" y="80" width="120" height="60" rx="8" fill="#E3F2FD" stroke="#4285F4" stroke-width="2"/>
+  <text x="90" y="105" text-anchor="middle" fill="#1A73E8" font-family="Arial, sans-serif" font-size="11" font-weight="bold">Media File</text>
+  <text x="90" y="122" text-anchor="middle" fill="#5F6368" font-family="Arial, sans-serif" font-size="10">.mp4 / .wav / .flac</text>
+
+  <!-- Arrow 1 -->
+  <line x1="150" y1="110" x2="190" y2="110" stroke="#4A90D9" stroke-width="2" marker-end="url(#arrow)"/>
+
+  <!-- Step 2: ffmpeg -->
+  <rect x="190" y="80" width="120" height="60" rx="8" fill="#FFF3E0" stroke="#FB8C00" stroke-width="2"/>
+  <text x="250" y="105" text-anchor="middle" fill="#E65100" font-family="Arial, sans-serif" font-size="11" font-weight="bold">ffmpeg</text>
+  <text x="250" y="122" text-anchor="middle" fill="#5F6368" font-family="Arial, sans-serif" font-size="10">Convert to MP3</text>
+
+  <!-- Arrow 2 -->
+  <line x1="310" y1="110" x2="350" y2="110" stroke="#4A90D9" stroke-width="2" marker-end="url(#arrow)"/>
+
+  <!-- Step 3: Sapat CLI -->
+  <rect x="350" y="80" width="120" height="60" rx="8" fill="#E8F5E9" stroke="#43A047" stroke-width="2"/>
+  <text x="410" y="105" text-anchor="middle" fill="#2E7D32" font-family="Arial, sans-serif" font-size="11" font-weight="bold">Sapat CLI</text>
+  <text x="410" y="122" text-anchor="middle" fill="#5F6368" font-family="Arial, sans-serif" font-size="10">Base64 encode</text>
+
+  <!-- Arrow 3 -->
+  <line x1="470" y1="110" x2="510" y2="110" stroke="#4A90D9" stroke-width="2" marker-end="url(#arrow)"/>
+
+  <!-- Step 4: Gemini API -->
+  <rect x="510" y="70" width="160" height="80" rx="8" fill="#E8EAF6" stroke="#5C6BC0" stroke-width="2"/>
+  <text x="590" y="95" text-anchor="middle" fill="#283593" font-family="Arial, sans-serif" font-size="11" font-weight="bold">Gemini API</text>
+  <text x="590" y="112" text-anchor="middle" fill="#5F6368" font-family="Arial, sans-serif" font-size="10">generateContent</text>
+  <text x="590" y="128" text-anchor="middle" fill="#5F6368" font-family="Arial, sans-serif" font-size="10">gemini-2.0-flash</text>
+
+  <!-- Arrow 4 -->
+  <line x1="670" y1="110" x2="710" y2="110" stroke="#4A90D9" stroke-width="2" marker-end="url(#arrow)"/>
+
+  <!-- Step 5: Transcript -->
+  <rect x="710" y="80" width="150" height="60" rx="8" fill="#F3E5F5" stroke="#AB47BC" stroke-width="2"/>
+  <text x="785" y="105" text-anchor="middle" fill="#7B1FA2" font-family="Arial, sans-serif" font-size="11" font-weight="bold">Transcript Output</text>
+  <text x="785" y="122" text-anchor="middle" fill="#5F6368" font-family="Arial, sans-serif" font-size="10">.txt file</text>
+
+  <!-- Environment Variables Box -->
+  <rect x="250" y="200" width="400" height="80" rx="8" fill="#FFF8E1" stroke="#FFB300" stroke-width="2"/>
+  <text x="450" y="225" text-anchor="middle" fill="#F57F17" font-family="Arial, sans-serif" font-size="12" font-weight="bold">Environment Configuration</text>
+  <text x="450" y="245" text-anchor="middle" fill="#5F6368" font-family="Arial, sans-serif" font-size="10">GOOGLE_API_KEY — Required for Gemini authentication</text>
+  <text x="450" y="262" text-anchor="middle" fill="#5F6368" font-family="Arial, sans-serif" font-size="10">GEMINI_API_KEY — Alternative alias for the same key</text>
+
+  <!-- Connection from env to Gemini -->
+  <line x1="450" y1="200" x2="590" y2="150" stroke="#FFB300" stroke-width="1.5" stroke-dasharray="4,3" marker-end="url(#arrow)"/>
+
+  <!-- Footer -->
+  <text x="450" y="305" text-anchor="middle" fill="#9E9E9E" font-family="Arial, sans-serif" font-size="10">Sapat Provider: gemini | Model: gemini-2.0-flash | Max file size: 14 MB</text>
+</svg>