Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .mcp.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
"args": ["run", "sanzaru"],
"env": {
"OPENAI_API_KEY": "${OPENAI_API_KEY}",
"SANZARU_MEDIA_PATH": "${SANZARU_MEDIA_PATH}"
"SANZARU_MEDIA_PATH": "${SANZARU_MEDIA_PATH}",
"GOOGLE_API_KEY": "${GOOGLE_API_KEY}",
"GOOGLE_GENAI_USE_VERTEXAI": "${GOOGLE_GENAI_USE_VERTEXAI}",
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}",
"GOOGLE_CLOUD_LOCATION": "${GOOGLE_CLOUD_LOCATION}"
}
}
}
Expand Down
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,13 @@ audio = [
image = [
"pillow>=12.0.0",
]
google = [
"google-genai>=1.0.0",
"pillow>=12.0.0",
]
databricks = [] # httpx already a core dep; extra exists for signaling intent
all = [
"sanzaru[video,audio,image]", # databricks intentionally excluded from "all"
"sanzaru[video,audio,image,google]", # databricks intentionally excluded from "all"
]

[dependency-groups]
Expand Down
70 changes: 69 additions & 1 deletion src/sanzaru/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,20 @@
- Logging setup
"""

from __future__ import annotations

import logging
import os
import pathlib
import sys
from functools import lru_cache
from typing import Literal
from typing import TYPE_CHECKING, Literal

from openai import AsyncOpenAI

if TYPE_CHECKING:
from google import genai

# ---------- Logging configuration ----------
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
logging.basicConfig(
Expand All @@ -43,6 +48,69 @@ def get_client() -> AsyncOpenAI:
return AsyncOpenAI(api_key=api_key)


# ---------- Google Gen AI client (stateless) ----------
def get_google_client() -> genai.Client:
    """Build a Google Gen AI client from environment variables.

    Two backends are supported; the flag GOOGLE_GENAI_USE_VERTEXAI selects
    between them, and authentication is driven entirely by the environment
    (no explicit credential objects are constructed here).

    Vertex AI (GOOGLE_GENAI_USE_VERTEXAI=True):
        * Express mode — GOOGLE_API_KEY alone authenticates the client.
          When an API key is present it takes precedence; project/location
          are not passed (the SDK treats api_key and project/location as
          mutually exclusive).
        * Standard mode (ADC) — GOOGLE_CLOUD_PROJECT is required;
          GOOGLE_CLOUD_LOCATION defaults to "us-central1". Credentials come
          from the ambient environment (service account key via
          GOOGLE_APPLICATION_CREDENTIALS, gcloud login, attached SA, WIF).

    Gemini Developer API (flag unset/false):
        GOOGLE_API_KEY is required.

    Returns:
        A configured Google Gen AI ``Client``.

    Raises:
        ImportError: If the google-genai package is not installed.
        RuntimeError: If the required environment variables are missing.
    """
    # Imported lazily so the rest of the server works without the extra.
    try:
        from google import genai
    except ImportError as exc:
        raise ImportError("google-genai package is required. Install with: uv add 'sanzaru[google]'") from exc

    vertex_flag = os.getenv("GOOGLE_GENAI_USE_VERTEXAI", "").lower()

    if vertex_flag in ("true", "1"):
        google_api_key = os.getenv("GOOGLE_API_KEY")

        # Express mode wins whenever an API key is present: the SDK rejects
        # api_key combined with project/location, so pass only the key.
        if google_api_key:
            return genai.Client(vertexai=True, api_key=google_api_key)

        project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
        if not project_id:
            raise RuntimeError(
                "Vertex AI requires GOOGLE_CLOUD_PROJECT (ADC/service-account auth) "
                "or GOOGLE_API_KEY (Express mode) when GOOGLE_GENAI_USE_VERTEXAI=True"
            )

        # Standard mode: project + location, credentials resolved via ADC.
        region = os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1")
        return genai.Client(vertexai=True, project=project_id, location=region)

    # Gemini Developer API path: a bare API key is the only requirement.
    google_api_key = os.getenv("GOOGLE_API_KEY")
    if google_api_key:
        return genai.Client(api_key=google_api_key)
    raise RuntimeError(
        "Google credentials not configured. "
        "Set GOOGLE_GENAI_USE_VERTEXAI=True + GOOGLE_CLOUD_PROJECT (Vertex AI) "
        "or GOOGLE_API_KEY (Gemini Developer API)"
    )


# ---------- Path configuration (runtime) ----------

# Mapping from path_type to (individual env var, subdirectory under SANZARU_MEDIA_PATH)
Expand Down
150 changes: 76 additions & 74 deletions src/sanzaru/descriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,98 +185,100 @@

# ==================== IMAGE GENERATION TOOL DESCRIPTIONS ====================

CREATE_IMAGE = """Non-blocking async image generation with gpt-image-1.5 support.
CREATE_IMAGE = """Create an async image generation job via OpenAI Responses API.

Creates images from text prompts OR edits existing images by providing reference images.
Returns immediately with a response_id - use get_image_status() to poll for completion.
Supports iterative refinement via previous_response_id.
Returns immediately with a response_id. Poll with get_image_status() until completed, then download_image().
Best for: parallel generation (multiple images at once) and iterative refinement chains (previous_response_id).

**Best for:** parallel generation (multiple images at once), iterative refinement chains,
and workflows where you need to do other work while images generate.
For simple one-shot generation, generate_image is simpler (no polling needed).

**Text-only generation (no input_images):**
- Generates image from scratch based on prompt

**Image editing (with input_images):**
- Modifies existing images based on prompt
- Combines multiple images into new composition
- First image receives highest detail preservation
- Prompt describes desired changes, not what's already in images
For synchronous one-shot generation (no polling), use generate_image instead.
For Google Nano Banana generation, use create_image_google.

Parameters:
- prompt: Text description (required)
* Without input_images: Describe what to generate
* With input_images: Describe what changes to make
- model: Mainline model - "gpt-5.2" (default), "gpt-5.1", "gpt-5", etc.
- tool_config: Optional ImageGeneration configuration object (optional)
* Supports all fields: model, size, quality, moderation, input_fidelity, etc.
* MCP library handles serialization automatically
* See examples below for common configurations
- previous_response_id: Refine previous image iteratively (optional)
- input_images: List of filenames from IMAGE_PATH (optional)
* Example: ["cat.png"] or ["lotion.jpg", "soap.png", "bomb.jpg"]
* Use list_reference_images() to discover available images
* Supported formats: JPEG, PNG, WEBP
- mask_filename: PNG with alpha channel for inpainting (optional)
* Defines which region of first input image to edit
* Transparent = edit this area, black = keep original
* Requires input_images parameter

**Image generation models (tool_config.model):**
- gpt-image-1.5: STATE-OF-THE-ART (RECOMMENDED) - Best quality, better instruction following, improved text rendering
- gpt-image-1: High quality image generation
- gpt-image-1-mini: Fast, cost-effective generation

Common tool_config examples:

Best quality with GPT Image 1.5:
tool_config={"type": "image_generation", "model": "gpt-image-1.5"}

Fast generation with mini model:
tool_config={"type": "image_generation", "model": "gpt-image-1-mini"}

Lower content moderation:
tool_config={"type": "image_generation", "moderation": "low"}

High-fidelity with custom settings:
tool_config={
"type": "image_generation",
"model": "gpt-image-1.5",
"quality": "high",
"input_fidelity": "high",
"size": "1536x1024"
}
- prompt: Text description of image to generate (required)
- model: OpenAI model ID (default: "gpt-5.2")
- tool_config: ImageGeneration config object to control the image generation tool:
* gpt-image-1.5: STATE-OF-THE-ART (recommended)
* gpt-image-1: High quality
* gpt-image-1-mini: Fast, cost-effective
- previous_response_id: Refine a previous generation iteratively (optional)
- input_images: List of reference image filenames from IMAGE_PATH (optional)
- mask_filename: PNG with alpha channel for inpainting (optional, requires input_images)

Returns ImageResponse with {id, status, created_at} — poll then download.

Workflows:

1. Text-only generation (recommended):
1. Text-only generation:
create_image("sunset over mountains", tool_config={"type": "image_generation", "model": "gpt-image-1.5"})

2. Single image editing:
2. Image editing:
create_image("add a flamingo to the pool", input_images=["lounge.png"])

3. Multi-image composition:
create_image("gift basket with all these items", input_images=["lotion.png", "soap.png", "bomb.jpg"])
create_image("gift basket with all items", input_images=["lotion.png", "soap.png"])

4. High-fidelity logo placement:
create_image(
"add logo to woman's shirt",
input_images=["woman.jpg", "logo.png"],
tool_config={"type": "image_generation", "input_fidelity": "high"}
)

5. Masked inpainting:
4. Masked inpainting:
create_image("add flamingo", input_images=["pool.png"], mask_filename="pool_mask.png")

6. Fast generation with mini model:
create_image("quick sketch of a cat", tool_config={"type": "image_generation", "model": "gpt-image-1-mini"})

7. Iterative refinement:
5. Iterative refinement:
resp1 = create_image("a cyberpunk character")
resp2 = create_image("add neon details", previous_response_id=resp1.id)

Returns ImageResponse with: id, status, created_at"""
tool_config examples:
Best quality: {"type": "image_generation", "model": "gpt-image-1.5"}
Fast: {"type": "image_generation", "model": "gpt-image-1-mini"}
High-fidelity: {"type": "image_generation", "model": "gpt-image-1.5", "quality": "high", "size": "1536x1024"}"""

CREATE_IMAGE_GOOGLE = """Generate an image using Google Nano Banana (Gemini image models). Synchronous — image ready immediately.

No polling required. Returns the saved filename, dimensions, and format directly.
Supports reference images for editing, style transfer, and multi-image composition (up to 14 images).

Models:
- "gemini-3.1-flash-image-preview": Nano Banana 2 (DEFAULT, RECOMMENDED) — Flash speed + Pro quality, thinking-enhanced
- "gemini-3-pro-image-preview": Nano Banana Pro — max quality, complex instructions, slowest
- "gemini-2.5-flash-image": Nano Banana — fastest, high-volume generation

Parameters:
- prompt: Text description (required). When using input_images, describe only the desired edits/transformation.
- model: Google model ID (default: "gemini-3.1-flash-image-preview")
- aspect_ratio: "1:1" (default), "16:9", "9:16", "4:3", "3:4", "3:2", "2:3", "21:9", "5:4", "4:5"
- image_size: Output resolution: "1K" (default), "2K", "4K"
- filename: Custom output filename (auto-generated if omitted)
- input_images: List of reference image filenames from IMAGE_PATH (optional, max 14).
Supported formats: JPEG, PNG, WEBP. Use list_reference_images to find available images.
- safety_settings: List of {"category", "threshold"} dicts. All OFF by default.
Categories: HARM_CATEGORY_HATE_SPEECH, HARM_CATEGORY_DANGEROUS_CONTENT,
HARM_CATEGORY_SEXUALLY_EXPLICIT, HARM_CATEGORY_HARASSMENT
Thresholds: "OFF" (default), "BLOCK_LOW_AND_ABOVE", "BLOCK_MEDIUM_AND_ABOVE", "BLOCK_HIGH_AND_ABOVE"

Returns ImageDownloadResult with {filename, size, format} — ready immediately.

Workflows:

1. Text-only generation:
create_image_google("a futuristic cityscape at dusk")

2. Landscape with high resolution:
create_image_google("mountain vista at golden hour", aspect_ratio="16:9", image_size="4K")

3. Max quality (Nano Banana Pro):
create_image_google("detailed product render", model="gemini-3-pro-image-preview")

4. Image editing with reference:
create_image_google("make this watercolor style", input_images=["photo.png"])

5. Multi-image composition:
create_image_google("combine these into a collage", input_images=["img1.png", "img2.png", "img3.png"])

6. Character consistency (same character, new scene):
create_image_google("place this character in a forest", input_images=["character.png"])

7. Style transfer from reference:
create_image_google("apply this art style to a cityscape", input_images=["style_ref.png"])

8. Custom filename:
create_image_google("a cute robot", filename="robot_concept.png")"""

GET_IMAGE_STATUS = """Check status and progress of image generation.

Expand Down
44 changes: 44 additions & 0 deletions src/sanzaru/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,49 @@ def check_databricks_storage() -> bool:
return True


def check_google_available() -> bool:
    """Report whether Google Nano Banana image generation can be used.

    Availability requires both of:
        1. The google-genai package (installed via the ``google`` extra).
        2. Credentials, via either:
           - GOOGLE_GENAI_USE_VERTEXAI=True plus GOOGLE_CLOUD_PROJECT and/or
             GOOGLE_API_KEY (Vertex AI — ADC or Express mode), or
           - GOOGLE_API_KEY alone (Gemini Developer API).

    Returns:
        True if google-genai is installed and credentials are configured, False otherwise
    """
    # Package check first; without the extra nothing else matters.
    try:
        import google.genai  # noqa: F401
    except ImportError:
        return False

    flag = os.getenv("GOOGLE_GENAI_USE_VERTEXAI", "").lower()

    if flag not in ("true", "1"):
        # Gemini Developer API path: a bare API key is sufficient.
        if os.getenv("GOOGLE_API_KEY"):
            logger.info("Google Nano Banana available via Gemini Developer API")
            return True
        return False

    # Vertex AI path: report which auth mode will be used (an API key takes
    # precedence over project-based ADC, mirroring get_google_client).
    project = os.getenv("GOOGLE_CLOUD_PROJECT")
    api_key = os.getenv("GOOGLE_API_KEY")

    if api_key and project:
        logger.info(
            "Google Nano Banana available via Vertex AI Express (api_key takes precedence, project=%s ignored)",
            project,
        )
        return True
    if project:
        logger.info("Google Nano Banana available via Vertex AI ADC (project=%s)", project)
        return True
    if api_key:
        logger.info("Google Nano Banana available via Vertex AI Express (api_key only)")
        return True

    logger.info("GOOGLE_GENAI_USE_VERTEXAI=True but neither GOOGLE_CLOUD_PROJECT nor GOOGLE_API_KEY set")
    return False


def get_available_features() -> dict[str, bool]:
"""Get a dictionary of available features.

Expand All @@ -130,4 +173,5 @@ def get_available_features() -> dict[str, bool]:
"video": check_video_available(),
"audio": check_audio_available(),
"image": check_image_available(),
"google": check_google_available(),
}
Loading
Loading