diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index b6f8c8c..5161251 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -11,7 +11,7 @@ "name": "agora", "source": "./", "description": "Real-time communication with Agora SDKs — RTC, RTM, Conversational AI, and token generation", - "version": "1.1.0" + "version": "1.2.0" } ] } diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index ba113b9..dd532da 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "agora", "description": "Real-time communication with Agora SDKs — RTC, RTM, Conversational AI, and token generation", - "version": "1.1.0", + "version": "1.2.0", "author": { "name": "Agora" }, diff --git a/CHANGELOG.md b/CHANGELOG.md index b07c2b3..804ff77 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,32 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
+## [1.2.0] + +### Added + +- RTC React Native reference (`references/rtc/react-native.md`) — `react-native-agora`: engine init, events, `RtcSurfaceView`, cleanup +- RTC Flutter reference (`references/rtc/flutter.md`) — `agora_rtc_engine`: engine init, `AgoraVideoView`, `RtcEngineEventHandler`, cleanup +- RTM iOS reference (`references/rtm/ios.md`) — `AgoraRtmClientKit` v2 (Swift): init, login, subscribe, publish, presence, delegate +- RTM Android reference (`references/rtm/android.md`) — `RtmClient` v2 (Kotlin): init, login, subscribe, publish, event listener +- ConvoAI iOS toolkit reference (`references/conversational-ai/agent-toolkit-ios.md`) — `ConversationalAIAPIImpl` Swift patterns +- ConvoAI Android toolkit reference (`references/conversational-ai/agent-toolkit-android.md`) — `ConversationalAIAPIImpl` Kotlin patterns +- Multi-product integration guide (`references/integration-patterns.md`) — RTC+RTM+ConvoAI init order, UID strategy, channel naming, token matrix, codec selection, cleanup sequence +- Testing guidance expanded — RTC React Native, Flutter, RTM Web/iOS/Android mocking patterns; token renewal section; table of contents + +### Changed + +- `rtc/react.md`: add codec interop note — `vp8` recommended; `vp9` hardware-limited on older iOS Safari; `h264` does not scale for multi-user +- `rtc/cross-platform-coordination.md`: corrected codec table — `vp8` is the safe default; `vp9` requires iPhone 15 Pro / M3+ hardware on iOS Safari; `h264` avoid for multi-user +- `rtc/README.md`: updated codec interop note to match corrected recommendation +- `rtm/ios.md`, `rtm/android.md`: added v2 to titles to prevent v1 API misuse +- `rtm/README.md`: added Platform Scope section clarifying client-side only, all v2, no server/desktop variant +- `rtm/web.md`: removed RTM v1 legacy section; constructor wrapped in try/catch; token-only login form +- `conversational-ai/README.md`: added SDK-vs-REST routing table; RTM channel name = RTC channel name gotcha; scoped auth 
section to direct REST implementors +- `conversational-ai/auth-flow.md`: scoped to REST API implementors; added SDK-skip callout at top +- `SKILL.md`: bumped version to 1.2.0; added Multi-Product Integration entry; expanded RTC platform list (React Native, Flutter) and RTM platform list (iOS, Android) +- `CLAUDE.md`, `README.md`: updated file structure trees and product lists to reflect all new files and platforms + ## [1.1.0] ### Added diff --git a/CLAUDE.md b/CLAUDE.md index 4fdd9cf..d7479b8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -14,9 +14,10 @@ skills/ └── references/ ├── doc-fetching.md ← two-tier lookup procedure (agent-facing) ├── mcp-tools.md ← Agora MCP server install guide (user-facing) - ├── rtc/ - ├── rtm/ - ├── conversational-ai/ + ├── integration-patterns.md ← multi-product coordination (RTC+RTM+ConvoAI) + ├── rtc/ ← Web, React, Next.js, iOS, Android, React Native, Flutter + ├── rtm/ ← Web, iOS, Android (all v2) + ├── conversational-ai/ ← REST API, server SDKs, iOS/Android toolkits, React hooks ├── server/ ├── cloud-recording/ ├── server-gateway/ @@ -45,9 +46,13 @@ Before adding any inline content, ask: **will this still be correct in 6 months |---|---| | RTC initialization, track management, event registration | Inline — `references/rtc/` | | RTM messaging and presence patterns | Inline — `references/rtm/` | -| Token generation (RTC, RTM, AccessToken2) | Inline — `references/server/` | -| ConvoAI gotchas, field-type rules, lifecycle | Inline — `references/conversational-ai/README.md` | -| ConvoAI quick-start code (Python, Go, Java) | Level 2 fetch | +| Token generation (RTC, RTM, AccessToken2) | Inline — `references/server/` (`server/` is token generation only — not ConvoAI SDKs) | +| ConvoAI gotchas, field-type rules, REST API lifecycle | Inline — `references/conversational-ai/README.md` | +| ConvoAI server SDK patterns (auth modes, state machine, session lifecycle, avatar+TTS) | Inline — `references/conversational-ai/server-sdks.md` | +| ConvoAI 
auth flow (three-token distinction, sequence diagram) | Inline — `references/conversational-ai/auth-flow.md` | +| ConvoAI Python SDK patterns (sync/async, deprecations, debug) | Inline — `references/conversational-ai/python-sdk.md` | +| ConvoAI Go SDK patterns (context.Context, functional options, status constants) | Inline — `references/conversational-ai/go-sdk.md` | +| ConvoAI quick-start code with vendor-specific imports (Python, Go, Java) | Level 2 fetch | | ConvoAI full request/response schemas | Level 2 fetch | | TTS / ASR / LLM vendor configs and model names | Level 2 fetch | | Cloud Recording REST API field details | Level 2 fetch | diff --git a/README.md b/README.md index 9b9b786..e71e613 100644 --- a/README.md +++ b/README.md @@ -66,13 +66,14 @@ This repo contains markdown skill files that give AI coding assistants deep know **Products covered:** -- **RTC (Video/Voice SDK)** — Web, React, Next.js, iOS (Swift), Android (Kotlin/Java) -- **RTM (Signaling)** — Web (JS/TS) messaging, presence, metadata, stream channels -- **Conversational AI** — REST API, agent config, Gemini Live + OpenAI Realtime MLLM, 6 recipe repos (agent-samples, agent-toolkit, agent-client-toolkit-react, agent-ui-kit, server-custom-llm, server-mcp) +- **RTC (Video/Voice SDK)** — Web, React, Next.js, iOS (Swift), Android (Kotlin/Java), React Native, Flutter +- **RTM (Signaling)** — Web (JS/TS), iOS (Swift), Android (Kotlin) — all v2; messaging, presence, metadata, stream channels +- **Conversational AI** — REST API, agent config, Gemini Live + OpenAI Realtime MLLM, iOS/Android toolkits, 6 recipe repos (agent-samples, agent-toolkit, agent-client-toolkit-react, agent-ui-kit, server-custom-llm, server-mcp) - **Cloud Recording** — REST API acquire/start/query/stop lifecycle - **Server Gateway** — Linux SDK (C++) for server-side RTC - **Server-Side** — Token generation for Node.js, Python, Go -- **Testing Guidance** — ConvoAI and RTC testing patterns +- **Multi-Product Integration** — RTC + 
RTM + ConvoAI initialization order, UID strategy, codec selection, token matrix +- **Testing Guidance** — Mocking patterns for all platforms (Web, React, iOS, Android, React Native, Flutter, RTM) ## Design — 4-Layer Progressive Disclosure @@ -104,30 +105,42 @@ ConvoAI files are aligned 1:1 with repos in [AgoraIO-Conversational-AI](https:// ``` skills/ └── agora/ Skill root - ├── SKILL.md Entry point, product index + ├── SKILL.md Entry point, product index (v1.2.0) ├── intake/ │ └── SKILL.md Multi-product needs analysis router └── references/ ├── doc-fetching.md Two-tier lookup procedure (agent-facing) ├── mcp-tools.md MCP tool reference and graceful degradation + ├── integration-patterns.md RTC+RTM+ConvoAI: init order, UID strategy, codec, tokens ├── rtc/ RTC (Video/Voice SDK) │ ├── README.md Critical rules, encoder profiles, cross-platform notes │ ├── web.md agora-rtc-sdk-ng: client, tracks, events, screen share - │ ├── react.md agora-rtc-react: hooks, custom patterns + │ ├── react.md agora-rtc-react: hooks, codec interop, custom patterns │ ├── nextjs.md Next.js / SSR dynamic import patterns │ ├── ios.md AgoraRtcEngineKit (Swift): setup, delegation - │ └── android.md RtcEngine (Kotlin/Java): setup, callbacks - ├── rtm/ RTM (Signaling / Messaging) - │ ├── README.md Key concepts, platform links - │ └── web.md agora-rtm v2: messaging, presence, stream channels + │ ├── android.md RtcEngine (Kotlin/Java): setup, callbacks + │ ├── react-native.md react-native-agora: engine init, events, video views + │ ├── flutter.md agora_rtc_engine (Dart): engine init, AgoraVideoView + │ └── cross-platform-coordination.md UID strategy, codec interop, screen share + ├── rtm/ RTM Signaling SDK v2 + │ ├── README.md Key concepts, gotchas, platform links + │ ├── web.md agora-rtm v2: messaging, presence, stream channels + │ ├── ios.md AgoraRtmClientKit (Swift): init, login, subscribe, publish + │ └── android.md RtmClient (Kotlin): init, login, subscribe, publish ├── conversational-ai/ 
Conversational AI (Voice AI Agents) │ ├── README.md Architecture, endpoints, auth, lifecycle, gotchas │ ├── agent-samples.md Backend, React clients, profiles, MLLM, deployment │ ├── agent-toolkit.md @agora/conversational-ai SDK: API, helpers, hooks │ ├── agent-client-toolkit-react.md React hooks: provider, transcript, state │ ├── agent-ui-kit.md @agora/agent-ui-kit React components + │ ├── agent-toolkit-ios.md iOS ConversationalAIAPIImpl Swift toolkit + │ ├── agent-toolkit-android.md Android ConversationalAIAPIImpl Kotlin toolkit │ ├── server-custom-llm.md Custom LLM proxy: RAG, tools, memory - │ └── server-mcp.md MCP memory server: persistent per-user memory + │ ├── server-mcp.md MCP memory server: persistent per-user memory + │ ├── auth-flow.md Three-token flow for direct REST API implementors + │ ├── python-sdk.md agora-agent Python SDK patterns + │ ├── go-sdk.md agora-agent-server-sdk-go patterns + │ └── server-sdks.md TypeScript/Node.js server SDK patterns ├── cloud-recording/ Cloud Recording (REST API) │ └── README.md acquire/start/query/stop lifecycle, storage config ├── server-gateway/ Server Gateway (Linux SDK) @@ -137,7 +150,7 @@ skills/ │ ├── README.md Token types, when tokens are needed │ └── tokens.md Token generation TOC + links to official docs └── testing-guidance/ Testing Patterns - └── SKILL.md ConvoAI and RTC test setup, mocking patterns + └── SKILL.md Mocking patterns: Web, React, iOS, Android, RN, Flutter, RTM ``` ## Maintaining and Extending diff --git a/skills/agora/SKILL.md b/skills/agora/SKILL.md index ea5e91f..f46166d 100644 --- a/skills/agora/SKILL.md +++ b/skills/agora/SKILL.md @@ -3,12 +3,12 @@ name: agora description: Write code using Agora SDKs (agora.io) for real-time communication. Covers RTC (video/voice calling, live streaming, screen sharing), RTM (signaling, messaging, presence), Conversational AI (voice AI agents), Cloud Recording, Server Gateway, and server-side token generation. 
Use when the user wants to build real-time audio/video applications, integrate Agora SDKs (Web JS/TS, React, iOS Swift, Android Kotlin/Java, Go, Python), manage channels, tracks, tokens, use RTM for messaging/signaling, record RTC sessions, or build Conversational AI with the agent-toolkit. Triggers on mentions of Agora, agora.io, RTC, RTM, video calling, voice calling, real-time communication, screen share, screen sharing, record session, record calls, Cloud Recording, Server Gateway, Linux media SDK, agora-rtc-sdk-ng, agora-rtc-react, agora-rtm, conversational AI with Agora, Agora token generation, Agora authentication, agora-agent-client-toolkit, agora-agent-client-toolkit-react, agora-agent-server-sdk, AgoraVoiceAI, AgoraClient, useConversationalAI, useTranscript, useAgentState, agent transcript, agent state hook. metadata: author: agora - version: '1.1.0' + version: '1.2.0' --- # Agora (agora.io) -Skill version: 1.1.0 +Skill version: 1.2.0 Build real-time communication applications using Agora SDKs across Web, iOS, Android, and server-side platforms. @@ -28,13 +28,13 @@ Read the README for the product the user needs. Only load what is needed. Real-time audio and video. Users join channels, publish local tracks, subscribe to remote tracks. -**[references/rtc/README.md](references/rtc/README.md)** — Platforms: Web, React, iOS, Android +**[references/rtc/README.md](references/rtc/README.md)** — Platforms: Web, React, Next.js, iOS, Android, React Native, Flutter. Windows/Electron/Unity exist but require Level 2 fetch — no inline reference files. ### RTM (Signaling / Messaging) Text messaging, signaling, presence, and metadata. Independent from RTC — channel namespaces are separate. -**[references/rtm/README.md](references/rtm/README.md)** — Platforms: Web +**[references/rtm/README.md](references/rtm/README.md)** — Platforms: Web, iOS, Android (all v2). RTM is a client-side SDK — there is no server-side or desktop (Electron/Windows) RTM variant. 
### Conversational AI (Voice AI Agents) @@ -60,6 +60,12 @@ Self-hosted Linux SDK for server-side audio/video stream transmission. Use when **[references/server-gateway/README.md](references/server-gateway/README.md)** — C++, Java, Go, Python +### Multi-Product Integration + +Initialization order, UID strategy, channel naming, token matrix, and cleanup sequence when combining RTC + RTM + ConvoAI. + +**[references/integration-patterns.md](references/integration-patterns.md)** + ### Testing Guidance Mocking patterns and testing requirements for Agora SDK integration code. diff --git a/skills/agora/references/conversational-ai/README.md b/skills/agora/references/conversational-ai/README.md index 9f0b358..4d36f49 100644 --- a/skills/agora/references/conversational-ai/README.md +++ b/skills/agora/references/conversational-ai/README.md @@ -2,8 +2,19 @@ REST API-driven voice AI agents. Create agents that join RTC channels and converse with users via speech. Front-end clients connect via RTC+RTM. -The TypeScript, Go, and Python SDKs are convenience wrappers around this REST API. -For any other backend language (Java, Ruby, PHP, C#, etc.), call the REST API directly. +## SDK vs. Direct REST API + +**Default to the SDK for the user's backend language.** The TypeScript, Go, and Python SDKs wrap the REST API and handle auth, token generation, and session lifecycle automatically. + +| Backend language | Path | +|---|---| +| TypeScript / Node.js | `agora-agent-server-sdk` — see [server-sdks.md](server-sdks.md) | +| Python | `agora-agent` — see [python-sdk.md](python-sdk.md) | +| Go | `agora-agent-server-sdk-go` — see [go-sdk.md](go-sdk.md) | +| Java, Ruby, PHP, C#, other | Call the REST API directly — see [auth-flow.md](auth-flow.md) | + +Direct REST API use is fully supported for languages without an SDK. The [auth-flow.md](auth-flow.md) file covers the end-to-end auth and token flow for REST API implementors. 
If the user has an SDK available, start with that instead — the SDK eliminates the need to manually build tokens for the ConvoAI server. + The live OpenAPI spec is the authoritative source for request/response schemas: ``` @@ -45,9 +56,11 @@ current and covers every endpoint and field: See [../doc-fetching.md](../doc-fetching.md) for the full procedure. -## Authentication +## Authentication (Direct REST API) -Two methods are supported. **Token-based auth is preferred** — it avoids storing long-lived Customer Secret credentials on your server. +This section covers auth for implementors calling the REST API directly (non-SDK languages). **If the user has a TypeScript/Python/Go SDK available, skip this — the SDK handles auth automatically in App Credentials mode.** See [server-sdks.md](server-sdks.md) instead. + +Two methods are supported for direct REST API calls. **Token-based auth is preferred** — it avoids storing long-lived Customer Secret credentials on your server. ### Option A: Agora Token (recommended) @@ -114,6 +127,7 @@ Things the official docs don't emphasize that cause frequent mistakes: - **`/speak` priority enum** — `"INTERRUPT"` (immediate, default), `"APPEND"` (queued after current speech), `"IGNORE"` (skip if agent is busy). `interruptable: false` prevents users from cutting in. - **20 PCU default limit** — max 20 concurrent agents per App ID. Exceeding returns error on `/join`. Contact Agora support to increase. - **Event notifications require two flags** — `advanced_features.enable_rtm: true` AND `parameters.data_channel: "rtm"` in the join config. Without both, `onAgentStateChanged`/`onAgentMetrics`/`onAgentError` won't fire. Additionally: `parameters.enable_metrics: true` for metrics, `parameters.enable_error_message: true` for errors. +- **RTM channel name matches the RTC channel name** — the agent publishes transcripts and state events to the RTM channel with the same name as the RTC channel it joined. 
Subscribe the RTM client to the same channel name you passed to the agent's `properties.channel`. - **Custom LLM interruptable metadata** — the first SSE chunk can be `{"object": "chat.completion.custom_metadata", "metadata": {"interruptable": false}}` to prevent user speech from interrupting critical responses (e.g., compliance disclaimers). Subsequent chunks use standard `chat.completion.chunk` format. - **Error response format** — non-200 responses return `{ "detail": "...", "reason": "..." }`. - **MLLM `location` not `region`** — use `params.location: "us-central1"`, not `region`. The field name is `location` at every level (join payload and backend env vars). @@ -135,14 +149,22 @@ For test setup and mocking patterns, see [references/testing-guidance/SKILL.md]( ## Reference Files -Each file maps to one repo in [AgoraIO-Conversational-AI](https://github.com/AgoraIO-Conversational-AI): - -- **[agent-samples.md](agent-samples.md)** — Backend (simple-backend), React clients, profile system, MLLM/Gemini config, deployment -- **[agent-toolkit.md](agent-toolkit.md)** — `agora-agent-client-toolkit` + `agora-agent-client-toolkit-react`: AgoraVoiceAI, events, transcript, sendText, interrupt, React hooks -- **[agent-client-toolkit-react.md](agent-client-toolkit-react.md)** — React hooks detail: ConversationalAIProvider, useTranscript, useAgentState, useAgentError, useAgentMetrics, useConversationalAI -- **[agent-ui-kit.md](agent-ui-kit.md)** — `@agora/agent-ui-kit` React components: voice, chat, video, settings -- **[server-custom-llm.md](server-custom-llm.md)** — Custom LLM proxy: RAG, tool calling, conversation memory -- **[server-mcp.md](server-mcp.md)** — MCP memory server: persistent per-user memory via tool calling +Use the file that matches what the user is building: + +| User's question / task | Read this file | +|---|---| +| Node.js/Python/Go backend — starting agent, auth, session lifecycle | [server-sdks.md](server-sdks.md) | +| Python SDK specifics (async, 
deprecations, debug) | [python-sdk.md](python-sdk.md) | +| Go SDK specifics (context, builder, status constants) | [go-sdk.md](go-sdk.md) | +| Auth flow, token types, direct REST API (non-SDK languages) | [auth-flow.md](auth-flow.md) | +| Full working demo app architecture, profiles, MLLM/Gemini | [agent-samples.md](agent-samples.md) | +| Web/React client: transcripts, agent state, sendText, interrupt | [agent-toolkit.md](agent-toolkit.md) | +| React hooks in depth (useTranscript, useAgentState, provider) | [agent-client-toolkit-react.md](agent-client-toolkit-react.md) | +| React UI components (voice visualizer, chat UI, video) | [agent-ui-kit.md](agent-ui-kit.md) | +| iOS client: ConversationalAIAPIImpl, Swift | [agent-toolkit-ios.md](agent-toolkit-ios.md) | +| Android client: ConversationalAIAPIImpl, Kotlin | [agent-toolkit-android.md](agent-toolkit-android.md) | +| Custom LLM backend: RAG, tool calling, conversation memory | [server-custom-llm.md](server-custom-llm.md) | +| Persistent per-user memory via MCP | [server-mcp.md](server-mcp.md) | ## REST API Reference @@ -157,6 +179,22 @@ Full request/response details for all endpoints — **always fetch these; do not - **[Interrupt Agent](https://docs-md.agora.io/en/conversational-ai/rest-api/agent/interrupt.md)** — POST /interrupt - **[Conversation History](https://docs-md.agora.io/en/conversational-ai/rest-api/agent/history.md)** — GET /history +## Production: Platform Webhooks + +The ConvoAI platform can POST event notifications to your server endpoint when agent state changes. These are distinct from: +- The SDK's in-process `session.on()` events (fire in your Node.js/Python/Go process) +- The client toolkit's `AGENT_STATE_CHANGED` event (fires in the browser via RTM) + +Webhooks are the correct pattern for **production stateless deployments** where you do not hold the `AgentSession` object in memory between requests. 
Your server receives a POST when agent state changes, correlates using the agent ID (returned by `session.start()` / the `/join` response), and updates your application state accordingly. + +Webhook payload schemas and registration are REST API surface — do not rely on inline content here. Fetch from the Agora docs: + +``` +GET https://docs-md.agora.io/en/conversational-ai/rest-api/agent/join.md +``` + +or via MCP: search for "conversational AI webhook" in the Agora docs tool. + ## Agent Configuration (join payload `properties` object) Fetch these before answering questions about vendor configs, model names, or join payload fields: diff --git a/skills/agora/references/conversational-ai/agent-client-toolkit-react.md b/skills/agora/references/conversational-ai/agent-client-toolkit-react.md index 94c682d..242abcd 100644 --- a/skills/agora/references/conversational-ai/agent-client-toolkit-react.md +++ b/skills/agora/references/conversational-ai/agent-client-toolkit-react.md @@ -134,6 +134,44 @@ const { metrics, agentUserId } = useAgentMetrics(); Only fires when agent start config includes `parameters.enable_metrics: true`. +## `useConversationalAI` — Batteries-Included Alternative + +For simple single-page apps or demos where all ConvoAI state is consumed in one component, `useConversationalAI` is a drop-in alternative to the full Provider + hooks pattern. If multiple components need transcript, agent state, or errors independently, use `ConversationalAIProvider` + standalone hooks instead. + +```tsx +import { useConversationalAI } from 'agora-agent-client-toolkit-react'; +import { useMemo } from 'react'; + +function VoiceSession() { + const config = useMemo(() => ({ + channel: 'my-channel', + rtmConfig: { rtmEngine: rtmClient }, + }), []); + + const { + transcript, + agentState, + isConnected, + error, + interrupt, + sendMessage, + metrics, + } = useConversationalAI(config); + + return ( +
+    <div>
+      <p>Agent: {agentState ?? 'idle'}</p>
+      <p>{isConnected ? 'Connected' : 'Connecting…'}</p>
+      <button onClick={() => interrupt()}>Interrupt</button>
+      <button onClick={() => sendMessage('Hello')}>Send</button>
+    </div>
+ ); +} +``` + +**Config stability rule** — same as `ConversationalAIProvider`: wrap config in `useMemo`. The hook re-initializes if the config object identity changes. + +The hook internally calls `AgoraVoiceAI.init()`, `subscribeMessage()`, and `destroy()` automatically. No manual lifecycle management needed. + ## Critical Rules 1. **Wrap `config` in `useMemo`** — `ConversationalAIProvider` depends on `config.channel`. An inline object creates a new reference every render, causing unnecessary re-init cycles. diff --git a/skills/agora/references/conversational-ai/agent-toolkit-android.md b/skills/agora/references/conversational-ai/agent-toolkit-android.md new file mode 100644 index 0000000..8340e21 --- /dev/null +++ b/skills/agora/references/conversational-ai/agent-toolkit-android.md @@ -0,0 +1,209 @@ +# ConvoAI Agent Toolkit — Android (Kotlin) + +The Android agent toolkit (`ConversationalAIAPIImpl`) wraps RTC + RTM to deliver AI transcripts, agent state, interrupts, and metrics. It is sourced from the Conversational AI Demo repo — not from the Agora SDK packages. + +Source: `convoaiApi/IConversationalAIAPI.kt` and `ConversationalAIAPIImpl.kt` in the demo repo. + +## Table of Contents + +- [Setup](#setup) +- [Initialization](#initialization) +- [Audio Configuration](#audio-configuration) +- [Subscribing to Events](#subscribing-to-events) +- [Sending Messages to the Agent](#sending-messages-to-the-agent) +- [Interrupting the Agent](#interrupting-the-agent) +- [Handling Events](#handling-events) +- [Cleanup](#cleanup) + +## Setup + +The toolkit takes an existing `RtcEngine` and `RtmClient` — initialize both SDKs first. + +```kotlin +import io.agora.rtc2.RtcEngine +import io.agora.rtm.RtmClient +// Copy or import from the demo repo: +// io.agora.scene.convoai.convoaiApi.* +``` + +## Initialization + +```kotlin +// 1. Create your RTC engine and RTM client first (standard SDK init) +val rtcEngine: RtcEngine = // ... 
your existing engine +val rtmClient: RtmClient = // ... your existing RTM client (already logged in) + +// 2. Create the toolkit config +val config = ConversationalAIAPIConfig( + rtcEngine = rtcEngine, + rtmClient = rtmClient, + renderMode = TranscriptRenderMode.Word, // Word (word-by-word) or Text (full sentence) + enableLog = true, + enableRenderModeFallback = true // fall back to Text if server lacks word timestamps +) + +// 3. Create the API instance +val api = ConversationalAIAPIImpl(config) + +// 4. Register your event handler +api.addHandler(eventHandler) +``` + +## Audio Configuration + +**Must be called before `rtcEngine.joinChannel()`** to configure optimal AI audio settings. + +```kotlin +// Standard ConvoAI mode +api.loadAudioSettings() +// Equivalent to: api.loadAudioSettings(Constants.AUDIO_SCENARIO_AI_CLIENT) + +// If using Avatar (requires audio mixing) +api.loadAudioSettings(Constants.AUDIO_SCENARIO_DEFAULT) + +// Then join RTC channel +rtcEngine.joinChannel(token, channelName, uid, channelMediaOptions) +``` + +## Subscribing to Events + +Subscribe after logging in to RTM and before the agent starts speaking. 
+ +```kotlin +api.subscribeMessage(channelName) { error -> + if (error != null) { + Log.e("ConvoAI", "Subscribe failed: ${error.errorMessage}") + return@subscribeMessage + } + Log.d("ConvoAI", "Subscribed — ready to receive agent events") +} + +// When done +api.unsubscribeMessage(channelName) { error -> } +``` + +## Sending Messages to the Agent + +```kotlin +// Text message (default priority: INTERRUPT) +val textMsg = TextMessage( + text = "What is the weather today?", + priority = Priority.INTERRUPT, + responseInterruptable = true +) +api.chat(agentUserId, textMsg) { error -> + if (error != null) Log.e("ConvoAI", "Chat failed: ${error.errorMessage}") +} + +// Text with APPEND priority (queue after current response) +val appendMsg = TextMessage(text = "And tomorrow?", priority = Priority.APPEND) +api.chat(agentUserId, appendMsg) { } + +// Image message (URL-based; keep base64 under 32KB total) +val imageMsg = ImageMessage(uuid = UUID.randomUUID().toString(), imageUrl = "https://example.com/photo.jpg") +api.chat(agentUserId, imageMsg) { error -> } +``` + +## Interrupting the Agent + +```kotlin +api.interrupt(agentUserId) { error -> + if (error != null) Log.e("ConvoAI", "Interrupt failed: ${error.errorMessage}") +} +``` + +## Handling Events + +Implement `IConversationalAIAPIEventHandler`: + +```kotlin +val eventHandler = object : IConversationalAIAPIEventHandler { + + // Agent state: IDLE | SILENT | LISTENING | THINKING | SPEAKING | UNKNOWN + override fun onAgentStateChanged(agentUserId: String, event: StateChangeEvent) { + Log.d("ConvoAI", "Agent $agentUserId state: ${event.state}, turn: ${event.turnId}") + runOnUiThread { updateStateIndicator(event.state) } + } + + // Transcript update (fires frequently — dedup by turnId if needed) + override fun onTranscriptUpdated(agentUserId: String, transcript: Transcript) { + // transcript.type: TranscriptType.AGENT or .USER + // transcript.status: IN_PROGRESS | END | INTERRUPTED | UNKNOWN + // transcript.renderMode: Word 
or Text + runOnUiThread { updateTranscriptUI(transcript) } + } + + // Agent interrupted mid-speech + override fun onAgentInterrupted(agentUserId: String, event: InterruptEvent) { + Log.d("ConvoAI", "Interrupted turn: ${event.turnId}") + } + + // Performance metrics (LLM/TTS latency) + override fun onAgentMetrics(agentUserId: String, metric: Metric) { + Log.d("ConvoAI", "Metric: ${metric.type} ${metric.name} = ${metric.value}ms") + } + + // Agent-side error (LLM/TTS failure) + override fun onAgentError(agentUserId: String, error: ModuleError) { + Log.e("ConvoAI", "Agent error: ${error.type} code=${error.code} ${error.message}") + } + + // Message send error (e.g., image too large) + override fun onMessageError(agentUserId: String, error: MessageError) { + Log.e("ConvoAI", "Message error: ${error.chatMessageType} code=${error.code}") + } + + // Message receipt (server acknowledged image/text) + override fun onMessageReceiptUpdated(agentUserId: String, receipt: MessageReceipt) { + Log.d("ConvoAI", "Receipt: ${receipt.type} turnId=${receipt.turnId}") + } + + // Voiceprint registration status (technical preview) + override fun onAgentVoiceprintStateChanged(agentUserId: String, event: VoiceprintStateChangeEvent) { + Log.d("ConvoAI", "Voiceprint: ${event.status}") + } + + // Internal debug messages — useful during development + override fun onDebugLog(log: String) { + Log.v("ConvoAI", log) + } +} +``` + +## Cleanup + +```kotlin +fun cleanup() { + api.unsubscribeMessage(channelName) { } + api.removeHandler(eventHandler) + api.destroy() +} +``` + +Call `destroy()` when the conversation session ends. After this call the instance cannot be reused. 
+ +## Key Types Reference + +| Type | Purpose | +|------|---------| +| `ConversationalAIAPIConfig` | Init config: `rtcEngine`, `rtmClient`, `renderMode`, `enableLog` | +| `ConversationalAIAPIImpl` | Concrete implementation — create one per session | +| `IConversationalAIAPIEventHandler` | Interface for receiving all events | +| `TextMessage` | Text to send: `text`, `priority`, `responseInterruptable` | +| `ImageMessage` | Image to send: `uuid`, `imageUrl` or `imageBase64` (≤32KB) | +| `Priority` | `INTERRUPT` / `APPEND` / `IGNORE` | +| `AgentState` | `IDLE` / `SILENT` / `LISTENING` / `THINKING` / `SPEAKING` | +| `Transcript` | `turnId`, `text`, `type` (AGENT/USER), `status`, `renderMode` | +| `TranscriptRenderMode` | `Word` (word-level) or `Text` (full sentence) | +| `StateChangeEvent` | `state`, `turnId`, `timestamp` | +| `Metric` | `type` (LLM/TTS), `name`, `value` (ms), `timestamp` | +| `ConversationalAIAPIError` | Sealed: `RtmError(code, msg)`, `RtcError(code, msg)`, `UnknownError(msg)` | + +## Notes + +- `onTranscriptUpdated` fires at high frequency. Deduplicate on `turnId` in your UI if needed. +- All callbacks are dispatched on the main thread — safe for UI updates. +- The `renderMode = Word` setting falls back to `Text` automatically if the server doesn't provide word timestamps (when `enableRenderModeFallback = true`). +- Agent state arrives via RTM presence events (REMOTE_STATE_CHANGED); transcripts arrive via RTM channel messages. +- Image payloads via `imageBase64` must keep the total JSON message under 32KB. Use `imageUrl` for larger images. +- Audio routing changes re-apply audio parameters automatically via `onAudioRouteChanged`. 
diff --git a/skills/agora/references/conversational-ai/agent-toolkit-ios.md b/skills/agora/references/conversational-ai/agent-toolkit-ios.md new file mode 100644 index 0000000..21eec52 --- /dev/null +++ b/skills/agora/references/conversational-ai/agent-toolkit-ios.md @@ -0,0 +1,208 @@ +# ConvoAI Agent Toolkit — iOS (Swift) + +The iOS agent toolkit (`ConversationalAIAPIImpl`) wraps RTC + RTM to deliver AI transcripts, agent state, interrupts, and metrics. It is sourced from the Conversational AI Demo repo — not from the Agora SDK packages. + +Source: `ConversationalAIAPI/ConversationalAIAPI.swift` and `ConversationalAIAPIImpl.swift` in the demo repo. + +## Table of Contents + +- [Setup](#setup) +- [Initialization](#initialization) +- [Audio Configuration](#audio-configuration) +- [Subscribing to Events](#subscribing-to-events) +- [Sending Messages to the Agent](#sending-messages-to-the-agent) +- [Interrupting the Agent](#interrupting-the-agent) +- [Handling Events](#handling-events) +- [Cleanup](#cleanup) + +## Setup + +The toolkit takes an existing `AgoraRtcEngineKit` and `AgoraRtmClientKit` — initialize both SDKs first. + +```swift +import AgoraRtcKit +import AgoraRtmKit +import ConversationalAIAPI // or copy files directly from the demo repo +``` + +## Initialization + +```swift +// 1. Create your RTC engine and RTM client first (standard SDK init) +let rtcEngine: AgoraRtcEngineKit = // ... your existing engine +let rtmKit: AgoraRtmClientKit = // ... your existing RTM client (already logged in) + +// 2. Create the toolkit config +let config = ConversationalAIAPIConfig( + rtcEngine: rtcEngine, + rtmEngine: rtmKit, + renderMode: .words, // .words (word-by-word) or .text (full sentence) + enableLog: true, + enableRenderModeFallback: true // fall back to .text if server lacks word timestamps +) + +// 3. Create the API instance +let api = ConversationalAIAPIImpl(config: config) + +// 4. 
Register your event handler +api.addHandler(handler: self) // self implements ConversationalAIAPIEventHandler +``` + +## Audio Configuration + +**Must be called before `joinChannel`** on every join to configure optimal AI audio settings. + +```swift +// Standard ConvoAI mode +api.loadAudioSettings() +// Equivalent to: api.loadAudioSettings(secnario: .aiClient) + +// If using Avatar (requires audio mixing) +api.loadAudioSettings(secnario: .default) + +// Then join RTC channel +rtcEngine.joinChannel(byToken: token, channelId: channelName, info: nil, uid: userId) +``` + +## Subscribing to Events + +Subscribe after logging in to RTM and before the agent starts speaking. + +```swift +api.subscribeMessage(channelName: channelName) { error in + if let error = error { + print("Subscribe failed: \(error)") + return + } + print("Subscribed — ready to receive agent events") +} + +// When done +api.unsubscribeMessage(channelName: channelName) { error in } +``` + +## Sending Messages to the Agent + +```swift +// Text message (default priority: INTERRUPT) +let textMsg = TextMessage( + text: "What is the weather today?", + priority: .interrupt, + responseInterruptable: true +) +api.chat(agentUserId: agentUid, message: textMsg) { error in + if let error = error { print("Chat failed: \(error)") } +} + +// Text with APPEND priority (queue after current response) +let appendMsg = TextMessage(text: "And tomorrow?", priority: .append) +api.chat(agentUserId: agentUid, message: appendMsg) { _ in } + +// Image message (URL-based; keep base64 under 32KB total) +let imageMsg = ImageMessage(uuid: UUID().uuidString, imageUrl: "https://example.com/photo.jpg") +api.chat(agentUserId: agentUid, message: imageMsg) { error in } +``` + +## Interrupting the Agent + +```swift +api.interrupt(agentUserId: agentUid) { error in + if let error = error { print("Interrupt failed: \(error)") } +} +``` + +## Handling Events + +Implement `ConversationalAIAPIEventHandler`: + +```swift +extension 
YourViewController: ConversationalAIAPIEventHandler { + + // Agent state: .silent | .listening | .thinking | .speaking | .idle | .unknown + func onAgentStateChanged(agentUserId: String, event: StateChangeEvent) { + print("Agent \(agentUserId) state: \(event.state), turn: \(event.turnId)") + DispatchQueue.main.async { + self.updateStateIndicator(event.state) + } + } + + // Transcript update (fires frequently — dedup by turnId if needed) + func onTranscriptUpdated(agentUserId: String, transcript: Transcript) { + // transcript.type: .agent or .user + // transcript.status: .inProgress | .end | .interrupted | .unknown + // transcript.renderMode: .words or .text + DispatchQueue.main.async { + self.updateTranscriptUI(transcript) + } + } + + // Agent interrupted mid-speech + func onAgentInterrupted(agentUserId: String, event: InterruptEvent) { + print("Interrupted turn: \(event.turnId)") + } + + // Performance metrics (LLM/TTS latency) + func onAgentMetrics(agentUserId: String, metrics: Metric) { + print("Metric: \(metrics.type) \(metrics.name) = \(metrics.value)ms") + } + + // Agent-side error (LLM/TTS failure) + func onAgentError(agentUserId: String, error: ModuleError) { + print("Agent error: \(error.type) code=\(error.code) \(error.message)") + } + + // Message send error (e.g., image too large) + func onMessageError(agentUserId: String, error: MessageError) { + print("Message error: \(error.chatMessageType) code=\(error.code)") + } + + // Message receipt (server acknowledged image/text) + func onMessageReceiptUpdated(agentUserId: String, messageReceipt: MessageReceipt) { + print("Receipt: \(messageReceipt.type) turnId=\(messageReceipt.turnId)") + } + + // Voiceprint registration status (technical preview) + func onAgentVoiceprintStateChanged(agentUserId: String, event: VoiceprintStateChangeEvent) { + print("Voiceprint: \(event.status)") + } + + func onDebugLog(log: String) { + // Internal debug messages — useful during development + } +} +``` + +## Cleanup + +```swift 
+func cleanup() { + api.unsubscribeMessage(channelName: channelName) { _ in } + api.removeHandler(handler: self) + api.destroy() +} +``` + +Call `destroy()` when the conversation session ends. After this call the instance cannot be reused. + +## Key Types Reference + +| Type | Purpose | +|------|---------| +| `ConversationalAIAPIConfig` | Init config: `rtcEngine`, `rtmEngine`, `renderMode`, `enableLog` | +| `ConversationalAIAPIImpl` | Concrete implementation — create one per session | +| `ConversationalAIAPIEventHandler` | Protocol for receiving all events | +| `TextMessage` | Text to send: `text`, `priority`, `responseInterruptable` | +| `ImageMessage` | Image to send: `uuid`, `imageUrl` or `imageBase64` (≤32KB) | +| `Priority` | `.interrupt` / `.append` / `.ignore` | +| `AgentState` | `.idle` / `.silent` / `.listening` / `.thinking` / `.speaking` / `.unknown` | +| `Transcript` | `turnId`, `text`, `type` (agent/user), `status`, `renderMode` | +| `TranscriptRenderMode` | `.words` (word-level) or `.text` (full sentence) | +| `StateChangeEvent` | `state`, `turnId`, `timestamp` | +| `Metric` | `type` (LLM/TTS), `name`, `value` (ms), `timestamp` | + +## Notes + +- `onTranscriptUpdated` fires at high frequency. Deduplicate on `turnId` in your UI if needed. +- The `renderMode: .words` setting falls back to `.text` automatically if the server doesn't provide word timestamps (when `enableRenderModeFallback: true`). +- Agent state arrives via RTM presence events; transcripts arrive via RTM channel messages. +- Image payloads via `imageBase64` must keep the total JSON message under 32KB. Use `imageUrl` for larger images.
diff --git a/skills/agora/references/conversational-ai/agent-toolkit.md b/skills/agora/references/conversational-ai/agent-toolkit.md index f919011..24d81e4 100644 --- a/skills/agora/references/conversational-ai/agent-toolkit.md +++ b/skills/agora/references/conversational-ai/agent-toolkit.md @@ -66,10 +66,19 @@ ai.subscribeMessage('CHANNEL'); |-------|------|----------|-------------| | `rtcEngine` | `IAgoraRTCClient` | Yes | Your existing Agora RTC client | | `rtmConfig` | `{ rtmEngine: RTMClient }` | No | Pass your RTM client for sendText/interrupt | -| `renderMode` | `TranscriptHelperMode` | No | `TEXT`, `WORD`, `CHUNK`, `AUTO` (default: `AUTO`) | +| `renderMode` | `TranscriptHelperMode` | No | `TEXT`, `WORD`, `CHUNK`, `AUTO` (default: `AUTO`) — see table below | | `enableLog` | `boolean` | No | Debug logging (default: `false`) | | `enableAgoraMetrics` | `boolean` | No | Load `@agora-js/report` for usage metrics | +### Render Modes + +| Mode | Update cadence | Word timing in metadata | PTS required | When to use | +|------|---------------|------------------------|--------------|-------------| +| `TEXT` | Per sentence (`final: true`) | No | No | Lowest overhead; subtitles | +| `WORD` | Per word | Yes (`words[].start_ms`, `duration_ms`) | **Yes** (before RTC client creation) | Karaoke-style highlight | +| `CHUNK` | When all parts reassembled | No | No | Fragmented transport | +| `AUTO` | Detected from first message | Depends on detected mode | If WORD detected | Default; fine unless you need WORD and must pre-configure PTS | + ## Events Register handlers before calling `subscribeMessage()`. 
All 9 events: @@ -136,14 +145,33 @@ await ai.sendText(agentUserId, { // Send image to the agent await ai.sendImage(agentUserId, { messageType: ChatMessageType.IMAGE, - uuid: crypto.randomUUID(), - url: 'https://example.com/image.png', + uuid: crypto.randomUUID(), // caller-generated unique ID for receipt correlation + url: 'https://example.com/image.png', // or: base64: '' }); // Interrupt the agent's current speech await ai.interrupt(agentUserId); ``` +### `ChatMessagePriority` values + +| Value | Behavior | +|-------|----------| +| `INTERRUPTED` | Sends the message and immediately interrupts any speech the agent is currently producing | +| `APPEND` | Queues the message to be processed after the agent finishes its current speech turn | +| `IGNORE` | Drops the message silently if the agent is busy — use for low-priority updates only relevant when idle | + +`responseInterruptable: boolean` on `ChatMessageText` — when `true`, the agent's response to this message can itself be interrupted by subsequent user input. + +### `ChatMessageImage` fields + +| Field | Required | Description | +|-------|----------|-------------| +| `messageType` | Yes | Must be `ChatMessageType.IMAGE` | +| `uuid` | Yes | Caller-generated unique ID for receipt correlation via `MESSAGE_RECEIPT_UPDATED` | +| `url` | One of url/base64 | Publicly accessible image URL | +| `base64` | One of url/base64 | Inline image data | + ## Cleanup ```typescript @@ -162,6 +190,7 @@ await rtmClient.logout(); // you manage RTM lifecycle 5. **RTM is optional but required for several features** — `sendText`, `sendImage`, and `interrupt` throw `RTMRequiredError` without `rtmConfig`. `AGENT_STATE_CHANGED`, `MESSAGE_RECEIPT_UPDATED`, `MESSAGE_ERROR`, `MESSAGE_SAL_STATUS` only fire with RTM. 6. **Agent start config flags are required for some events** — `AGENT_STATE_CHANGED` requires `advanced_features.enable_rtm: true` AND `parameters.data_channel: "rtm"`. `AGENT_METRICS` requires `parameters.enable_metrics: true`. 
`AGENT_ERROR` requires `parameters.enable_error_message: true`. 7. **Toolkit does not wrap join/publish** — call `rtcClient.join()` and `rtcClient.publish()` yourself before `subscribeMessage()`. +8. **WORD mode requires PTS metadata enabled before RTC client creation** — if `renderMode` is `TranscriptHelperMode.WORD`, call `AgoraRTC.setParameter('ENABLE_AUDIO_PTS_METADATA', true)` before calling `AgoraRTC.createClient()`. Setting it after client creation produces no error — word-level timing data simply never arrives. This also applies when using `AUTO` mode if WORD is detected; prefer explicit `WORD` mode when karaoke display is required so the pre-configuration step is obvious. ## React Hooks diff --git a/skills/agora/references/conversational-ai/auth-flow.md b/skills/agora/references/conversational-ai/auth-flow.md new file mode 100644 index 0000000..18da8d9 --- /dev/null +++ b/skills/agora/references/conversational-ai/auth-flow.md @@ -0,0 +1,150 @@ +--- +name: agora-convoai-auth-flow +description: | + End-to-end token and authentication flow for Agora Conversational AI — for implementors + calling the REST API directly (Java, Ruby, PHP, C#, or any language without an SDK). + The TypeScript/Python/Go SDKs handle ConvoAI token generation automatically in App + Credentials mode — use server-sdks.md for those. + Use this file when: the user needs to understand which token goes where, is building a + non-SDK backend, or explicitly asks about token types / buildTokenWithRtm / auth flow. + Triggers on: RTC token, RTM token, ConvoAI token, token flow, auth flow, buildTokenWithRtm, + three tokens, REST API auth, Java backend, direct REST. +license: MIT +metadata: + author: agora + version: '1.0.0' +--- + +# ConvoAI Auth Flow — End to End + +> **SDK users:** If you are using the TypeScript, Python, or Go SDK, you do not need to implement this flow manually. Pass `appId + appCertificate` to the SDK client and it generates the ConvoAI token per request automatically. 
See [server-sdks.md](server-sdks.md). This file is for backends calling the REST API directly. + +Three separate tokens exist in a ConvoAI integration. Developers routinely confuse them because they all use the same App ID + App Certificate as inputs. + +## The Three Tokens + +| Token | Generated by | Used by | Where | +|-------|-------------|---------|-------| +| **RTC client token** | Your server | Browser / mobile RTC client | `rtcClient.join(appId, channel, token, uid)` | +| **RTM client token** | Your server | Browser / mobile RTM client | `rtmClient.login({ token })` | +| **ConvoAI server token** | SDK (App Credentials mode) or you (Token Auth mode) | Your server → ConvoAI REST API | `Authorization: agora token=` | + +The **ConvoAI server token** is a combined RTC + RTM token generated with `RtcTokenBuilder.buildTokenWithRtm()`. It is **not** the same as the RTC token the client uses to join the channel — even though both use the same builder. + +## Sequence Diagram + +``` +Your Server ConvoAI Platform Browser Client + | | | + |-- POST /join -------->| | + | Authorization: | | + | agora token= | + | agent starts | + |<-- agentId -----------| | + | | | + |-- RTC token + RTM token ---------------------->| + | | | + | |<-- rtcClient.join() ---| + | |<-- rtmClient.login() --| + | | | + | |<==== RTC audio =======>| + | |<==== RTM messages ====>| +``` + +## Worked Example + +### 1. 
Your server generates client tokens + +```typescript +import { RtcTokenBuilder, RtmTokenBuilder, RtcRole } from 'agora-token'; + +const uid = 12345; // user's numeric UID +const channel = 'my-channel'; +const expiry = 3600; // 1 hour + +// RTC token — for the client to join the RTC channel +const rtcToken = RtcTokenBuilder.buildTokenWithUid( + appId, appCertificate, channel, uid, RtcRole.PUBLISHER, + expiry, expiry +); + +// RTM token — for the client to log into RTM (required for sendText/interrupt) +const rtmToken = RtmTokenBuilder.buildToken( + appId, appCertificate, String(uid), expiry +); +``` + +### 2. Your server starts the agent + +Using App Credentials mode — the SDK generates the ConvoAI token automatically: + +```typescript +import { AgoraClient, Agent } from 'agora-agent-server-sdk'; + +const client = new AgoraClient({ + appId: process.env.AGORA_APP_ID, + appCertificate: process.env.AGORA_APP_CERTIFICATE, // SDK handles ConvoAI token +}); + +const agent = new Agent({ name: 'my_agent', instructions: 'You are a helpful voice assistant.' }); + +const session = agent.createSession({ channel, agentUid: 0 }); +const agentId = await session.start(); // ConvoAI token generated internally per request +``` + +Using Token Auth mode — you generate the ConvoAI token yourself: + +```typescript +// ConvoAI token = combined RTC + RTM token, bound to the agent's uid on the channel +const convoAIToken = RtcTokenBuilder.buildTokenWithRtm( + appId, appCertificate, channel, + 'agent-account', // use account (string) not numeric uid for the agent + RtcRole.PUBLISHER, + expiry, expiry +); + +const client = new AgoraClient({ + appId: process.env.AGORA_APP_ID, + authToken: convoAIToken, // reused for every request until you replace it +}); +```
Your server returns client tokens to the browser + +```typescript +// Express handler example +app.post('/start-session', async (req, res) => { + const { uid, channel } = req.body; + + const rtcToken = /* generate as above */; + const rtmToken = /* generate as above */; + const agentId = await session.start(); + + res.json({ rtcToken, rtmToken, agentId, channel }); +}); +``` + +### 4. Browser initializes with both tokens + +```typescript +import AgoraRTC from 'agora-rtc-sdk-ng'; +import AgoraRTM from 'agora-rtm'; +import { AgoraVoiceAI } from 'agora-agent-client-toolkit'; + +const { rtcToken, rtmToken, channel } = await fetch('/start-session', { ... }).then(r => r.json()); + +const rtcClient = AgoraRTC.createClient({ mode: 'rtc', codec: 'vp8' }); +const rtmClient = new AgoraRTM.RTM(appId, String(uid)); +await rtmClient.login({ token: rtmToken }); // RTM token here + +const ai = await AgoraVoiceAI.init({ + rtcEngine: rtcClient, + rtmConfig: { rtmEngine: rtmClient }, +}); + +await rtcClient.join(appId, channel, rtcToken, uid); // RTC token here +``` + +## App Credentials Shortcut + +With `appId + appCertificate` passed to `AgoraClient`, the SDK generates the ConvoAI token per request. The developer only needs to manage the two client-side tokens (RTC + RTM). This is the recommended path for production. + +App Certificate never leaves your server in either case — the browser only ever receives the scoped RTC and RTM tokens. diff --git a/skills/agora/references/conversational-ai/go-sdk.md b/skills/agora/references/conversational-ai/go-sdk.md new file mode 100644 index 0000000..36f7267 --- /dev/null +++ b/skills/agora/references/conversational-ai/go-sdk.md @@ -0,0 +1,184 @@ +--- +name: agora-server-sdk-go +description: | + Go SDK for Agora Conversational AI server-side integration. Use when the user is + building a Go backend to start/stop/manage ConvoAI agents. 
Triggers on: + agora-agent-server-sdk-go, agentkit Go, AgentSession Go, Go ConvoAI server, + context.Context agent, go get agora agent. +license: MIT +metadata: + author: agora + version: '1.0.0' +--- + +# ConvoAI Server SDK — Go + +Go SDK for managing Agora Conversational AI agents from a server-side application. Wraps the ConvoAI REST API. + +**Module:** `github.com/AgoraIO-Community/agora-agent-server-sdk-go` +**Minimum Go version:** 1.21 +**Repo:** + +## Installation + +```bash +go get github.com/AgoraIO-Community/agora-agent-server-sdk-go +``` + +## Quick Start + +```go +package main + +import ( + "context" + "fmt" + "log" + "time" + + "github.com/AgoraIO-Community/agora-agent-server-sdk-go/agentkit" +) + +func main() { + client, err := agentkit.NewAgora( + agentkit.WithAppID("YOUR_APP_ID"), + agentkit.WithAppCertificate("YOUR_APP_CERTIFICATE"), + ) + if err != nil { + log.Fatal(err) + } + + agent := agentkit.NewAgent( + agentkit.WithName("my_agent"), + agentkit.WithInstructions("You are a helpful voice assistant."), + agentkit.WithLlm(agentkit.OpenAI{APIKey: "OPENAI_KEY"}), + agentkit.WithTts(agentkit.ElevenLabs{APIKey: "ELEVENLABS_KEY"}), + agentkit.WithStt(agentkit.Deepgram{APIKey: "DEEPGRAM_KEY"}), + ) + + session := agent.CreateSession(agentkit.SessionOptions{ + Channel: "my-channel", + AgentUID: 0, + }) + + // Bound start time to 10 seconds + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + agentID, err := session.Start(ctx) + if err != nil { + log.Fatal(err) + } + fmt.Printf("Agent started: %s\n", agentID) + + // Stop from the same process + if err := session.Stop(context.Background()); err != nil { + log.Fatal(err) + } +} +``` + +## context.Context Pattern + +Every session method takes `ctx context.Context` as its first argument. 
Use this to bound operation time: + +```go +// Bound start — fails after 10s if the agent hasn't connected +ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) +defer cancel() +agentID, err := session.Start(ctx) + +// Stateless stop (different request handler) — use a fresh context +err = client.StopAgent(context.Background(), agentID) +``` + +## Error Handling + +All methods return `(result, error)`. Idiomatic check: + +```go +agentID, err := session.Start(ctx) +if err != nil { + // handle error + return fmt.Errorf("start agent: %w", err) +} + +// Stop returns nil for 404 (agent already stopped) — same graceful behavior as TypeScript/Python +err = session.Stop(context.Background()) +if err != nil { + // genuine error — not "already stopped" +} +``` + +## Builder Pattern (Functional Options) + +Go uses functional options (`With*` functions) instead of chained methods or object literals: + +```go +// TypeScript equivalent: new Agent({ name: "..." }).withLlm(new OpenAI({ ...
})) +agent := agentkit.NewAgent( + agentkit.WithName("my_agent"), + agentkit.WithLlm(agentkit.OpenAI{APIKey: "OPENAI_KEY", Model: "gpt-4o"}), + agentkit.WithTts(agentkit.ElevenLabs{APIKey: "ELEVENLABS_KEY", VoiceID: "..."}), +) +``` + +## Session Status Constants + +Check `session.Status` before calling methods: + +| Constant | Meaning | +|----------|---------| +| `agentkit.StatusIdle` | Ready, not started | +| `agentkit.StatusStarting` | Start in progress | +| `agentkit.StatusRunning` | Active — `Stop`, `Say`, `Interrupt`, `Update` available | +| `agentkit.StatusStopping` | Stop in progress | +| `agentkit.StatusStopped` | Stopped — `Start` available again | +| `agentkit.StatusError` | Error — `Start` available again | + +## Token Helpers + +```go +// Generate an RTC token +rtcToken, err := agentkit.GenerateRTCToken(agentkit.TokenOptions{ + AppID: "YOUR_APP_ID", + Certificate: "YOUR_CERTIFICATE", + Channel: "my-channel", + UID: 12345, + ExpiresIn: agentkit.ExpiresInHours(1), +}) + +// Generate a combined RTC+RTM ConvoAI token (for Token Auth mode) +convoAIToken, err := agentkit.GenerateConvoAIToken(agentkit.TokenOptions{ + AppID: "YOUR_APP_ID", + Certificate: "YOUR_CERTIFICATE", + Channel: "my-channel", + Account: "agent-account", + ExpiresIn: agentkit.ExpiresInHours(1), +}) +``` + +## Auth Modes + +Same three modes as TypeScript and Python. 
Pass exactly one set of credentials: + +```go +// App Credentials (recommended) — SDK generates ConvoAI token per request +client, _ := agentkit.NewAgora( + agentkit.WithAppID("..."), + agentkit.WithAppCertificate("..."), +) + +// Token Auth — pre-built combined RTC+RTM token; reused until you replace it +client, _ := agentkit.NewAgora( + agentkit.WithAppID("..."), + agentkit.WithAuthToken("YOUR_TOKEN"), +) + +// Basic Auth — Customer ID + Secret; for testing only +client, _ := agentkit.NewAgora( + agentkit.WithAppID("..."), + agentkit.WithCustomerID("..."), + agentkit.WithCustomerSecret("..."), +) +``` diff --git a/skills/agora/references/conversational-ai/python-sdk.md b/skills/agora/references/conversational-ai/python-sdk.md new file mode 100644 index 0000000..eed7f72 --- /dev/null +++ b/skills/agora/references/conversational-ai/python-sdk.md @@ -0,0 +1,120 @@ +--- +name: agora-server-sdk-python +description: | + Python SDK for Agora Conversational AI server-side integration. Use when the user is + building a Python backend to start/stop/manage ConvoAI agents. Triggers on: + agora-agent Python, agent_server_sdk_python, AsyncAgora, AsyncAgentSession, pip install + agora-agent, Python ConvoAI server, agora_agent. +license: MIT +metadata: + author: agora + version: '1.0.0' +--- + +# ConvoAI Server SDK — Python + +Python SDK for managing Agora Conversational AI agents from a server-side application. Wraps the ConvoAI REST API. + +**Package:** `agora-agent` +**Repo:** + +## Installation + +```bash +pip install agora-agent +# or with Poetry: +poetry add agora-agent +``` + +## Sync vs Async + +Two parallel APIs exist: + +| Use case | Classes | +|----------|---------| +| Sync (scripts, Flask, Django) | `Agora`, `AgentSession` | +| Async (FastAPI, aiohttp, asyncio apps) | `AsyncAgora`, `AsyncAgentSession` | + +**Rule:** use the async variants in any async framework. Mixing sync calls into an async event loop blocks it. 
+ +## Async Example + +```python +import asyncio +from agora_agent import AsyncAgora, Agent +from agora_agent.agentkit import OpenAI, ElevenLabsTTS, DeepgramSTT + +async def main(): + client = AsyncAgora( + app_id="YOUR_APP_ID", + app_certificate="YOUR_APP_CERTIFICATE", # App Credentials mode + ) + + agent = ( + Agent(name="my_agent", instructions="You are a helpful voice assistant.") + .with_stt(DeepgramSTT(api_key="DEEPGRAM_KEY")) + .with_llm(OpenAI(api_key="OPENAI_KEY")) + .with_tts(ElevenLabsTTS(api_key="ELEVENLABS_KEY")) + ) + + session = agent.create_session(channel="my-channel", agent_uid=0) + + agent_id = await session.start() + print(f"Agent started: {agent_id}") + + # Later — stop from the same process + await session.stop() + + # Or stop from a stateless handler (different request) + await client.stop_agent(agent_id) + +asyncio.run(main()) +``` + +## Naming Conventions + +All method names are snake_case — same API surface as TypeScript but with Python naming: + +| TypeScript | Python | +|-----------|--------| +| `session.start()` | `session.start()` | +| `session.stop()` | `session.stop()` | +| `session.getHistory()` | `session.get_history()` | +| `session.getInfo()` | `session.get_info()` | +| `client.generateRtcToken()` | `client.generate_rtc_token()` | + +## Deprecation Warnings + +Three patterns generate `DeprecationWarning` at runtime. 
Suppress them by migrating to the replacement: + +| Deprecated | Replacement | +|-----------|-------------| +| `TurnDetection.type` field | Use `config.start_of_speech` / `config.end_of_speech` directly | +| `InterruptMode` on standard LLM sessions | Only valid for MLLM with `server_vad` or `semantic_vad` | +| `Eagerness` parameter | MLLM-only — remove from standard LLM configs | + +## Debug Logging + +```python +client = AsyncAgora( + app_id="YOUR_APP_ID", + app_certificate="YOUR_APP_CERTIFICATE", + debug=True, # enables request/response logging +) +# Auth headers are redacted automatically — logs show "Basic ***", not the actual value +``` + +## Auth Modes + +Same three modes as TypeScript. Pass exactly one: + +```python +# App Credentials (recommended) — SDK generates ConvoAI token per request +client = AsyncAgora(app_id="...", app_certificate="...") + +# Token Auth — you supply a pre-built combined RTC+RTM token; reused until replaced +client = AsyncAgora(app_id="...", auth_token="YOUR_TOKEN") + +# Basic Auth — Customer ID + Secret; for testing only +client = AsyncAgora(app_id="...", customer_id="...", customer_secret="...") +``` diff --git a/skills/agora/references/conversational-ai/server-sdks.md b/skills/agora/references/conversational-ai/server-sdks.md new file mode 100644 index 0000000..69272f1 --- /dev/null +++ b/skills/agora/references/conversational-ai/server-sdks.md @@ -0,0 +1,126 @@ +--- +name: agora-convoai-server-sdks +description: | + Server-side SDKs for Agora Conversational AI: TypeScript, Python, and Go wrappers around the + ConvoAI REST API. Use when the user is building a backend to start/stop/manage ConvoAI agents. + Triggers on: agora-agent-server-sdk, AgoraClient, AgentSession, session.start, session.stop, + agent server SDK, ConvoAI backend, ConvoAI server, withStt, withLlm, withTts. 
+license: MIT +metadata: + author: agora + version: '1.0.0' +--- + +# ConvoAI Server SDKs + +TypeScript, Go, and Python SDKs — convenience wrappers around the ConvoAI REST API. For any other backend language, call the REST API directly. Fetch the live OpenAPI spec for the full schema: `https://docs-md.agora.io/api/conversational-ai-api-v2.x.yaml` + +## TypeScript — `agora-agent-server-sdk` + +```bash +npm install agora-agent-server-sdk +``` + +Builder pattern — configure the AI pipeline then create sessions: + +```typescript +import { AgoraClient, Agent, Area } from 'agora-agent-server-sdk'; + +const client = new AgoraClient({ + area: Area.US, + appId: process.env.AGORA_APP_ID, + appCertificate: process.env.AGORA_APP_CERTIFICATE, +}); + +const agent = new Agent({ + name: `agent_${crypto.randomUUID().slice(0, 8)}`, // must be unique per project + instructions: 'You are a helpful voice assistant.', + greeting: 'Hello! How can I help you today?', +}) + .withStt(new DeepgramSTT({ apiKey: process.env.DEEPGRAM_API_KEY })) + .withLlm(new OpenAI({ apiKey: process.env.OPENAI_API_KEY })) + .withTts(new ElevenLabsTTS({ apiKey: process.env.ELEVENLABS_API_KEY })); + +// Start a session (joins the agent to a channel) +const session = agent.createSession({ channel: 'my-channel', agentUid: 0 }); +const sessionId = await session.start(); + +// Stop from the same process +await session.stop(); + +// Stop from a stateless server (e.g. a different request handler) +await client.stopAgent(sessionId); +``` + +Token auth is handled automatically when `appCertificate` is provided. For vendor-specific STT/LLM/TTS import paths and MLLM (OpenAI Realtime, Gemini Live) config, see the [SDK README](https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-ts). + +## Auth Modes + +Three modes, in order of recommendation: + +- **App Credentials** (`appId` + `appCertificate`): SDK generates a fresh ConvoAI token per REST call. No token management needed. 
The App Certificate never leaves your server. Recommended for production. +- **Token Auth** (`authToken`): A pre-built combined RTC+RTM token you supply. This token is **reused for every request** until you replace the SDK instance — the SDK does not refresh it. You are responsible for refreshing before expiry. Max token validity is 24 hours. +- **Basic Auth** (`customerId` + `customerSecret`): Credentials never expire but are long-lived secrets that grant access to every project on your account. Use for local testing only; do not ship to production. + +## Session State Machine + +Sessions follow a strict state sequence. Calling methods outside the valid states throws an error: + +| State | `start()` | `stop()` | `say()` / `interrupt()` / `update()` | +|-------|-----------|----------|---------------------------------------| +| `idle` | ✅ | ❌ | ❌ | +| `starting` | ❌ | ❌ | ❌ | +| `running` | ❌ | ✅ | ✅ | +| `stopping` | ❌ | ❌ | ❌ | +| `stopped` | ✅ | ❌ | ❌ | +| `error` | ✅ | ❌ | ❌ | + +`stop()` on a 404 (agent already stopped on the platform) resolves without throwing — the SDK treats it as already stopped. + +## In-Process Events (TypeScript) + +`AgentSession` emits events within your Node.js process — these are not HTTP webhooks: + +```typescript +session.on('started', () => { + // Agent has connected to the RTC channel and is ready + // session.id is now set; session.status === 'running' +}); + +session.on('stopped', () => { + // Agent has left the channel +}); + +session.on('error', (err) => { + // Non-recoverable error; session.status === 'error' + // Safe to call session.start() again from this state +}); +``` + +## Avatar + TTS Sample Rate + +HeyGen and Akool avatars require a specific TTS sample rate. The SDK validates this at `session.start()` and throws if mismatched. The error message identifies the avatar config as the problem — the root cause is actually the TTS sample rate. 
+ +| Avatar vendor | Required TTS sample rate | +|--------------|--------------------------| +| HeyGen | **24000 Hz** | +| Akool | **16000 Hz** | + +Whenever an avatar vendor is set, explicitly configure the TTS sample rate to match. Do not rely on defaults. + +```typescript +// HeyGen — must pair with 24 kHz TTS +const agent = new Agent({ ... }) + .withAvatar(new HeyGen({ ... })) + .withTts(new ElevenLabsTTS({ sampleRate: 24000, ... })); + +// Akool — must pair with 16 kHz TTS +const agent = new Agent({ ... }) + .withAvatar(new Akool({ ... })) + .withTts(new ElevenLabsTTS({ sampleRate: 16000, ... })); +``` + +## Python and Go SDKs + +- **[python-sdk.md](python-sdk.md)** — Python SDK: sync vs async, deprecation warnings, debug logging. +- **[go-sdk.md](go-sdk.md)** — Go SDK: context.Context pattern, builder syntax, SessionStatus constants, token helpers. diff --git a/skills/agora/references/integration-patterns.md b/skills/agora/references/integration-patterns.md new file mode 100644 index 0000000..4c85b9e --- /dev/null +++ b/skills/agora/references/integration-patterns.md @@ -0,0 +1,164 @@ +# Multi-Product Integration Patterns + +This file covers how RTC, RTM, and Conversational AI work together. For SDK-specific code, follow the Level 2 fetch links in each product's reference file. + +## When to Use Multiple Products + +| Scenario | Products | +|---|---| +| Video/voice call only | RTC only | +| Video call + chat or presence | RTC + RTM | +| AI agent in a call | RTC + ConvoAI REST API (server-side) | +| AI agent + live transcripts or state | RTC + RTM + ConvoAI REST API | +| Text-only signaling (no media) | RTM only | + +## RTC + RTM Together + +### Initialization Order + +Always initialize both SDKs independently — they share no initialization path. The recommended order: + +1. Create and initialize the RTC engine/client +2. Create and log in to the RTM client +3. Subscribe to the RTM channel +4. 
Join the RTC channel + +Subscribe to RTM **before** joining RTC so that presence events for peers already in the channel are not missed. + +### UID Strategy + +RTC and RTM use different UID types: + +| SDK | Platform | UID type | +|---|---|---| +| RTC | Web | `number` | +| RTC | iOS | `UInt` (unsigned 32-bit) | +| RTC | Android | `Int` (signed 32-bit) | +| RTM | All platforms | `String` | + +Use `String(rtcUid)` as the RTM `userId`. This is the standard convention across platform demos and toolkits. UIDs greater than 2,147,483,647 wrap to negative on Android RTC — avoid them if Android clients are present. + +### Channel Name Convention + +Use the **same channel name** for both RTC and RTM. The RTM channel is the coordination layer for the RTC session — same name makes routing unambiguous and is required by the ConvoAI toolkit. + +### Token Matrix + +Both SDKs require separate tokens in production. Generate them independently on your server: + +| Token | Scope | Renew on | +|---|---|---| +| RTC token | `appId` + `channelName` + `uid` | `token-privilege-will-expire` / `onTokenPrivilegeWillExpire` | +| RTM token | `appId` + `userId` | RTM connection state `TOKEN_EXPIRED` or equivalent | + +See [server/tokens.md](server/tokens.md) for token generation details. + +For Level 2 fetch: fetch `https://docs.agora.io/en/llms.txt`, find the token management guide for your platform, then fetch it. + +## RTC + RTM + ConvoAI + +### How the Products Connect + +``` +Client Server +────────────────────────────────────────────────────── +RTC Engine/Client ──── media ────► Agora RTC +RTM Client ────── messages ──────► Agora RTM + │ + [ConvoAI agent joins + same RTC channel, + publishes transcripts + and state to same + RTM channel] + ▲ +App Server ── REST POST /join ──────► Agora ConvoAI API +``` + +### Initialization Order for ConvoAI + +1. Initialize RTC (but do not join the channel yet) +2. Initialize RTM and log in +3. Subscribe to the RTM channel (to receive agent events) +4. 
Call ConvoAI REST `POST /join` from your **app server** with `channelName`, `uid` (your user's RTC uid), and the RTM flags (see below) +5. Join the RTC channel — the agent will already be there or will join shortly + +### Required ConvoAI Flags for RTM Delivery + +When calling `POST /join`, include both of these in the request body: + +```json +{ + "advanced_features": { + "enable_rtm": true + }, + "parameters": { + "data_channel": "rtm" + } +} +``` + +Both flags are required. Omitting either one means the agent publishes transcripts via RTC data channel instead of RTM, and RTM message events will not fire. + +### RTM Channel Name = RTC Channel Name + +The ConvoAI agent joins the RTM channel with the **same name** as the RTC channel it joined. When handling `onMessageEvent` / `didReceiveMessageEvent`, filter by `channelName` matching your RTC channel if you are subscribed to multiple channels. + +### Agent Events via RTM + +The agent publishes JSON messages to the RTM channel. Parse `event.message.data` (Android) or `event.message.stringData` (iOS) as JSON. The `customType` field indicates the event type (transcript, interruption, state change, etc.). + +For the full event schema and `customType` values, use Level 2 fetch: fetch `https://docs.agora.io/en/llms.txt`, locate the Conversational AI event reference, then fetch it. + +### Token Matrix for ConvoAI + +ConvoAI sessions require three separate tokens: + +| Token | Purpose | Who generates | +|---|---|---| +| RTC token (user) | User joins RTC channel | Your app server | +| RTM token (user) | User logs into RTM | Your app server | +| RTC token (agent) | ConvoAI agent joins RTC | Your app server, passed in POST /join | + +See [conversational-ai/auth-flow.md](conversational-ai/auth-flow.md) for the full three-token flow (REST API implementors) or [conversational-ai/README.md](conversational-ai/README.md) for SDK-based flows. 
+
+## Codec Selection for Mixed Platforms
+
+When Web clients share a channel with iOS or Android native clients:
+
+- Web defaults to `codec: 'vp8'` — this scales well for multi-user calls and is the recommended choice
+- `'vp9'` offers better compression but is hardware-limited on iOS Safari (iPhone 15 Pro / M3 Mac or newer) — use it only if you can ensure participants are on modern hardware
+- `'h264'` does not scale well beyond small groups — avoid it for multi-user scenarios
+- If codecs differ between Web and native clients, Agora transcodes server-side — this works but adds latency and is billed separately
+
+See [rtc/cross-platform-coordination.md](rtc/cross-platform-coordination.md) for full cross-platform interop notes.
+
+## Cleanup Order
+
+Reverse of initialization:
+
+1. Leave the RTC channel
+2. Unsubscribe from RTM channels
+3. Log out of RTM
+4. Release both engines/clients
+
+For ConvoAI: call `POST /leave` (or the SDK equivalent) from your app server before leaving the RTC channel, to give the agent time to exit gracefully.
+
+## Reference Files by Product
+
+| Product | Platform | File |
+|---|---|---|
+| RTC | Web | [rtc/web.md](rtc/web.md) |
+| RTC | React | [rtc/react.md](rtc/react.md) |
+| RTC | Next.js | [rtc/nextjs.md](rtc/nextjs.md) |
+| RTC | iOS | [rtc/ios.md](rtc/ios.md) |
+| RTC | Android | [rtc/android.md](rtc/android.md) |
+| RTC | React Native | [rtc/react-native.md](rtc/react-native.md) |
+| RTC | Flutter | [rtc/flutter.md](rtc/flutter.md) |
+| RTM | Web | [rtm/web.md](rtm/web.md) |
+| RTM | iOS | [rtm/ios.md](rtm/ios.md) |
+| RTM | Android | [rtm/android.md](rtm/android.md) |
+| ConvoAI | All | [conversational-ai/README.md](conversational-ai/README.md) |
+| ConvoAI toolkit | iOS | [conversational-ai/agent-toolkit-ios.md](conversational-ai/agent-toolkit-ios.md) |
+| ConvoAI toolkit | Android | [conversational-ai/agent-toolkit-android.md](conversational-ai/agent-toolkit-android.md) |
+| Tokens | Server | [server/tokens.md](server/tokens.md) |
+| Cross-platform | All | [rtc/cross-platform-coordination.md](rtc/cross-platform-coordination.md) |
diff
--git a/skills/agora/references/rtc/README.md b/skills/agora/references/rtc/README.md index 3202470..d550034 100644 --- a/skills/agora/references/rtc/README.md +++ b/skills/agora/references/rtc/README.md @@ -6,11 +6,11 @@ Real-time audio and video communication. Users join channels, publish local trac 1. **Register event handlers BEFORE joining** the channel, or you will miss events for users already present. 2. **`user-published` fires separately** for audio and video. A user publishing both triggers two events — handle each. -3. **Audio autoplay**: Browsers block audio autoplay. Require user interaction (click/tap) before playing remote audio. -4. **Track cleanup**: Always `stop()` then `close()` local tracks before setting to null. Failure to clean up causes memory leaks and device locks. -5. **HTTPS required** for Web SDK (except `localhost`). -6. **Token management is mandatory in production**. Handle `token-privilege-will-expire` (Web) / `onTokenPrivilegeWillExpire` (native) to renew tokens. UID in token must match UID used to join. -7. **Stream bombing prevention**: In production, generate tokens with subscriber role (`kRoleSubscriber` / `RtcRole.SUBSCRIBER`) for audience-only users to prevent unauthorized publishing. +3. **Track cleanup**: Always `stop()` then `close()` local tracks before setting to null. Failure to clean up causes memory leaks and device locks. (React SDK hooks handle this automatically — see `react.md`.) +4. **HTTPS required** for Web SDK (except `localhost`). +5. **Token management is mandatory in production**. Handle `token-privilege-will-expire` (Web) / `onTokenPrivilegeWillExpire` (native) to renew tokens. UID in token must match UID used to join. +6. **Stream bombing prevention**: In production, generate tokens with subscriber role (`kRoleSubscriber` / `RtcRole.SUBSCRIBER`) for audience-only users to prevent unauthorized publishing. +7. 
**Audio autoplay (non-standard flows only)**: Browser autoplay policy is not an issue in typical RTC flows because the user has already clicked to join (satisfying the gesture requirement). However, if you auto-join on page load, trigger audio from a non-user event, or run in a headless/test environment, `audioTrack.play()` may be silently blocked. Wrap the join + publish sequence in a user gesture handler in these cases. ## Channel Profiles @@ -76,7 +76,7 @@ Screen share typically uses a separate client instance to avoid replacing the ca When Web, iOS, and Android clients share the same channel: -- **Codec**: Web defaults to `"vp8"` but native SDKs typically negotiate H.264. Use `codec: "h264"` on Web for best native interop. If codecs differ, Agora's server transcodes transparently (works but adds latency). +- **Codec**: `"vp8"` and `"vp9"` scale better in multi-user calls — prefer these over `"h264"`, which does not scale well beyond small groups. If codecs differ between Web and native clients, Agora's server transcodes transparently (works but adds latency). - **UID types**: iOS uses `UInt` (unsigned 32-bit), Android uses `Int` (signed 32-bit), Web uses `number`. UIDs > 2,147,483,647 wrap to negative on Android. RTM uses **string UIDs** — use `String(rtcUid)` as a mapping convention. - **Audio profiles**: Align encoder settings across platforms to avoid one side sending stereo 128kbps while another expects mono. Use `"speech_standard"` (Web) / `AUDIO_PROFILE_DEFAULT` (native) for voice calls. - **Orientation**: Mobile uses adaptive orientation (rotates with device). Web cameras are typically landscape. Handle aspect ratio changes on the viewer side. 
@@ -91,6 +91,8 @@ Read the file matching the user's platform: - **[nextjs.md](nextjs.md)** — Next.js / SSR dynamic import patterns (App Router + Pages Router) - **[ios.md](ios.md)** — `AgoraRtcEngineKit` (Swift): engine setup, delegation, permissions - **[android.md](android.md)** — `RtcEngine` (Kotlin/Java): engine setup, callbacks, permissions +- **[react-native.md](react-native.md)** — `react-native-agora`: engine init, events, video views, complete example +- **[flutter.md](flutter.md)** — `agora_rtc_engine` (Dart): engine init, events, AgoraVideoView, complete example - **[cross-platform-coordination.md](cross-platform-coordination.md)** — UID strategy, codec interop, screen sharing across platforms, audio routing, common cross-platform bugs For additional platforms and advanced features: — voice-only: @@ -99,4 +101,4 @@ For test setup and mocking patterns, see [references/testing-guidance/SKILL.md]( ## When to Fetch More -Always use Level 2 fetch for: encoder profile parameter details, error code listings, release notes, Flutter/Windows/Electron/React Native platform quick-starts. See [../doc-fetching.md](../doc-fetching.md). +Always use Level 2 fetch for: encoder profile parameter details, error code listings, release notes, Windows/Electron/Unity platform quick-starts. See [../doc-fetching.md](../doc-fetching.md). diff --git a/skills/agora/references/rtc/android.md b/skills/agora/references/rtc/android.md index c25a28a..a8cebec 100644 --- a/skills/agora/references/rtc/android.md +++ b/skills/agora/references/rtc/android.md @@ -35,6 +35,8 @@ Add permissions to `AndroidManifest.xml`: + + ``` Request runtime permissions for `CAMERA` and `RECORD_AUDIO` before initializing. 
@@ -112,11 +114,9 @@ agoraEngine.setupRemoteVideo( // Enable audio (enabled by default) agoraEngine.enableAudio() -// Audio profile -agoraEngine.setAudioProfile( - Constants.AUDIO_PROFILE_DEFAULT, - Constants.AUDIO_SCENARIO_DEFAULT -) +// Audio profile (SDK 4.x: set profile and scenario separately) +agoraEngine.setAudioProfile(Constants.AUDIO_PROFILE_DEFAULT) +agoraEngine.setAudioScenario(Constants.AUDIO_SCENARIO_DEFAULT) // Mute/unmute local audio agoraEngine.muteLocalAudioStream(true) // mute @@ -183,7 +183,7 @@ private val rtcEventHandler = object : IRtcEngineEventHandler() { // Network quality override fun onNetworkQuality(uid: Int, txQuality: Int, rxQuality: Int) { - // 0=unknown, 1=excellent, 2=good, 3=poor, 4=bad, 5=very bad + // 0=unknown, 1=excellent, 2=good, 3=poor, 4=bad, 5=very bad, 6=disconnected } } ``` diff --git a/skills/agora/references/rtc/cross-platform-coordination.md b/skills/agora/references/rtc/cross-platform-coordination.md index e4a1205..fad76d6 100644 --- a/skills/agora/references/rtc/cross-platform-coordination.md +++ b/skills/agora/references/rtc/cross-platform-coordination.md @@ -16,18 +16,12 @@ Agora handles codec negotiation automatically for most scenarios. What to know: | Codec | Notes | |-------|-------| -| H.264 | Default on iOS and Android. Web supports it but may require software decode on low-end devices. | -| VP8 | Web default. iOS/Android require transcoding — adds ~50–100ms latency. | +| VP8 | Web default. Scales well in multi-user calls. Supported on Safari 13+. Recommended. | +| VP9 | Better compression than VP8. Scales well on desktop. **iOS Safari: hardware-only** — requires iPhone 15 Pro / M3 Mac or newer; software fallback degrades battery significantly on older devices. | +| H.264 | Default on iOS and Android native SDKs. Does not scale well beyond small groups — avoid for multi-user Web calls. | | H.265 (HEVC) | Not universally supported on Web; avoid for cross-platform channels. 
| -**Recommendation**: Enable H.264 explicitly on Web clients when iOS/Android users are present. Transcoding introduces latency and is billed separately. - -```javascript -// Web: force H.264 to match mobile clients -AgoraRTC.setParameter('CODEC', 'h264'); -// or via client config: -const client = AgoraRTC.createClient({ mode: 'rtc', codec: 'h264' }); -``` +**Recommendation**: `'vp8'` is the safest default for multi-user Web calls — scales well and works on all modern Safari (13+). Use `'vp9'` only if you can ensure participants are on modern hardware. Avoid `'h264'` for multi-user Web scenarios. If codecs differ between Web and native clients, Agora's server transcodes transparently, which adds latency and is billed separately. ## Screen Sharing (Cross-Platform) diff --git a/skills/agora/references/rtc/flutter.md b/skills/agora/references/rtc/flutter.md new file mode 100644 index 0000000..6924631 --- /dev/null +++ b/skills/agora/references/rtc/flutter.md @@ -0,0 +1,282 @@ +# Agora RTC Flutter SDK + +## Table of Contents + +- [Installation](#installation) +- [Engine Initialization](#engine-initialization) +- [Joining a Channel](#joining-a-channel) +- [Video Setup](#video-setup) +- [Audio Setup](#audio-setup) +- [Event Handling](#event-handling) +- [Leaving and Cleanup](#leaving-and-cleanup) +- [Complete Example](#complete-example) + +API Reference: + +## Installation + +Add to `pubspec.yaml`: + +```yaml +dependencies: + agora_rtc_engine: ^6.5.0 +``` + +Run `flutter pub get`. 
+ +Add permissions to `AndroidManifest.xml`: + +```xml + + + + + + + + +``` + +Add to `Info.plist` (iOS): + +```xml +NSCameraUsageDescription +Camera access for video calls +NSMicrophoneUsageDescription +Microphone access for audio calls +``` + +## Engine Initialization + +```dart +import 'package:agora_rtc_engine/agora_rtc_engine.dart'; + +late RtcEngine _engine; + +Future initializeAgora() async { + _engine = createAgoraRtcEngine(); + await _engine.initialize(const RtcEngineContext( + appId: 'your-app-id', + channelProfile: ChannelProfileType.channelProfileCommunication, + )); + await _engine.enableVideo(); +} +``` + +## Joining a Channel + +```dart +Future joinChannel() async { + await _engine.joinChannel( + token: null, // null for testing + channelId: 'channel-name', + uid: 0, // 0 for auto-assignment + options: const ChannelMediaOptions( + clientRoleType: ClientRoleType.clientRoleBroadcaster, + publishMicrophoneTrack: true, + publishCameraTrack: true, + autoSubscribeAudio: true, + autoSubscribeVideo: true, + ), + ); +} +``` + +## Video Setup + +```dart +// Local preview — use uid=0 +AgoraVideoView( + controller: VideoViewController( + rtcEngine: _engine, + canvas: const VideoCanvas(uid: 0), + ), +) + +// Remote video — use the remote user's uid +AgoraVideoView( + controller: VideoViewController.remote( + rtcEngine: _engine, + canvas: VideoCanvas(uid: remoteUid), + connection: const RtcConnection(channelId: 'channel-name'), + ), +) +``` + +## Audio Setup + +```dart +// Mute/unmute local audio +await _engine.muteLocalAudioStream(true) // mute +await _engine.muteLocalAudioStream(false) // unmute + +// Mute/unmute local video +await _engine.muteLocalVideoStream(true) // video off +await _engine.muteLocalVideoStream(false) // video on + +// Speaker vs earpiece +await _engine.setEnableSpeakerphone(true) // speaker +await _engine.setEnableSpeakerphone(false) // earpiece +``` + +## Event Handling + +Register handlers **before** joining the channel. 
+ +```dart +_engine.registerEventHandler( + RtcEngineEventHandler( + onJoinChannelSuccess: (RtcConnection connection, int elapsed) { + print('Joined: ${connection.channelId}, uid: ${connection.localUid}'); + }, + onUserJoined: (RtcConnection connection, int remoteUid, int elapsed) { + print('Remote user joined: $remoteUid'); + setState(() => _remoteUid = remoteUid); + }, + onUserOffline: (RtcConnection connection, int remoteUid, UserOfflineReasonType reason) { + print('Remote user left: $remoteUid'); + setState(() => _remoteUid = null); + }, + onTokenPrivilegeWillExpire: (RtcConnection connection, String token) async { + final newToken = await fetchTokenFromServer(); + await _engine.renewToken(newToken); + }, + onError: (ErrorCodeType err, String msg) { + print('Error: $err $msg'); + }, + ), +); +``` + +## Leaving and Cleanup + +```dart +Future leaveChannel() async { + await _engine.leaveChannel(); +} + +// Full cleanup when widget is disposed +@override +void dispose() { + _engine.leaveChannel(); + _engine.release(); + super.dispose(); +} +``` + +Always call `release()` when the engine is no longer needed. + +## Complete Example + +```dart +import 'package:flutter/material.dart'; +import 'package:agora_rtc_engine/agora_rtc_engine.dart'; + +const appId = 'your-app-id'; +const token = null; // null for testing +const channel = 'test'; + +class VideoCallPage extends StatefulWidget { + const VideoCallPage({super.key}); + @override + State createState() => _VideoCallPageState(); +} + +class _VideoCallPageState extends State { + late RtcEngine _engine; + bool _joined = false; + int? 
_remoteUid; + + @override + void initState() { + super.initState(); + _initAgora(); + } + + Future _initAgora() async { + _engine = createAgoraRtcEngine(); + await _engine.initialize(const RtcEngineContext(appId: appId)); + await _engine.enableVideo(); + + _engine.registerEventHandler(RtcEngineEventHandler( + onJoinChannelSuccess: (_, __) => setState(() => _joined = true), + onUserJoined: (_, uid, __) => setState(() => _remoteUid = uid), + onUserOffline: (_, uid, __) => setState(() => _remoteUid = null), + )); + } + + Future _join() async { + await _engine.startPreview(); + await _engine.joinChannel( + token: token, + channelId: channel, + uid: 0, + options: const ChannelMediaOptions( + clientRoleType: ClientRoleType.clientRoleBroadcaster, + publishMicrophoneTrack: true, + publishCameraTrack: true, + ), + ); + } + + Future _leave() async { + await _engine.leaveChannel(); + setState(() { + _joined = false; + _remoteUid = null; + }); + } + + @override + void dispose() { + _engine.release(); + super.dispose(); + } + + @override + Widget build(BuildContext context) { + return Scaffold( + body: Stack( + children: [ + if (_joined) + AgoraVideoView( + controller: VideoViewController( + rtcEngine: _engine, + canvas: const VideoCanvas(uid: 0), + ), + ), + if (_remoteUid != null) + Positioned( + right: 16, top: 16, width: 120, height: 160, + child: AgoraVideoView( + controller: VideoViewController.remote( + rtcEngine: _engine, + canvas: VideoCanvas(uid: _remoteUid!), + connection: const RtcConnection(channelId: channel), + ), + ), + ), + Positioned( + bottom: 32, left: 0, right: 0, + child: Center( + child: ElevatedButton( + onPressed: _joined ? _leave : _join, + child: Text(_joined ? 'Leave' : 'Join'), + ), + ), + ), + ], + ), + ); + } +} +``` + +For test setup and mocking patterns, see [references/testing-guidance/SKILL.md](../testing-guidance/SKILL.md). 
+ +## Official Documentation + +For APIs or features not covered above: + +- Quick-start guide: +- API Reference: diff --git a/skills/agora/references/rtc/react-native.md b/skills/agora/references/rtc/react-native.md new file mode 100644 index 0000000..f4e6e73 --- /dev/null +++ b/skills/agora/references/rtc/react-native.md @@ -0,0 +1,266 @@ +# Agora RTC React Native SDK + +## Table of Contents + +- [Installation](#installation) +- [Engine Initialization](#engine-initialization) +- [Joining a Channel](#joining-a-channel) +- [Video Setup](#video-setup) +- [Audio Setup](#audio-setup) +- [Event Handling](#event-handling) +- [Leaving and Cleanup](#leaving-and-cleanup) +- [Complete Example](#complete-example) + +API Reference: + +## Installation + +```bash +npm install react-native-agora +``` + +For iOS, run `pod install` in the `ios/` directory. Android requires no extra steps beyond Gradle sync. + +Add permissions to `AndroidManifest.xml`: + +```xml + + + + + + + + +``` + +Add to `Info.plist` (iOS): + +```xml +NSCameraUsageDescription +Camera access for video calls +NSMicrophoneUsageDescription +Microphone access for audio calls +``` + +Request runtime permissions before initializing (use `react-native-permissions` or the built-in `PermissionsAndroid`). 
+ +## Engine Initialization + +```typescript +import { + createAgoraRtcEngine, + IRtcEngine, + ChannelProfileType, + ClientRoleType, +} from 'react-native-agora' + +let agoraEngine: IRtcEngine + +function initializeAgora() { + agoraEngine = createAgoraRtcEngine() + agoraEngine.initialize({ + appId: 'your-app-id', + channelProfile: ChannelProfileType.ChannelProfileCommunication, + }) + agoraEngine.enableVideo() +} +``` + +## Joining a Channel + +```typescript +import { ChannelMediaOptions, ClientRoleType } from 'react-native-agora' + +function joinChannel() { + const options: ChannelMediaOptions = { + clientRoleType: ClientRoleType.ClientRoleBroadcaster, + publishMicrophoneTrack: true, + publishCameraTrack: true, + autoSubscribeAudio: true, + autoSubscribeVideo: true, + } + agoraEngine.joinChannel(token, 'channel-name', uid, options) + // token: null for testing, uid: 0 for auto-assignment +} +``` + +## Video Setup + +```typescript +import { RtcSurfaceView, VideoCanvas, RenderModeType } from 'react-native-agora' + +// Local preview — use uid=0 + + +// Remote video — use the remote user's uid + +``` + +## Audio Setup + +```typescript +import { AudioProfileType, AudioScenarioType } from 'react-native-agora' + +// Mute/unmute local audio +agoraEngine.muteLocalAudioStream(true) // mute +agoraEngine.muteLocalAudioStream(false) // unmute + +// Mute/unmute local video +agoraEngine.muteLocalVideoStream(true) // video off +agoraEngine.muteLocalVideoStream(false) // video on + +// Speaker vs earpiece (Android/iOS) +agoraEngine.setEnableSpeakerphone(true) // speaker +agoraEngine.setEnableSpeakerphone(false) // earpiece +``` + +## Event Handling + +Register event handlers **before** joining the channel. 
+ +```typescript +import { IRtcEngineEventHandler } from 'react-native-agora' + +const eventHandler: IRtcEngineEventHandler = { + onJoinChannelSuccess: (connection, elapsed) => { + console.log('Joined channel:', connection.channelId, 'uid:', connection.localUid) + }, + + onUserJoined: (connection, remoteUid, elapsed) => { + console.log('Remote user joined:', remoteUid) + // Update state to render + }, + + onUserOffline: (connection, remoteUid, reason) => { + console.log('Remote user left:', remoteUid) + }, + + onTokenPrivilegeWillExpire: (connection, token) => { + // Fetch new token and renew + fetchNewToken().then(newToken => { + agoraEngine.renewToken(newToken) + }) + }, + + onError: (err, msg) => { + console.error('Agora error:', err, msg) + }, +} + +agoraEngine.registerEventHandler(eventHandler) +``` + +## Leaving and Cleanup + +```typescript +function leaveChannel() { + agoraEngine.leaveChannel() +} + +// Full cleanup when component unmounts +function destroyAgora() { + agoraEngine.leaveChannel() + agoraEngine.unregisterEventHandler(eventHandler) + agoraEngine.release() +} +``` + +Always call `release()` when the engine is no longer needed to free native resources. 
+ +## Complete Example + +```typescript +import React, { useEffect, useState } from 'react' +import { View, Button } from 'react-native' +import { + createAgoraRtcEngine, + IRtcEngine, + IRtcEngineEventHandler, + ChannelProfileType, + ClientRoleType, + RtcSurfaceView, + RenderModeType, +} from 'react-native-agora' + +const APP_ID = 'your-app-id' +const TOKEN = null // null for testing +const CHANNEL = 'test' +const UID = 0 + +export default function VideoCall() { + const [engine, setEngine] = useState(null) + const [joined, setJoined] = useState(false) + const [remoteUid, setRemoteUid] = useState(null) + + useEffect(() => { + const rtcEngine = createAgoraRtcEngine() + rtcEngine.initialize({ + appId: APP_ID, + channelProfile: ChannelProfileType.ChannelProfileCommunication, + }) + rtcEngine.enableVideo() + + const handler: IRtcEngineEventHandler = { + onJoinChannelSuccess: () => setJoined(true), + onUserJoined: (_, uid) => setRemoteUid(uid), + onUserOffline: () => setRemoteUid(null), + } + rtcEngine.registerEventHandler(handler) + setEngine(rtcEngine) + + return () => { + rtcEngine.leaveChannel() + rtcEngine.unregisterEventHandler(handler) + rtcEngine.release() + } + }, []) + + const join = () => { + engine?.joinChannel(TOKEN, CHANNEL, UID, { + clientRoleType: ClientRoleType.ClientRoleBroadcaster, + publishMicrophoneTrack: true, + publishCameraTrack: true, + }) + } + + const leave = () => { + engine?.leaveChannel() + setJoined(false) + setRemoteUid(null) + } + + return ( + + {joined && ( + + )} + {remoteUid !== null && ( + + )} +