From dbcfebc3e559a130f700c83445430c06b139f5b5 Mon Sep 17 00:00:00 2001 From: Mattt Zmuda Date: Sun, 2 Nov 2025 05:56:49 -0800 Subject: [PATCH 01/13] Initial implementation of Gemini support --- README.md | 29 + .../Models/GeminiLanguageModel.swift | 503 ++++++++++++++++++ 2 files changed, 532 insertions(+) create mode 100644 Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift diff --git a/README.md b/README.md index e4ce41ed..0f84a2c3 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ print(response.content) - [x] [llama.cpp](https://github.com/ggml-org/llama.cpp) (GGUF models) - [x] Ollama [HTTP API](https://github.com/ollama/ollama/blob/main/docs/api.md) - [x] Anthropic [Messages API](https://docs.claude.com/en/api/messages) +- [x] Google [Gemini API](https://ai.google.dev/api/generate-content) - [x] OpenAI [Chat Completions API](https://platform.openai.com/docs/api-reference/chat) - [x] OpenAI [Responses API](https://platform.openai.com/docs/api-reference/responses) @@ -228,6 +229,34 @@ let response = try await session.respond { } ``` +### Google Gemini + +Uses the [Gemini API](https://ai.google.dev/api/generate-content) with Gemini models: + +```swift +let model = GeminiLanguageModel( + apiKey: ProcessInfo.processInfo.environment["GEMINI_API_KEY"]!, + model: "gemini-2.5-flash" +) + +let session = LanguageModelSession(model: model, tools: [WeatherTool()]) +let response = try await session.respond { + Prompt("What's the weather like in Tokyo?") +} +``` + +Enable Gemini-specific features like thinking mode and Google Search grounding: + +```swift +let model = GeminiLanguageModel( + apiKey: apiKey, + model: "gemini-2.5-flash", + thinkingBudget: 1024, // Enable thinking mode with budget + includeThoughts: true, // Include thought summaries in response + enableGoogleSearch: true // Enable Google Search grounding +) +``` + ### Ollama Run models locally via Ollama's [HTTP API](https://github.com/ollama/ollama/blob/main/docs/api.md): diff --git a/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift b/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift new file mode 100644 index 00000000..54ab49df --- /dev/null +++ b/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift @@ -0,0 +1,503 @@ +import EventSource +import Foundation +import JSONSchema +import OrderedCollections + +#if canImport(FoundationNetworking) + import FoundationNetworking +#endif + +public struct GeminiLanguageModel: LanguageModel { + public typealias UnavailableReason = Never + + public static let defaultBaseURL = URL(string: "https://generativelanguage.googleapis.com")! + + public static let defaultAPIVersion = "v1beta" + + public let baseURL: URL + + private let tokenProvider: @Sendable () -> String + + public let apiVersion: String + + public let model: String + + public var thinkingBudget: Int? + + public var includeThoughts: Bool + + public var enableGoogleSearch: Bool + + private let urlSession: URLSession + + public init( + baseURL: URL = defaultBaseURL, + apiKey tokenProvider: @escaping @autoclosure @Sendable () -> String, + apiVersion: String = defaultAPIVersion, + model: String, + thinkingBudget: Int? = nil, + includeThoughts: Bool = false, + enableGoogleSearch: Bool = false, + session: URLSession = URLSession(configuration: .default) + ) { + var baseURL = baseURL + if !baseURL.path.hasSuffix("/") { + baseURL = baseURL.appendingPathComponent("") + } + + self.baseURL = baseURL + self.tokenProvider = tokenProvider + self.apiVersion = apiVersion + self.model = model + self.thinkingBudget = thinkingBudget + self.includeThoughts = includeThoughts + self.enableGoogleSearch = enableGoogleSearch + self.urlSession = session + } + + public func respond( + within session: LanguageModelSession, + to prompt: Prompt, + generating type: Content.Type, + includeSchemaInPrompt: Bool, + options: GenerationOptions + ) async throws -> LanguageModelSession.Response where Content: Generable { + guard type == String.self else { + fatalError("GeminiLanguageModel only supports generating String content") + } + + let url = + baseURL + .appendingPathComponent(apiVersion) + .appendingPathComponent("models/\(model):generateContent") + let headers = buildHeaders() + + let contents = [ + GeminiContent(role: .user, parts: [.text(GeminiTextPart(text: prompt.description))]) + ] + + let geminiTools = try buildTools(from: session.tools) + + let params = try createGenerateContentParams( + contents: contents, + tools: geminiTools, + options: options, + thinkingBudget: thinkingBudget, + includeThoughts: includeThoughts + ) + + let body = try JSONEncoder().encode(params) + + let response: GeminiGenerateContentResponse = try await urlSession.fetch( + .post, + url: url, + headers: headers, + body: body + ) + + var entries: [Transcript.Entry] = [] + + guard let firstCandidate = response.candidates.first else { + throw GeminiError.noCandidate + } + + let functionCalls: [GeminiFunctionCall] = firstCandidate.content.parts.compactMap { part in + if case .functionCall(let call) = part { return call } + return nil + } + + if !functionCalls.isEmpty { + let invocations = try await resolveFunctionCalls(functionCalls, session: session) + if !invocations.isEmpty { + entries.append(.toolCalls(Transcript.ToolCalls(invocations.map(\.call)))) + for invocation in invocations { + entries.append(.toolOutput(invocation.output)) + } + } + } + + let text = firstCandidate.content.parts.compactMap { part -> String? in + switch part { + case .text(let t): return t.text + default: return nil + } + }.joined() + + return LanguageModelSession.Response( + content: text as! Content, + rawContent: GeneratedContent(text), + transcriptEntries: ArraySlice(entries) + ) + } + + public func streamResponse( + within session: LanguageModelSession, + to prompt: Prompt, + generating type: Content.Type, + includeSchemaInPrompt: Bool, + options: GenerationOptions + ) -> sending LanguageModelSession.ResponseStream where Content: Generable { + guard type == String.self else { + fatalError("GeminiLanguageModel only supports generating String content") + } + + let contents = [ + GeminiContent(role: .user, parts: [.text(GeminiTextPart(text: prompt.description))]) + ] + + let url = + baseURL + .appendingPathComponent(apiVersion) + .appendingPathComponent("models/\(model):streamGenerateContent") + + let thinkingBudget = self.thinkingBudget + let includeThoughts = self.includeThoughts + + let stream: AsyncThrowingStream.Snapshot, any Error> = .init { + continuation in + let task = Task { @Sendable in + do { + let headers = buildHeaders() + + let geminiTools = try buildTools(from: session.tools) + + var params = try createGenerateContentParams( + contents: contents, + tools: geminiTools, + options: options, + thinkingBudget: thinkingBudget, + includeThoughts: includeThoughts + ) + params["stream"] = .bool(true) + + let body = try JSONEncoder().encode(params) + + let stream: AsyncThrowingStream = + urlSession + .fetchStream( + .post, + url: url, + headers: headers, + body: body + ) + + var accumulatedText = "" + + for try await chunk in stream { + guard let candidate = chunk.candidates.first else { continue } + + for part in candidate.content.parts { + if case .text(let textPart) = part { + accumulatedText += textPart.text + + let raw = GeneratedContent(accumulatedText) + let content: Content.PartiallyGenerated = (accumulatedText as! Content) + .asPartiallyGenerated() + continuation.yield(.init(content: content, rawContent: raw)) + } + } + } + + continuation.finish() + } catch { + continuation.finish(throwing: error) + } + } + continuation.onTermination = { _ in task.cancel() } + } + + return LanguageModelSession.ResponseStream(stream: stream) + } + + private func buildHeaders() -> [String: String] { + let headers: [String: String] = [ + "x-goog-api-key": tokenProvider() + ] + + return headers + } + + private func buildTools(from tools: [any Tool]) throws -> [GeminiTool]? { + var geminiTools: [GeminiTool] = [] + + if !tools.isEmpty { + let functionDeclarations: [GeminiFunctionDeclaration] = try tools.map { tool in + try convertToolToGeminiFormat(tool) + } + geminiTools.append(GeminiTool(functionDeclarations: functionDeclarations)) + } + + if enableGoogleSearch { + geminiTools.append(GeminiTool(googleSearch: GeminiGoogleSearch())) + } + + return geminiTools.isEmpty ? nil : geminiTools + } +} + +private func createGenerateContentParams( + contents: [GeminiContent], + tools: [GeminiTool]?, + options: GenerationOptions, + thinkingBudget: Int?, + includeThoughts: Bool +) throws -> [String: JSONValue] { + var params: [String: JSONValue] = [ + "contents": try JSONValue(contents) + ] + + if let tools, !tools.isEmpty { + params["tools"] = try JSONValue(tools) + } + + var generationConfig: [String: JSONValue] = [:] + + if let maxTokens = options.maximumResponseTokens { + generationConfig["maxOutputTokens"] = .int(maxTokens) + } + + if let temperature = options.temperature { + generationConfig["temperature"] = .double(temperature) + } + + if thinkingBudget != nil || includeThoughts { + var thinkingConfig: [String: JSONValue] = [:] + + if let budget = thinkingBudget { + thinkingConfig["thinkingBudget"] = .int(budget) + } + + if includeThoughts { + thinkingConfig["includeThoughts"] = .bool(true) + } + + if !thinkingConfig.isEmpty { + generationConfig["thinkingConfig"] = .object(thinkingConfig) + } + } + + if !generationConfig.isEmpty { + params["generationConfig"] = .object(generationConfig) + } + + return params +} + +private struct ToolInvocationResult { + let call: Transcript.ToolCall + let output: Transcript.ToolOutput +} + +private func resolveFunctionCalls( + _ functionCalls: [GeminiFunctionCall], + session: LanguageModelSession +) async throws -> [ToolInvocationResult] { + if functionCalls.isEmpty { return [] } + + var toolsByName: [String: any Tool] = [:] + for tool in session.tools { + if toolsByName[tool.name] == nil { + toolsByName[tool.name] = tool + } + } + + var results: [ToolInvocationResult] = [] + results.reserveCapacity(functionCalls.count) + + for call in functionCalls { + let args = try toGeneratedContent(call.args) + let callID = UUID().uuidString + let transcriptCall = Transcript.ToolCall( + id: callID, + toolName: call.name, + arguments: args + ) + + guard let tool = toolsByName[call.name] else { + let message = Transcript.Segment.text(.init(content: "Tool not found: \(call.name)")) + let output = Transcript.ToolOutput( + id: callID, + toolName: call.name, + segments: [message] + ) + results.append(ToolInvocationResult(call: transcriptCall, output: output)) + continue + } + + do { + let segments = try await tool.makeOutputSegments(from: args) + let output = Transcript.ToolOutput( + id: callID, + toolName: tool.name, + segments: segments + ) + results.append(ToolInvocationResult(call: transcriptCall, output: output)) + } catch { + throw LanguageModelSession.ToolCallError(tool: tool, underlyingError: error) + } + } + + return results +} + +private func convertToolToGeminiFormat(_ tool: any Tool) throws -> GeminiFunctionDeclaration { + let resolvedSchema = tool.parameters.withResolvedRoot() ?? tool.parameters + + let data = try JSONEncoder().encode(resolvedSchema) + let schema = try JSONDecoder().decode(JSONSchema.self, from: data) + + return GeminiFunctionDeclaration( + name: tool.name, + description: tool.description, + parameters: schema + ) +} + +private func toGeneratedContent(_ value: [String: JSONValue]?) throws -> GeneratedContent { + guard let value else { return GeneratedContent(properties: [:]) } + let data = try JSONEncoder().encode(JSONValue.object(value)) + let json = String(data: data, encoding: .utf8) ?? "{}" + return try GeneratedContent(json: json) +} + +private struct GeminiTool: Codable, Sendable { + let functionDeclarations: [GeminiFunctionDeclaration]? + let googleSearch: GeminiGoogleSearch? + + enum CodingKeys: String, CodingKey { + case functionDeclarations = "function_declarations" + case googleSearch = "google_search" + } + + init(functionDeclarations: [GeminiFunctionDeclaration]) { + self.functionDeclarations = functionDeclarations + self.googleSearch = nil + } + + init(googleSearch: GeminiGoogleSearch) { + self.functionDeclarations = nil + self.googleSearch = googleSearch + } +} + +private struct GeminiGoogleSearch: Codable, Sendable {} + +private struct GeminiFunctionDeclaration: Codable, Sendable { + let name: String + let description: String + let parameters: JSONSchema +} + +private struct GeminiContent: Codable, Sendable { + enum Role: String, Codable, Sendable { + case user + case model + case tool + } + + let role: Role + let parts: [GeminiPart] +} + +private enum GeminiPart: Codable, Sendable { + case text(GeminiTextPart) + case functionCall(GeminiFunctionCall) + case functionResponse(GeminiFunctionResponse) + + enum CodingKeys: String, CodingKey { + case text + case functionCall + case functionResponse + } + + init(from decoder: any Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + + if container.contains(.text) { + self = .text(try GeminiTextPart(from: decoder)) + } else if container.contains(.functionCall) { + self = .functionCall(try GeminiFunctionCall(from: decoder)) + } else if container.contains(.functionResponse) { + self = .functionResponse(try GeminiFunctionResponse(from: decoder)) + } else { + throw DecodingError.dataCorrupted( + DecodingError.Context( + codingPath: decoder.codingPath, + debugDescription: "Unable to decode GeminiPart" + ) + ) + } + } + + func encode(to encoder: any Encoder) throws { + switch self { + case .text(let part): try part.encode(to: encoder) + case .functionCall(let call): try call.encode(to: encoder) + case .functionResponse(let response): try response.encode(to: encoder) + } + } +} + +private struct GeminiTextPart: Codable, Sendable { + let text: String +} + +private struct GeminiFunctionCall: Codable, Sendable { + let name: String + let args: [String: JSONValue]? + + enum CodingKeys: String, CodingKey { + case name + case args + } +} + +private struct GeminiFunctionResponse: Codable, Sendable { + let name: String + let response: [String: JSONValue] +} + +private struct GeminiGenerateContentResponse: Codable, Sendable { + let candidates: [GeminiCandidate] + let usageMetadata: GeminiUsageMetadata? + + enum CodingKeys: String, CodingKey { + case candidates + case usageMetadata = "usageMetadata" + } +} + +private struct GeminiCandidate: Codable, Sendable { + let content: GeminiContent + let finishReason: String? + + enum CodingKeys: String, CodingKey { + case content + case finishReason + } +} + +private struct GeminiUsageMetadata: Codable, Sendable { + let promptTokenCount: Int? + let candidatesTokenCount: Int? + let totalTokenCount: Int? + let thoughtsTokenCount: Int? + + enum CodingKeys: String, CodingKey { + case promptTokenCount + case candidatesTokenCount + case totalTokenCount + case thoughtsTokenCount + } +} + +enum GeminiError: Error, CustomStringConvertible { + case noCandidate + + var description: String { + switch self { + case .noCandidate: + return "No candidate in response" + } + } +} From 023604bdf2370acd0f5b3f8a18fdc278e5629e21 Mon Sep 17 00:00:00 2001 From: Mattt Zmuda Date: Sun, 2 Nov 2025 06:29:25 -0800 Subject: [PATCH 02/13] Improve ergnomics of thinking and server tools --- README.md | 71 ++++- .../Models/GeminiLanguageModel.swift | 278 +++++++++++++----- 2 files changed, 265 insertions(+), 84 deletions(-) diff --git a/README.md b/README.md index 0f84a2c3..d43e57d4 100644 --- a/README.md +++ b/README.md @@ -245,15 +245,78 @@ let response = try await session.respond { } ``` -Enable Gemini-specific features like thinking mode and Google Search grounding: +Enable Gemini-specific features like thinking mode and server-side tools: ```swift +// Enable thinking mode with specific token budget let model = GeminiLanguageModel( apiKey: apiKey, model: "gemini-2.5-flash", - thinkingBudget: 1024, // Enable thinking mode with budget - includeThoughts: true, // Include thought summaries in response - enableGoogleSearch: true // Enable Google Search grounding + thinking: .budget(1024), // Specific thinking budget + serverTools: [.googleSearch] // Enable Google Search grounding +) + +// Enable dynamic thinking (model decides budget) +let model = GeminiLanguageModel( + apiKey: apiKey, + model: "gemini-2.5-flash", + thinking: true, // Dynamic thinking + serverTools: [.googleSearch, .codeExecution] +) + +// Disable thinking (default) +let model = GeminiLanguageModel( + apiKey: apiKey, + model: "gemini-2.5-flash", + thinking: .disabled +) +``` + +#### Server-Side Tools + +Gemini supports server-side tools that execute transparently on Google's infrastructure: + +```swift +// Google Search - provides real-time web information +let model = GeminiLanguageModel( + apiKey: apiKey, + model: "gemini-2.5-flash", + serverTools: [.googleSearch] +) + +// URL Context - fetches and analyzes content from URLs mentioned in prompts +let model = GeminiLanguageModel( + apiKey: apiKey, + model: "gemini-2.5-flash", + serverTools: [.urlContext] +) + +// Code Execution - generates and runs Python code to solve problems +let model = GeminiLanguageModel( + apiKey: apiKey, + model: "gemini-2.5-flash", + serverTools: [.codeExecution] +) + +// Google Maps - provides location-aware responses +let model = GeminiLanguageModel( + apiKey: apiKey, + model: "gemini-2.5-flash", + serverTools: [ + .googleMaps(latitude: 37.7749, longitude: -122.4194) + ] +) + +// Combine multiple server tools +let model = GeminiLanguageModel( + apiKey: apiKey, + model: "gemini-2.5-flash", + serverTools: [ + .googleSearch, + .codeExecution, + .urlContext, + .googleMaps(latitude: nil, longitude: nil) // Optional location + ] ) ``` diff --git a/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift b/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift index 54ab49df..c603d033 100644 --- a/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift +++ b/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift @@ -14,6 +14,35 @@ public struct GeminiLanguageModel: LanguageModel { public static let defaultAPIVersion = "v1beta" + public enum Thinking: Sendable, ExpressibleByBooleanLiteral, ExpressibleByIntegerLiteral { + case disabled + case dynamic + case budget(Int) + + var budgetValue: Int? { + switch self { + case .disabled: return 0 + case .dynamic: return -1 + case .budget(let value): return value + } + } + + public init(booleanLiteral value: Bool) { + self = value ? .dynamic : .disabled + } + + public init(integerLiteral value: Int) { + self = .budget(value) + } + } + + public enum GeminiServerTool: Sendable, Equatable { + case googleSearch + case urlContext + case codeExecution + case googleMaps(latitude: Double?, longitude: Double?) + } + public let baseURL: URL private let tokenProvider: @Sendable () -> String @@ -22,11 +51,9 @@ public struct GeminiLanguageModel: LanguageModel { public let model: String - public var thinkingBudget: Int? + public var thinking: Thinking - public var includeThoughts: Bool - - public var enableGoogleSearch: Bool + public var serverTools: [GeminiServerTool] private let urlSession: URLSession @@ -35,9 +62,8 @@ public struct GeminiLanguageModel: LanguageModel { apiKey tokenProvider: @escaping @autoclosure @Sendable () -> String, apiVersion: String = defaultAPIVersion, model: String, - thinkingBudget: Int? = nil, - includeThoughts: Bool = false, - enableGoogleSearch: Bool = false, + thinking: Thinking = .disabled, + serverTools: [GeminiServerTool] = [], session: URLSession = URLSession(configuration: .default) ) { var baseURL = baseURL @@ -49,9 +75,8 @@ public struct GeminiLanguageModel: LanguageModel { self.tokenProvider = tokenProvider self.apiVersion = apiVersion self.model = model - self.thinkingBudget = thinkingBudget - self.includeThoughts = includeThoughts - self.enableGoogleSearch = enableGoogleSearch + self.thinking = thinking + self.serverTools = serverTools self.urlSession = session } @@ -72,62 +97,89 @@ public struct GeminiLanguageModel: LanguageModel { .appendingPathComponent("models/\(model):generateContent") let headers = buildHeaders() - let contents = [ + var contents = [ GeminiContent(role: .user, parts: [.text(GeminiTextPart(text: prompt.description))]) ] let geminiTools = try buildTools(from: session.tools) - let params = try createGenerateContentParams( - contents: contents, - tools: geminiTools, - options: options, - thinkingBudget: thinkingBudget, - includeThoughts: includeThoughts - ) + var allEntries: [Transcript.Entry] = [] - let body = try JSONEncoder().encode(params) + // Multi-turn conversation loop for tool calling + while true { + let params = try createGenerateContentParams( + contents: contents, + tools: geminiTools, + options: options, + thinking: thinking + ) - let response: GeminiGenerateContentResponse = try await urlSession.fetch( - .post, - url: url, - headers: headers, - body: body - ) + let body = try JSONEncoder().encode(params) - var entries: [Transcript.Entry] = [] + let response: GeminiGenerateContentResponse = try await urlSession.fetch( + .post, + url: url, + headers: headers, + body: body + ) - guard let firstCandidate = response.candidates.first else { - throw GeminiError.noCandidate - } + guard let firstCandidate = response.candidates.first else { + throw GeminiError.noCandidate + } - let functionCalls: [GeminiFunctionCall] = firstCandidate.content.parts.compactMap { part in - if case .functionCall(let call) = part { return call } - return nil - } + let functionCalls: [GeminiFunctionCall] = firstCandidate.content.parts.compactMap { part in + if case .functionCall(let call) = part { return call } + return nil + } + + if !functionCalls.isEmpty { + // Append the model's response with function calls to the conversation + contents.append(firstCandidate.content) + + // Resolve function calls + let invocations = try await resolveFunctionCalls(functionCalls, session: session) + if !invocations.isEmpty { + allEntries.append(.toolCalls(Transcript.ToolCalls(invocations.map(\.call)))) + + // Build tool response parts for Gemini + var toolParts: [GeminiPart] = [] + for invocation in invocations { + allEntries.append(.toolOutput(invocation.output)) + + // Convert tool output to function response + let responseValue = try toJSONValue(invocation.output) + toolParts.append( + .functionResponse( + GeminiFunctionResponse( + name: invocation.call.toolName, + response: responseValue + ) + ) + ) + } - if !functionCalls.isEmpty { - let invocations = try await resolveFunctionCalls(functionCalls, session: session) - if !invocations.isEmpty { - entries.append(.toolCalls(Transcript.ToolCalls(invocations.map(\.call)))) - for invocation in invocations { - entries.append(.toolOutput(invocation.output)) + // Append tool responses to the conversation + contents.append(GeminiContent(role: .tool, parts: toolParts)) } - } - } - let text = firstCandidate.content.parts.compactMap { part -> String? in - switch part { - case .text(let t): return t.text - default: return nil - } - }.joined() + // Continue the loop to send the next request with tool results + continue + } else { + // No function calls, extract final text and return + let text = firstCandidate.content.parts.compactMap { part -> String? in + switch part { + case .text(let t): return t.text + default: return nil + } + }.joined() - return LanguageModelSession.Response( - content: text as! Content, - rawContent: GeneratedContent(text), - transcriptEntries: ArraySlice(entries) - ) + return LanguageModelSession.Response( + content: text as! Content, + rawContent: GeneratedContent(text), + transcriptEntries: ArraySlice(allEntries) + ) + } + } } public func streamResponse( @@ -150,8 +202,7 @@ public struct GeminiLanguageModel: LanguageModel { .appendingPathComponent(apiVersion) .appendingPathComponent("models/\(model):streamGenerateContent") - let thinkingBudget = self.thinkingBudget - let includeThoughts = self.includeThoughts + let thinking = self.thinking let stream: AsyncThrowingStream.Snapshot, any Error> = .init { continuation in @@ -165,8 +216,7 @@ public struct GeminiLanguageModel: LanguageModel { contents: contents, tools: geminiTools, options: options, - thinkingBudget: thinkingBudget, - includeThoughts: includeThoughts + thinking: thinking ) params["stream"] = .bool(true) @@ -224,11 +274,20 @@ public struct GeminiLanguageModel: LanguageModel { let functionDeclarations: [GeminiFunctionDeclaration] = try tools.map { tool in try convertToolToGeminiFormat(tool) } - geminiTools.append(GeminiTool(functionDeclarations: functionDeclarations)) + geminiTools.append(.functionDeclarations(functionDeclarations)) } - if enableGoogleSearch { - geminiTools.append(GeminiTool(googleSearch: GeminiGoogleSearch())) + for serverTool in serverTools { + switch serverTool { + case .googleSearch: + geminiTools.append(.googleSearch) + case .urlContext: + geminiTools.append(.urlContext) + case .codeExecution: + geminiTools.append(.codeExecution) + case .googleMaps(let latitude, let longitude): + geminiTools.append(.googleMaps(latitude: latitude, longitude: longitude)) + } } return geminiTools.isEmpty ? nil : geminiTools @@ -239,8 +298,7 @@ private func createGenerateContentParams( contents: [GeminiContent], tools: [GeminiTool]?, options: GenerationOptions, - thinkingBudget: Int?, - includeThoughts: Bool + thinking: GeminiLanguageModel.Thinking ) throws -> [String: JSONValue] { var params: [String: JSONValue] = [ "contents": try JSONValue(contents) @@ -260,20 +318,17 @@ private func createGenerateContentParams( generationConfig["temperature"] = .double(temperature) } - if thinkingBudget != nil || includeThoughts { + if case .disabled = thinking { + } else { var thinkingConfig: [String: JSONValue] = [:] - if let budget = thinkingBudget { + if let budget = thinking.budgetValue { thinkingConfig["thinkingBudget"] = .int(budget) } - if includeThoughts { - thinkingConfig["includeThoughts"] = .bool(true) - } + thinkingConfig["includeThoughts"] = .bool(true) - if !thinkingConfig.isEmpty { - generationConfig["thinkingConfig"] = .object(thinkingConfig) - } + generationConfig["thinkingConfig"] = .object(thinkingConfig) } if !generationConfig.isEmpty { @@ -360,28 +415,91 @@ private func toGeneratedContent(_ value: [String: JSONValue]?) throws -> Generat return try GeneratedContent(json: json) } -private struct GeminiTool: Codable, Sendable { - let functionDeclarations: [GeminiFunctionDeclaration]? - let googleSearch: GeminiGoogleSearch? +private func toJSONValue(_ toolOutput: Transcript.ToolOutput) throws -> [String: JSONValue] { + var result: [String: JSONValue] = [:] + + for segment in toolOutput.segments { + switch segment { + case .text(let text): + result["result"] = .string(text.content) + case .structure(let structured): + // For structured segments, encode the content + let data = try JSONEncoder().encode(structured.content) + if let jsonString = String(data: data, encoding: .utf8) { + result["result"] = .string(jsonString) + } + } + } + + return result +} + +private enum GeminiTool: Codable, Sendable { + case functionDeclarations([GeminiFunctionDeclaration]) + case googleSearch + case urlContext + case codeExecution + case googleMaps(latitude: Double?, longitude: Double?) enum CodingKeys: String, CodingKey { case functionDeclarations = "function_declarations" case googleSearch = "google_search" + case urlContext = "url_context" + case codeExecution = "code_execution" + case googleMaps = "google_maps" } - init(functionDeclarations: [GeminiFunctionDeclaration]) { - self.functionDeclarations = functionDeclarations - self.googleSearch = nil + init(from decoder: any Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + + if let declarations = try container.decodeIfPresent( + [GeminiFunctionDeclaration].self, + forKey: .functionDeclarations + ) { + self = .functionDeclarations(declarations) + } else if container.contains(.googleSearch) { + self = .googleSearch + } else if container.contains(.urlContext) { + self = .urlContext + } else if container.contains(.codeExecution) { + self = .codeExecution + } else if let mapsData = try container.decodeIfPresent(GoogleMapsPayload.self, forKey: .googleMaps) { + self = .googleMaps(latitude: mapsData.lat, longitude: mapsData.lng) + } else { + throw DecodingError.dataCorrupted( + DecodingError.Context( + codingPath: decoder.codingPath, + debugDescription: "Unable to decode GeminiTool" + ) + ) + } + } + + func encode(to encoder: any Encoder) throws { + var container = encoder.container(keyedBy: CodingKeys.self) + + switch self { + case .functionDeclarations(let declarations): + try container.encode(declarations, forKey: .functionDeclarations) + case .googleSearch: + try container.encode(EmptyObject(), forKey: .googleSearch) + case .urlContext: + try container.encode(EmptyObject(), forKey: .urlContext) + case .codeExecution: + try container.encode(EmptyObject(), forKey: .codeExecution) + case .googleMaps(let latitude, let longitude): + try container.encode(GoogleMapsPayload(lat: latitude, lng: longitude), forKey: .googleMaps) + } } - init(googleSearch: GeminiGoogleSearch) { - self.functionDeclarations = nil - self.googleSearch = googleSearch + private struct EmptyObject: Codable {} + + private struct GoogleMapsPayload: Codable { + let lat: Double? + let lng: Double? } } -private struct GeminiGoogleSearch: Codable, Sendable {} - private struct GeminiFunctionDeclaration: Codable, Sendable { let name: String let description: String From 719f8a9e49995f6e918af0303cfabc8f66e792d7 Mon Sep 17 00:00:00 2001 From: Mattt Zmuda Date: Sun, 2 Nov 2025 06:29:56 -0800 Subject: [PATCH 03/13] Rename GeminiServerTool to ServerTool --- Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift b/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift index c603d033..5a0faddd 100644 --- a/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift +++ b/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift @@ -36,7 +36,7 @@ public struct GeminiLanguageModel: LanguageModel { } } - public enum GeminiServerTool: Sendable, Equatable { + public enum ServerTool: Sendable, Equatable { case googleSearch case urlContext case codeExecution @@ -53,7 +53,7 @@ public struct GeminiLanguageModel: LanguageModel { public var thinking: Thinking - public var serverTools: [GeminiServerTool] + public var serverTools: [ServerTool] private let urlSession: URLSession @@ -63,7 +63,7 @@ public struct GeminiLanguageModel: LanguageModel { apiVersion: String = defaultAPIVersion, model: String, thinking: Thinking = .disabled, - serverTools: [GeminiServerTool] = [], + serverTools: [ServerTool] = [], session: URLSession = URLSession(configuration: .default) ) { var baseURL = baseURL From e96c0f9a0ffb6f5575b39540a8876878d201b74c Mon Sep 17 00:00:00 2001 From: Mattt Zmuda Date: Mon, 3 Nov 2025 01:26:44 -0800 Subject: [PATCH 04/13] Update README --- README.md | 82 ++++++++++++++++++------------------------------------- 1 file changed, 26 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md index d43e57d4..e7aad9ee 100644 --- a/README.md +++ b/README.md @@ -245,81 +245,51 @@ let response = try await session.respond { } ``` -Enable Gemini-specific features like thinking mode and server-side tools: +Gemini models use an internal ["thinking process"](https://ai.google.dev/gemini-api/docs/thinking) +that improves reasoning and multi-step planning. +You can configure how much Gemini should "think" using the `thinking` parameter: ```swift -// Enable thinking mode with specific token budget -let model = GeminiLanguageModel( +// Enable thinking +var model = GeminiLanguageModel( apiKey: apiKey, model: "gemini-2.5-flash", - thinking: .budget(1024), // Specific thinking budget - serverTools: [.googleSearch] // Enable Google Search grounding + thinking: true /* or `.dynamic` */, ) -// Enable dynamic thinking (model decides budget) -let model = GeminiLanguageModel( - apiKey: apiKey, - model: "gemini-2.5-flash", - thinking: true, // Dynamic thinking - serverTools: [.googleSearch, .codeExecution] -) +// Set an explicit number of tokens for its thinking budget +model.thinking = .budget(1024) -// Disable thinking (default) -let model = GeminiLanguageModel( - apiKey: apiKey, - model: "gemini-2.5-flash", - thinking: .disabled -) +// Revert to default configuration without thinking +model.thinking = false /* or `.disabled` */ ``` -#### Server-Side Tools - -Gemini supports server-side tools that execute transparently on Google's infrastructure: +Gemini supports [server-side tools](https://ai.google.dev/gemini-api/docs/google-search) +that execute transparently on Google's infrastructure: ```swift -// Google Search - provides real-time web information -let model = GeminiLanguageModel( - apiKey: apiKey, - model: "gemini-2.5-flash", - serverTools: [.googleSearch] -) - -// URL Context - fetches and analyzes content from URLs mentioned in prompts -let model = GeminiLanguageModel( - apiKey: apiKey, - model: "gemini-2.5-flash", - serverTools: [.urlContext] -) - -// Code Execution - generates and runs Python code to solve problems -let model = GeminiLanguageModel( - apiKey: apiKey, - model: "gemini-2.5-flash", - serverTools: [.codeExecution] -) - -// Google Maps - provides location-aware responses let model = GeminiLanguageModel( apiKey: apiKey, model: "gemini-2.5-flash", serverTools: [ - .googleMaps(latitude: 37.7749, longitude: -122.4194) - ] -) - -// Combine multiple server tools -let model = GeminiLanguageModel( - apiKey: apiKey, - model: "gemini-2.5-flash", - serverTools: [ - .googleSearch, - .codeExecution, - .urlContext, - .googleMaps(latitude: nil, longitude: nil) // Optional location + .googleMaps(latitude: 37.7749, longitude: -122.4194) // Optional location ] ) ``` +- `.googleSearch` + Grounds responses with real-time web information +- `.googleMaps` + Provides location-aware responses +- `.codeExecution` + Generates and runs Python code to solve problems +- `.urlContext` + Fetches and analyzes content from URLs mentioned in prompts + +> [!TIP] +> Google server tools are exclusive to Gemini models, +> and not available as client tools (`Tool`) for other models. + ### Ollama Run models locally via Ollama's [HTTP API](https://github.com/ollama/ollama/blob/main/docs/api.md): From bcd610058d4b1c5cc55256d5bdcb8725efd0bd29 Mon Sep 17 00:00:00 2001 From: Mattt Zmuda Date: Mon, 3 Nov 2025 02:38:47 -0800 Subject: [PATCH 05/13] sse as url query parameter, not stream body parameter --- Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift b/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift index 5a0faddd..d360152d 100644 --- a/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift +++ b/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift @@ -197,10 +197,12 @@ public struct GeminiLanguageModel: LanguageModel { GeminiContent(role: .user, parts: [.text(GeminiTextPart(text: prompt.description))]) ] - let url = + var streamURL = baseURL .appendingPathComponent(apiVersion) .appendingPathComponent("models/\(model):streamGenerateContent") + streamURL.append(queryItems: [URLQueryItem(name: "alt", value: "sse")]) + let url = streamURL let thinking = self.thinking @@ -212,13 +214,12 @@ public struct GeminiLanguageModel: LanguageModel { let geminiTools = try buildTools(from: session.tools) - var params = try createGenerateContentParams( + let params = try createGenerateContentParams( contents: contents, tools: geminiTools, options: options, thinking: thinking ) - params["stream"] = .bool(true) let body = try JSONEncoder().encode(params) From 3fc7529abf3f15936ccd685a4b2d2532120308b1 Mon Sep 17 00:00:00 2001 From: Mattt Zmuda Date: Mon, 3 Nov 2025 02:39:26 -0800 Subject: [PATCH 06/13] Add test coverage for GeminiLanguageModel --- .../GeminiLanguageModelTests.swift | 116 ++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift diff --git a/Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift b/Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift new file mode 100644 index 00000000..2e5259e0 --- /dev/null +++ b/Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift @@ -0,0 +1,116 @@ +import Foundation +import Testing + +@testable import AnyLanguageModel + +private let geminiAPIKey: String? = ProcessInfo.processInfo.environment["GEMINI_API_KEY"] + +@Suite("GeminiLanguageModel", .enabled(if: geminiAPIKey?.isEmpty == false)) +struct GeminiLanguageModelTests { + let model = GeminiLanguageModel( + apiKey: geminiAPIKey!, + model: "gemini-2.5-flash" + ) + + @Test func customHost() throws { + let customURL = URL(string: "https://example.com")! + let model = GeminiLanguageModel(baseURL: customURL, apiKey: "test", model: "test-model") + #expect(model.baseURL.absoluteString.hasSuffix("/")) + } + + @Test func basicResponse() async throws { + let session = LanguageModelSession(model: model) + let response = try await session.respond(to: "Say hello") + #expect(!response.content.isEmpty) + } + + @Test func withInstructions() async throws { + let session = LanguageModelSession( + model: model, + instructions: "You are a helpful assistant. Be concise." + ) + + let response = try await session.respond(to: "What is 2+2?") + #expect(!response.content.isEmpty) + } + + @Test func streaming() async throws { + let session = LanguageModelSession(model: model) + + let stream = session.streamResponse(to: "Count to 5") + var chunks: [String] = [] + + for try await response in stream { + chunks.append(response.content) + } + + #expect(!chunks.isEmpty) + } + + @Test func streamingString() async throws { + let session = LanguageModelSession(model: model) + + let stream = session.streamResponse(to: "Say 'Hello' slowly") + + var snapshots: [LanguageModelSession.ResponseStream.Snapshot] = [] + for try await snapshot in stream { + snapshots.append(snapshot) + } + + #expect(!snapshots.isEmpty) + #expect(!snapshots.last!.rawContent.jsonString.isEmpty) + } + + @Test func withGenerationOptions() async throws { + let session = LanguageModelSession(model: model) + + let options = GenerationOptions( + temperature: 0.7, + maximumResponseTokens: 50 + ) + + let response = try await session.respond( + to: "Tell me a fact", + options: options + ) + #expect(!response.content.isEmpty) + } + + @Test func conversationContext() async throws { + let session = LanguageModelSession(model: model) + + let firstResponse = try await session.respond(to: "My favorite color is blue") + #expect(!firstResponse.content.isEmpty) + + let secondResponse = try await session.respond(to: "What did I just tell you?") + #expect(!secondResponse.content.isEmpty) + } + + @Test func withClientTools() async throws { + let weatherTool = WeatherTool() + let session = LanguageModelSession(model: model, tools: [weatherTool]) + + let response = try await session.respond(to: "How's the weather in San Francisco?") + + var foundToolOutput = false + for case let .toolOutput(toolOutput) in response.transcriptEntries { + #expect(toolOutput.id == "getWeather") + foundToolOutput = true + } + #expect(foundToolOutput) + } + + @Test func withServerTools() async throws { + let model = GeminiLanguageModel( + apiKey: geminiAPIKey!, + model: "gemini-2.5-flash", + serverTools: [ + .googleMaps(latitude: 37.7749, longitude: -122.4194) + ] + ) + + let session = LanguageModelSession(model: model) + let response = try await session.respond(to: "What coffee shops are nearby?") + #expect(!response.content.isEmpty) + } +} From aaaea581359bcfc36d4b18c69e0c9c0951463642 Mon Sep 17 00:00:00 2001 From: Mattt Zmuda Date: Mon, 3 Nov 2025 03:12:56 -0800 Subject: [PATCH 07/13] Add custom user info key to control omission of additionalProperties --- .../AnyLanguageModel/GenerationSchema.swift | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/Sources/AnyLanguageModel/GenerationSchema.swift b/Sources/AnyLanguageModel/GenerationSchema.swift index 00e29031..694dac0c 100644 --- a/Sources/AnyLanguageModel/GenerationSchema.swift +++ b/Sources/AnyLanguageModel/GenerationSchema.swift @@ -9,9 +9,6 @@ import struct Foundation.Decimal /// Generation schemas guide the output of a ``SystemLanguageModel`` to deterministically /// ensure the output is in the desired format. public struct GenerationSchema: Sendable, Codable, CustomDebugStringConvertible { - - // MARK: - Structure - indirect enum Node: Sendable, Codable { case object(ObjectNode) case array(ArrayNode) @@ -44,7 +41,12 @@ public struct GenerationSchema: Sendable, Codable, CustomDebugStringConvertible try propsContainer.encode(node, forKey: DynamicCodingKey(stringValue: name)!) } try container.encode(Array(obj.required), forKey: .required) - try container.encode(false, forKey: .additionalProperties) + + // Check userInfo to see if additionalProperties should be omitted + let shouldOmit = encoder.userInfo[GenerationSchema.omitAdditionalPropertiesKey] as? Bool ?? false + if !shouldOmit { + try container.encode(false, forKey: .additionalProperties) + } case .array(let arr): try container.encode("array", forKey: .type) @@ -201,8 +203,6 @@ public struct GenerationSchema: Sendable, Codable, CustomDebugStringConvertible } } - // MARK: - Properties - let root: Node private var defs: [String: Node] @@ -774,3 +774,21 @@ extension GenerationSchema { } } } + +// MARK: - CodingUserInfoKey + +extension GenerationSchema { + /// A key used in the encoder's `userInfo` dictionary to control whether + /// the `additionalProperties` field should be omitted from the encoded output. + /// + /// Set this to `true` to omit `additionalProperties` from object schemas. + /// Defaults to `false` (includes `additionalProperties`) if not specified. + /// + /// Example: + /// ```swift + /// let encoder = JSONEncoder() + /// encoder.userInfo[GenerationSchema.omitAdditionalPropertiesKey] = true + /// let data = try encoder.encode(schema) + /// ``` + static let omitAdditionalPropertiesKey = CodingUserInfoKey(rawValue: "GenerationSchema.omitAdditionalProperties")! +} From 649e5dcc2a9e335674ee8ff52aa3d388cd5917a9 Mon Sep 17 00:00:00 2001 From: Mattt Zmuda Date: Mon, 3 Nov 2025 03:13:45 -0800 Subject: [PATCH 08/13] Fix encoding of server tools --- .../Models/GeminiLanguageModel.swift | 97 ++++++++----------- 1 file changed, 40 insertions(+), 57 deletions(-) diff --git a/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift b/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift index d360152d..96d54a6a 100644 --- a/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift +++ b/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift @@ -306,7 +306,15 @@ private func createGenerateContentParams( ] if let tools, !tools.isEmpty { - params["tools"] = try JSONValue(tools) + params["tools"] = try .array(tools.map { try $0.jsonValue }) + + // Add toolConfig if any tool provides one + for tool in tools { + if let toolConfig = tool.toolConfigValue { + params["toolConfig"] = toolConfig + break + } + } } var generationConfig: [String: JSONValue] = [:] @@ -319,18 +327,17 @@ private func createGenerateContentParams( generationConfig["temperature"] = .double(temperature) } + var thinkingConfig: [String: JSONValue] = [:] if case .disabled = thinking { + thinkingConfig["includeThoughts"] = .bool(false) } else { - var thinkingConfig: [String: JSONValue] = [:] + thinkingConfig["includeThoughts"] = .bool(true) if let budget = thinking.budgetValue { thinkingConfig["thinkingBudget"] = .int(budget) } - - thinkingConfig["includeThoughts"] = .bool(true) - - generationConfig["thinkingConfig"] = .object(thinkingConfig) } + generationConfig["thinkingConfig"] = .object(thinkingConfig) if !generationConfig.isEmpty { params["generationConfig"] = .object(generationConfig) @@ -435,70 +442,46 @@ private func toJSONValue(_ toolOutput: Transcript.ToolOutput) throws -> [String: return result } -private enum GeminiTool: Codable, Sendable { +private enum GeminiTool: Sendable { case functionDeclarations([GeminiFunctionDeclaration]) case googleSearch case urlContext case codeExecution case googleMaps(latitude: Double?, longitude: Double?) - enum CodingKeys: String, CodingKey { - case functionDeclarations = "function_declarations" - case googleSearch = "google_search" - case urlContext = "url_context" - case codeExecution = "code_execution" - case googleMaps = "google_maps" - } - - init(from decoder: any Decoder) throws { - let container = try decoder.container(keyedBy: CodingKeys.self) - - if let declarations = try container.decodeIfPresent( - [GeminiFunctionDeclaration].self, - forKey: .functionDeclarations - ) { - self = .functionDeclarations(declarations) - } else if container.contains(.googleSearch) { - self = .googleSearch - } else if container.contains(.urlContext) { - self = .urlContext - } else if container.contains(.codeExecution) { - self = .codeExecution - } else if let mapsData = try container.decodeIfPresent(GoogleMapsPayload.self, forKey: .googleMaps) { - self = .googleMaps(latitude: mapsData.lat, longitude: mapsData.lng) - } else { - throw DecodingError.dataCorrupted( - DecodingError.Context( - codingPath: decoder.codingPath, - debugDescription: "Unable to decode GeminiTool" - ) - ) + var jsonValue: JSONValue { + get throws { + switch self { + case .functionDeclarations(let declarations): + return .object(["function_declarations": try JSONValue(declarations)]) + case .googleSearch: + return .object(["google_search": .object([:])]) + case .urlContext: + return .object(["url_context": .object([:])]) + case .codeExecution: + return .object(["code_execution": .object([:])]) + case .googleMaps: + return .object(["google_maps": .object([:])]) + } } } - func encode(to encoder: any Encoder) throws { - var container = encoder.container(keyedBy: CodingKeys.self) - + var toolConfigValue: JSONValue? { switch self { - case .functionDeclarations(let declarations): - try container.encode(declarations, forKey: .functionDeclarations) - case .googleSearch: - try container.encode(EmptyObject(), forKey: .googleSearch) - case .urlContext: - try container.encode(EmptyObject(), forKey: .urlContext) - case .codeExecution: - try container.encode(EmptyObject(), forKey: .codeExecution) case .googleMaps(let latitude, let longitude): - try container.encode(GoogleMapsPayload(lat: latitude, lng: longitude), forKey: .googleMaps) + guard let lat = latitude, let lng = longitude else { return nil } + return .object([ + "retrievalConfig": .object([ + "latLng": .object([ + "latitude": .double(lat), + "longitude": .double(lng), + ]) + ]) + ]) + default: + return nil } } - - private struct EmptyObject: Codable {} - - private struct GoogleMapsPayload: Codable { - let lat: Double? - let lng: Double? - } } private struct GeminiFunctionDeclaration: Codable, Sendable { From 5a767d43d70d7e2160d18567ebe6af3a2deb959b Mon Sep 17 00:00:00 2001 From: Mattt Zmuda Date: Mon, 3 Nov 2025 03:14:32 -0800 Subject: [PATCH 09/13] Serialize Gemini tests to avoid API rate limiting --- Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift b/Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift index 2e5259e0..a8253355 100644 --- a/Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift +++ b/Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift @@ -5,7 +5,7 @@ import Testing private let geminiAPIKey: String? = ProcessInfo.processInfo.environment["GEMINI_API_KEY"] -@Suite("GeminiLanguageModel", .enabled(if: geminiAPIKey?.isEmpty == false)) +@Suite("GeminiLanguageModel", .serialized, .enabled(if: geminiAPIKey?.isEmpty == false)) struct GeminiLanguageModelTests { let model = GeminiLanguageModel( apiKey: geminiAPIKey!, From cd351df9d2af65cfeed7512c39531549629073b4 Mon Sep 17 00:00:00 2001 From: Mattt Zmuda Date: Mon, 3 Nov 2025 03:14:58 -0800 Subject: [PATCH 10/13] Disable thinking for withGenerationOptions test --- .../AnyLanguageModelTests/GeminiLanguageModelTests.swift | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift b/Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift index a8253355..2bda08a2 100644 --- a/Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift +++ b/Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift @@ -62,7 +62,13 @@ struct GeminiLanguageModelTests { } @Test func withGenerationOptions() async throws { - let session = LanguageModelSession(model: model) + // Use a model with thinking disabled to avoid consuming all tokens on thinking + let modelWithoutThinking = GeminiLanguageModel( + apiKey: geminiAPIKey!, + model: "gemini-2.5-flash", + thinking: false + ) + let session = LanguageModelSession(model: modelWithoutThinking) let options = GenerationOptions( temperature: 0.7, From ba55d9dcedabbfbcea4efffbe0547c3fcf6c1bc3 Mon Sep 17 00:00:00 2001 From: Mattt Zmuda Date: Mon, 3 Nov 2025 03:15:08 -0800 Subject: [PATCH 11/13] Fix various coding issues --- .../Models/GeminiLanguageModel.swift | 67 +++++++++++-------- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift b/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift index 96d54a6a..88429e03 100644 --- a/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift +++ b/Sources/AnyLanguageModel/Models/GeminiLanguageModel.swift @@ -127,10 +127,11 @@ public struct GeminiLanguageModel: LanguageModel { throw GeminiError.noCandidate } - let functionCalls: [GeminiFunctionCall] = firstCandidate.content.parts.compactMap { part in - if case .functionCall(let call) = part { return call } - return nil - } + let functionCalls: [GeminiFunctionCall] = + firstCandidate.content.parts?.compactMap { part in + if case .functionCall(let call) = part { return call } + return nil + } ?? [] if !functionCalls.isEmpty { // Append the model's response with function calls to the conversation @@ -166,12 +167,13 @@ public struct GeminiLanguageModel: LanguageModel { continue } else { // No function calls, extract final text and return - let text = firstCandidate.content.parts.compactMap { part -> String? in - switch part { - case .text(let t): return t.text - default: return nil - } - }.joined() + let text = + firstCandidate.content.parts?.compactMap { part -> String? in + switch part { + case .text(let t): return t.text + default: return nil + } + }.joined() ?? "" return LanguageModelSession.Response( content: text as! Content, @@ -225,7 +227,7 @@ public struct GeminiLanguageModel: LanguageModel { let stream: AsyncThrowingStream = urlSession - .fetchStream( + .fetchEventStream( .post, url: url, headers: headers, @@ -237,14 +239,16 @@ public struct GeminiLanguageModel: LanguageModel { for try await chunk in stream { guard let candidate = chunk.candidates.first else { continue } - for part in candidate.content.parts { - if case .text(let textPart) = part { - accumulatedText += textPart.text + if let parts = candidate.content.parts { + for part in parts { + if case .text(let textPart) = part { + accumulatedText += textPart.text - let raw = GeneratedContent(accumulatedText) - let content: Content.PartiallyGenerated = (accumulatedText as! Content) - .asPartiallyGenerated() - continuation.yield(.init(content: content, rawContent: raw)) + let raw = GeneratedContent(accumulatedText) + let content: Content.PartiallyGenerated = (accumulatedText as! Content) + .asPartiallyGenerated() + continuation.yield(.init(content: content, rawContent: raw)) + } } } } @@ -390,7 +394,7 @@ private func resolveFunctionCalls( do { let segments = try await tool.makeOutputSegments(from: args) let output = Transcript.ToolOutput( - id: callID, + id: tool.name, toolName: tool.name, segments: segments ) @@ -406,7 +410,9 @@ private func resolveFunctionCalls( private func convertToolToGeminiFormat(_ tool: any Tool) throws -> GeminiFunctionDeclaration { let resolvedSchema = tool.parameters.withResolvedRoot() ?? tool.parameters - let data = try JSONEncoder().encode(resolvedSchema) + let encoder = JSONEncoder() + encoder.userInfo[GenerationSchema.omitAdditionalPropertiesKey] = true + let data = try encoder.encode(resolvedSchema) let schema = try JSONDecoder().decode(JSONSchema.self, from: data) return GeminiFunctionDeclaration( @@ -498,7 +504,7 @@ private struct GeminiContent: Codable, Sendable { } let role: Role - let parts: [GeminiPart] + let parts: [GeminiPart]? } private enum GeminiPart: Codable, Sendable { @@ -510,17 +516,20 @@ private enum GeminiPart: Codable, Sendable { case text case functionCall case functionResponse + case thoughtSignature } init(from decoder: any Decoder) throws { let container = try decoder.container(keyedBy: CodingKeys.self) if container.contains(.text) { - self = .text(try GeminiTextPart(from: decoder)) + let text = try container.decode(String.self, forKey: .text) + self = .text(GeminiTextPart(text: text)) } else if container.contains(.functionCall) { - self = .functionCall(try GeminiFunctionCall(from: decoder)) + // Note: thoughtSignature may be present but is ignored + self = .functionCall(try container.decode(GeminiFunctionCall.self, forKey: .functionCall)) } else if container.contains(.functionResponse) { - self = .functionResponse(try GeminiFunctionResponse(from: decoder)) + self = .functionResponse(try container.decode(GeminiFunctionResponse.self, forKey: .functionResponse)) } else { throw DecodingError.dataCorrupted( DecodingError.Context( @@ -532,10 +541,14 @@ private enum GeminiPart: Codable, Sendable { } func encode(to encoder: any Encoder) throws { + var container = encoder.container(keyedBy: CodingKeys.self) switch self { - case .text(let part): try part.encode(to: encoder) - case .functionCall(let call): try call.encode(to: encoder) - case .functionResponse(let response): try response.encode(to: encoder) + case .text(let part): + try container.encode(part.text, forKey: .text) + case .functionCall(let call): + try container.encode(call, forKey: .functionCall) + case .functionResponse(let response): + try container.encode(response, forKey: .functionResponse) } } } From 9d886bacbafad3a3d288cdf601fe84b57035d3a1 Mon Sep 17 00:00:00 2001 From: Mattt Zmuda Date: Mon, 3 Nov 2025 03:17:54 -0800 Subject: [PATCH 12/13] Bump maximum token count for withGenerationOptions test --- Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift b/Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift index 2bda08a2..420d36da 100644 --- a/Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift +++ b/Tests/AnyLanguageModelTests/GeminiLanguageModelTests.swift @@ -72,7 +72,7 @@ struct GeminiLanguageModelTests { let options = GenerationOptions( temperature: 0.7, - maximumResponseTokens: 50 + maximumResponseTokens: 2048 ) let response = try await session.respond( From 6828cdff4223d19b3e03e0953b007dc9095af601 Mon Sep 17 00:00:00 2001 From: Mattt Zmuda Date: Mon, 3 Nov 2025 03:19:25 -0800 Subject: [PATCH 13/13] Update README --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e7aad9ee..2fbaf1d2 100644 --- a/README.md +++ b/README.md @@ -272,11 +272,13 @@ let model = GeminiLanguageModel( apiKey: apiKey, model: "gemini-2.5-flash", serverTools: [ - .googleMaps(latitude: 37.7749, longitude: -122.4194) // Optional location + .googleMaps(latitude: 35.6580, longitude: 139.7016) // Optional location ] ) ``` +**Available server tools**: + - `.googleSearch` Grounds responses with real-time web information - `.googleMaps` @@ -287,8 +289,7 @@ let model = GeminiLanguageModel( Fetches and analyzes content from URLs mentioned in prompts > [!TIP] -> Google server tools are exclusive to Gemini models, -> and not available as client tools (`Tool`) for other models. +> Gemini server tools are not available as client tools (`Tool`) for other models. ### Ollama