diff --git a/Voxt.xcodeproj/project.pbxproj b/Voxt.xcodeproj/project.pbxproj index aeff42f..6e17eaf 100644 --- a/Voxt.xcodeproj/project.pbxproj +++ b/Voxt.xcodeproj/project.pbxproj @@ -681,7 +681,6 @@ DEBUG_INFORMATION_FORMAT = dwarf; ENABLE_TESTABILITY = YES; ENABLE_USER_SCRIPT_SANDBOXING = YES; - GENERATE_INFOPLIST_FILE = YES; GCC_DYNAMIC_NO_PIC = NO; GCC_NO_COMMON_BLOCKS = YES; GCC_PREPROCESSOR_DEFINITIONS = ( @@ -696,6 +695,7 @@ GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_LABEL = YES; GCC_WARN_UNUSED_VARIABLE = YES; + GENERATE_INFOPLIST_FILE = YES; INSTALL_PATH = "$(TEST_HOST)/../.."; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", @@ -752,7 +752,6 @@ ENABLE_NS_ASSERTIONS = NO; ENABLE_TESTABILITY = YES; ENABLE_USER_SCRIPT_SANDBOXING = YES; - GENERATE_INFOPLIST_FILE = YES; GCC_DYNAMIC_NO_PIC = NO; GCC_NO_COMMON_BLOCKS = YES; GCC_WARN_64_TO_32_BIT_CONVERSION = YES; @@ -763,6 +762,7 @@ GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_LABEL = YES; GCC_WARN_UNUSED_VARIABLE = YES; + GENERATE_INFOPLIST_FILE = YES; INSTALL_PATH = "$(TEST_HOST)/../.."; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", @@ -815,7 +815,6 @@ DEBUG_INFORMATION_FORMAT = dwarf; ENABLE_TESTABILITY = YES; ENABLE_USER_SCRIPT_SANDBOXING = YES; - GENERATE_INFOPLIST_FILE = YES; GCC_DYNAMIC_NO_PIC = NO; GCC_NO_COMMON_BLOCKS = YES; GCC_PREPROCESSOR_DEFINITIONS = ( @@ -830,6 +829,7 @@ GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_LABEL = YES; GCC_WARN_UNUSED_VARIABLE = YES; + GENERATE_INFOPLIST_FILE = YES; INSTALL_PATH = "$(TEST_HOST)/../.."; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", diff --git a/Voxt/App/AppDelegate+HotkeyLifecycle.swift b/Voxt/App/AppDelegate+HotkeyLifecycle.swift index 821f734..2fec652 100644 --- a/Voxt/App/AppDelegate+HotkeyLifecycle.swift +++ b/Voxt/App/AppDelegate+HotkeyLifecycle.swift @@ -3,6 +3,23 @@ import Carbon @MainActor extension AppDelegate { + enum SessionCallbackHandlingDecision: Equatable { + case accept + case rejectStale + case rejectCancelled + + var logDescription: String { + switch self { + case .accept: + return "accept" + case .rejectStale: + return "stale-session" + case .rejectCancelled: + return "cancelled-session" + } + } + } + func setupHotkey() { // Callback contract: // - HotkeyManager only emits normalized events (transcriptionDown/up, translationDown/up, rewriteDown/up). @@ -403,16 +420,35 @@ extension AppDelegate { pendingTranscriptionStartTask = nil } + nonisolated static func sessionCallbackHandlingDecision( + requestedSessionID: UUID, + activeSessionID: UUID, + isSessionCancellationRequested: Bool + ) -> SessionCallbackHandlingDecision { + guard requestedSessionID == activeSessionID else { + return .rejectStale + } + guard !isSessionCancellationRequested else { + return .rejectCancelled + } + return .accept + } + func shouldHandleCallbacks(for sessionID: UUID) -> Bool { - guard sessionID == activeRecordingSessionID else { + switch Self.sessionCallbackHandlingDecision( + requestedSessionID: sessionID, + activeSessionID: activeRecordingSessionID, + isSessionCancellationRequested: isSessionCancellationRequested + ) { + case .accept: + return true + case .rejectStale: VoxtLog.info("Ignoring stale session callback. sessionID=\(sessionID.uuidString)", verbose: true) return false - } - guard !isSessionCancellationRequested else { + case .rejectCancelled: VoxtLog.info("Ignoring callback for cancelled session. 
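// Sketch (illustrative, not part of the diff): the nonisolated pure function
// above makes the callback gating checkable without an AppDelegate instance.
// Note the ordering: the stale-session guard runs before the cancellation
// guard, so a stale ID wins even when cancellation was also requested.
func sessionCallbackDecisionChecks() {
    let active = UUID()
    assert(AppDelegate.sessionCallbackHandlingDecision(
        requestedSessionID: active, activeSessionID: active,
        isSessionCancellationRequested: false) == .accept)
    assert(AppDelegate.sessionCallbackHandlingDecision(
        requestedSessionID: UUID(), activeSessionID: active,
        isSessionCancellationRequested: true) == .rejectStale) // stale beats cancelled
    assert(AppDelegate.sessionCallbackHandlingDecision(
        requestedSessionID: active, activeSessionID: active,
        isSessionCancellationRequested: true) == .rejectCancelled)
}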
sessionID=\(sessionID.uuidString)", verbose: true) return false } - return true } var sessionOutputModeLabel: String { diff --git a/Voxt/App/AppDelegate+PreferencesAndHistory.swift b/Voxt/App/AppDelegate+PreferencesAndHistory.swift index 3a99931..b528355 100644 --- a/Voxt/App/AppDelegate+PreferencesAndHistory.swift +++ b/Voxt/App/AppDelegate+PreferencesAndHistory.swift @@ -254,7 +254,7 @@ extension AppDelegate { } var whisperRealtimeEnabled: Bool { - defaults.object(forKey: AppPreferenceKey.whisperRealtimeEnabled) as? Bool ?? true + defaults.object(forKey: AppPreferenceKey.whisperRealtimeEnabled) as? Bool ?? false } var whisperKeepResidentLoaded: Bool { @@ -319,13 +319,15 @@ extension AppDelegate { let historyKind = resolvedHistoryKind(for: outputMode) VoxtLog.info( - "History append requested. kind=\(historyKind.rawValue), engine=\(transcriptionEngine.rawValue), historyEnabled=\(historyEnabled), audioStorageEnabled=\(historyAudioStorageEnabled), stashedAudio=\(pendingCompletedHistoryAudioArchiveURL != nil)" + "History append requested. kind=\(historyKind.rawValue), engine=\(transcriptionEngine.rawValue), historyEnabled=\(historyEnabled), audioStorageEnabled=\(historyAudioStorageEnabled), stashedAudio=\(pendingCompletedHistoryAudioArchiveURL != nil)", + verbose: true ) let pendingAudioArchiveURL = consumePendingCompletedHistoryAudioURL() if let pendingAudioArchiveURL { let exists = FileManager.default.fileExists(atPath: pendingAudioArchiveURL.path) VoxtLog.info( - "History append consumed pending audio archive. kind=\(historyKind.rawValue), file=\(pendingAudioArchiveURL.lastPathComponent), exists=\(exists)" + "History append consumed pending audio archive. kind=\(historyKind.rawValue), file=\(pendingAudioArchiveURL.lastPathComponent), exists=\(exists)", + verbose: true ) } else { VoxtLog.warning( @@ -519,7 +521,7 @@ extension AppDelegate { } } } else { - try? historyStore.replaceAudioArchive(for: activeEntryID, with: pendingAudioArchiveURL) + _ = try? historyStore.replaceAudioArchive(for: activeEntryID, with: pendingAudioArchiveURL) } } @@ -674,7 +676,8 @@ extension AppDelegate { self.pendingCompletedHistoryAudioArchiveURL = nil let exists = FileManager.default.fileExists(atPath: pendingCompletedHistoryAudioArchiveURL.path) VoxtLog.info( - "Consumed stashed history audio archive. file=\(pendingCompletedHistoryAudioArchiveURL.lastPathComponent), exists=\(exists)" + "Consumed stashed history audio archive. file=\(pendingCompletedHistoryAudioArchiveURL.lastPathComponent), exists=\(exists)", + verbose: true ) return pendingCompletedHistoryAudioArchiveURL } @@ -692,7 +695,8 @@ extension AppDelegate { if let consumedURL { let exists = FileManager.default.fileExists(atPath: consumedURL.path) VoxtLog.info( - "Consumed transcriber history audio archive. engine=\(transcriptionEngine.rawValue), file=\(consumedURL.lastPathComponent), exists=\(exists)" + "Consumed transcriber history audio archive. engine=\(transcriptionEngine.rawValue), file=\(consumedURL.lastPathComponent), exists=\(exists)", + verbose: true ) } else { VoxtLog.warning( @@ -725,9 +729,7 @@ extension AppDelegate { try? FileManager.default.removeItem(at: pendingCompletedHistoryAudioArchiveURL) } pendingCompletedHistoryAudioArchiveURL = url - let exists = FileManager.default.fileExists(atPath: url.path) - let fileSize = (try? url.resourceValues(forKeys: [.fileSizeKey]).fileSize) ?? 0 - VoxtLog.info("Pending history audio archive stashed. 
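// Sketch (illustrative, not part of the diff): the history-append logs above
// moved to `verbose: true`, so they are gated by VoxtLog.verboseEnabled
// (declared later in this diff). Message strings here are placeholders:
VoxtLog.verboseEnabled = false
VoxtLog.info("example verbose line", verbose: true) // dropped
VoxtLog.info("example always-on line")              // still emitted
VoxtLog.verboseEnabled = true
VoxtLog.info("example verbose line", verbose: true) // now emitted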
file=\(url.lastPathComponent), exists=\(exists), size=\(fileSize)") + VoxtLog.info("Pending history audio archive stashed. file=\(url.lastPathComponent)") } private func importConsumedAudioArchiveIfNeeded( diff --git a/Voxt/App/AppDelegate+RecordingSession.swift b/Voxt/App/AppDelegate+RecordingSession.swift index 79395bb..744e861 100644 --- a/Voxt/App/AppDelegate+RecordingSession.swift +++ b/Voxt/App/AppDelegate+RecordingSession.swift @@ -5,6 +5,74 @@ import AVFoundation import Speech extension AppDelegate { + enum StopRecordingFallbackDecision: Equatable { + case finishNow + case extendGrace(seconds: TimeInterval) + } + + nonisolated static func stopRecordingFallbackDecision( + transcriptionEngine: TranscriptionEngine, + isWhisperFinalizing: Bool, + transcriptionResultReceived: Bool, + isExtendedGrace: Bool + ) -> StopRecordingFallbackDecision { + guard transcriptionEngine == .whisperKit else { return .finishNow } + guard isWhisperFinalizing else { return .finishNow } + guard !transcriptionResultReceived else { return .finishNow } + guard !isExtendedGrace else { return .finishNow } + return .extendGrace(seconds: 12) + } + + private func shouldDeferStopRecordingFallback() -> Bool { + guard transcriptionEngine == .whisperKit else { return false } + guard whisperTranscriber?.isFinalizingTranscription == true else { return false } + guard transcriptionResultReceivedAt == nil else { return false } + return true + } + + private func armStopRecordingFallback( + timeoutSeconds: TimeInterval, + isExtendedGrace: Bool = false + ) { + let armedSessionID = activeRecordingSessionID + stopRecordingFallbackTask = Task { [weak self] in + guard let self else { return } + do { + try await Task.sleep(for: .seconds(timeoutSeconds)) + } catch { + return + } + guard !Task.isCancelled else { return } + guard self.isSessionActive, self.activeRecordingSessionID == armedSessionID else { return } + + let fallbackDecision = Self.stopRecordingFallbackDecision( + transcriptionEngine: self.transcriptionEngine, + isWhisperFinalizing: self.shouldDeferStopRecordingFallback(), + transcriptionResultReceived: self.transcriptionResultReceivedAt != nil, + isExtendedGrace: isExtendedGrace + ) + if case .extendGrace(let graceSeconds) = fallbackDecision { + VoxtLog.warning( + """ + Stop recording fallback reached while Whisper finalization is still running; extending grace. sessionID=\(armedSessionID.uuidString), engine=\(self.transcriptionEngine.rawValue), output=\(RecordingSessionSupport.outputLabel(for: self.sessionOutputMode)) + """ + ) + self.armStopRecordingFallback(timeoutSeconds: graceSeconds, isExtendedGrace: true) + return + } + + VoxtLog.warning( + """ + Stop recording fallback triggered; forcing session finish. sessionID=\(self.activeRecordingSessionID.uuidString), engine=\(self.transcriptionEngine.rawValue), output=\(RecordingSessionSupport.outputLabel(for: self.sessionOutputMode)), resultReceived=\(self.transcriptionResultReceivedAt != nil), endingSessionID=\(self.currentEndingSessionID?.uuidString ?? 
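// Sketch (illustrative, not part of the diff): the fallback decision is now a
// nonisolated pure function, so the one-shot grace-extension rule can be
// checked directly:
func stopFallbackDecisionChecks() {
    // Whisper still finalizing, no result yet, first timeout -> extend once.
    assert(AppDelegate.stopRecordingFallbackDecision(
        transcriptionEngine: .whisperKit,
        isWhisperFinalizing: true,
        transcriptionResultReceived: false,
        isExtendedGrace: false) == .extendGrace(seconds: 12))
    // Second timeout while still finalizing -> finish; never extend twice.
    assert(AppDelegate.stopRecordingFallbackDecision(
        transcriptionEngine: .whisperKit,
        isWhisperFinalizing: true,
        transcriptionResultReceived: false,
        isExtendedGrace: true) == .finishNow)
    // Non-Whisper engines never extend.
    assert(AppDelegate.stopRecordingFallbackDecision(
        transcriptionEngine: .remote,
        isWhisperFinalizing: false,
        transcriptionResultReceived: false,
        isExtendedGrace: false) == .finishNow)
}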
"nil"), whisperFinalizing=\(self.whisperTranscriber?.isFinalizingTranscription == true) + """ + ) + if self.transcriptionEngine == .remote { + self.remoteASRTranscriber.discardPendingSessionOutput() + } + self.finishSession(after: 0) + } + } + func continueRewriteConversation() { guard overlayState.canContinueRewriteAnswer else { return } overlayState.beginRewriteConversationIfNeeded() @@ -43,7 +111,7 @@ extension AppDelegate { whisperTranscriber?.stopRecording() remoteASRTranscriber.discardPendingSessionOutput() if preservePendingHistoryAudio { - VoxtLog.info("Preserving pending history audio during residual resource release. reason=\(reason)") + VoxtLog.info("Preserving pending history audio during residual resource release. reason=\(reason)", verbose: true) } else { discardPendingCompletedHistoryAudio() } @@ -220,21 +288,7 @@ extension AppDelegate { transcriptionEngine: transcriptionEngine, remoteProvider: remoteASRSelectedProvider ) - stopRecordingFallbackTask = Task { [weak self] in - guard let self else { return } - do { - try await Task.sleep(for: .seconds(fallbackTimeoutSeconds)) - } catch { - return - } - guard !Task.isCancelled else { return } - guard self.isSessionActive else { return } - VoxtLog.warning("Stop recording fallback triggered; forcing session finish.") - if self.transcriptionEngine == .remote { - self.remoteASRTranscriber.discardPendingSessionOutput() - } - self.finishSession(after: 0) - } + armStopRecordingFallback(timeoutSeconds: fallbackTimeoutSeconds) } func cancelActiveRecordingSession() { diff --git a/Voxt/App/AppDelegate+RecordingSessionCapture.swift b/Voxt/App/AppDelegate+RecordingSessionCapture.swift index 2f32a52..9147cc6 100644 --- a/Voxt/App/AppDelegate+RecordingSessionCapture.swift +++ b/Voxt/App/AppDelegate+RecordingSessionCapture.swift @@ -96,10 +96,7 @@ extension AppDelegate { whisper.transcribedText = "" whisper.isModelInitializing = needsModelInitialization whisper.setPreferredInputDevice(selectedInputDeviceID) - whisper.onPartialTranscription = { [weak self] text in - guard let self, self.shouldHandleCallbacks(for: sessionID) else { return } - self.overlayState.transcribedText = text - } + whisper.onPartialTranscription = nil whisper.onTranscriptionFinished = { [weak self] text in self?.stashPendingCompletedHistoryAudioArchive(self?.whisperTranscriber?.consumeCompletedAudioArchiveURL()) self?.processTranscription(text, sessionID: sessionID) @@ -335,6 +332,12 @@ extension AppDelegate { if transcriptionEngine == .mlxAudio, isMLXReady { mlxTranscriber?.stopRecording() } else if transcriptionEngine == .whisperKit, isWhisperReady { + if let whisperTranscriber { + VoxtLog.info( + "Issuing Whisper stop. 
\(whisperTranscriber.debugCaptureStopSummary())", + verbose: true + ) + } whisperTranscriber?.stopRecording() } else if transcriptionEngine == .remote { remoteASRTranscriber.stopRecording() diff --git a/Voxt/App/AppDelegate+RecordingSessionTextRouting.swift b/Voxt/App/AppDelegate+RecordingSessionTextRouting.swift index c37a05f..14406ae 100644 --- a/Voxt/App/AppDelegate+RecordingSessionTextRouting.swift +++ b/Voxt/App/AppDelegate+RecordingSessionTextRouting.swift @@ -6,7 +6,20 @@ extension AppDelegate { } func processTranscription(_ rawText: String, sessionID: UUID) { - guard shouldHandleCallbacks(for: sessionID) else { return } + let callbackDecision = Self.sessionCallbackHandlingDecision( + requestedSessionID: sessionID, + activeSessionID: activeRecordingSessionID, + isSessionCancellationRequested: isSessionCancellationRequested + ) + guard callbackDecision == .accept else { + VoxtLog.info( + """ + Dropping transcription callback before processing. reason=\(callbackDecision.logDescription), callbackSessionID=\(sessionID.uuidString), activeSessionID=\(activeRecordingSessionID.uuidString), stopped=\(recordingStoppedAt != nil), endingSessionID=\(currentEndingSessionID?.uuidString ?? "nil"), rawChars=\(rawText.count) + """, + verbose: true + ) + return + } if didCommitSessionOutput { VoxtLog.info("Ignoring transcription callback because current session output has already been committed.") return @@ -16,6 +29,13 @@ extension AppDelegate { stopRecordingFallbackTask = nil transcriptionResultReceivedAt = Date() + if let stoppedAt = recordingStoppedAt { + let stopToResultMs = max(Int(Date().timeIntervalSince(stoppedAt) * 1000), 0) + VoxtLog.info( + "Transcription callback accepted after stop. sessionID=\(sessionID.uuidString), stopToResultMs=\(stopToResultMs), rawChars=\(rawText.count)", + verbose: true + ) + } let displayText = RecordingSessionSupport.normalizedTranscriptionDisplayText( rawText, transcriptionEngine: transcriptionEngine, diff --git a/Voxt/App/AppDelegate+SessionTextIO.swift b/Voxt/App/AppDelegate+SessionTextIO.swift index 4b156e7..c15db3a 100644 --- a/Voxt/App/AppDelegate+SessionTextIO.swift +++ b/Voxt/App/AppDelegate+SessionTextIO.swift @@ -119,7 +119,19 @@ extension AppDelegate { let sessionID = activeRecordingSessionID let sessionOutputMode = sessionOutputMode let userMainLanguage = userMainLanguage - guard shouldHandleCallbacks(for: sessionID) else { return } + let callbackDecision = Self.sessionCallbackHandlingDecision( + requestedSessionID: sessionID, + activeSessionID: activeRecordingSessionID, + isSessionCancellationRequested: isSessionCancellationRequested + ) + guard callbackDecision == .accept else { + VoxtLog.warning( + """ + Commit transcription abandoned after session invalidation. reason=\(callbackDecision.logDescription), sessionID=\(sessionID.uuidString), activeSessionID=\(activeRecordingSessionID.uuidString), outputMode=\(RecordingSessionSupport.outputLabel(for: sessionOutputMode)), chars=\(text.count), stopped=\(recordingStoppedAt != nil) + """ + ) + return + } VoxtLog.info("Commit transcription entered. characters=\(text.count)") diff --git a/Voxt/App/MeetingStartPlanner.swift b/Voxt/App/MeetingStartPlanner.swift index d4a1b02..df436ec 100644 --- a/Voxt/App/MeetingStartPlanner.swift +++ b/Voxt/App/MeetingStartPlanner.swift @@ -9,7 +9,7 @@ enum MeetingStartBlockReason: Equatable { var userMessage: String { switch self { case .dictationUnsupported: - return String(localized: "Meeting Notes currently supports Whisper, MLX Audio, and Remote ASR. 
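// Note (not part of the diff): in AppDelegate+SessionTextIO above, sessionID
// is read from activeRecordingSessionID immediately before the decision call,
// so requestedSessionID == activeSessionID by construction. .rejectStale is
// therefore unreachable at that call site; only .rejectCancelled can trip the
// guard.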
Direct Dictation is not available for meetings.") + return String(localized: "Meeting Notes currently supports MLX Audio and Remote ASR. Direct Dictation is not available for meetings.") case .recording(let reason): return reason.userMessage case .remoteASRUnavailable: @@ -62,12 +62,12 @@ enum MeetingStartPlanner { } case .whisperKit: switch RecordingStartPlanner.resolve( - selectedEngine: .whisperKit, + selectedEngine: .mlxAudio, mlxModelState: mlxModelState, whisperModelState: whisperModelState ) { case .start: - return .start(.whisperKit) + return .start(.mlxAudio) case .blocked(let reason): return .blocked(.recording(reason)) } diff --git a/Voxt/App/RecordingSessionSupport.swift b/Voxt/App/RecordingSessionSupport.swift index c89f2d3..f36fbed 100644 --- a/Voxt/App/RecordingSessionSupport.swift +++ b/Voxt/App/RecordingSessionSupport.swift @@ -118,7 +118,14 @@ enum RecordingSessionSupport { transcriptionEngine: TranscriptionEngine, remoteProvider: RemoteASRProvider ) -> TimeInterval { - guard transcriptionEngine == .remote else { return 8 } + switch transcriptionEngine { + case .whisperKit: + return 20 + case .mlxAudio, .dictation: + return 8 + case .remote: + break + } switch remoteProvider { case .openAIWhisper, .glmASR: return 60 diff --git a/Voxt/App/VoxtApp.swift b/Voxt/App/VoxtApp.swift index 1692e12..db50778 100644 --- a/Voxt/App/VoxtApp.swift +++ b/Voxt/App/VoxtApp.swift @@ -265,7 +265,7 @@ class AppDelegate: NSObject, NSApplicationDelegate { AppPreferenceKey.whisperTemperature: 0.0, AppPreferenceKey.whisperVADEnabled: true, AppPreferenceKey.whisperTimestampsEnabled: false, - AppPreferenceKey.whisperRealtimeEnabled: true, + AppPreferenceKey.whisperRealtimeEnabled: false, AppPreferenceKey.whisperKeepResidentLoaded: true, AppPreferenceKey.translationFallbackModelProvider: TranslationModelProvider.customLLM.rawValue, AppPreferenceKey.rewriteCustomLLMModelRepo: CustomLLMModelManager.defaultModelRepo, diff --git a/Voxt/Meeting/MeetingASRSupport.swift b/Voxt/Meeting/MeetingASRSupport.swift index 4599ae2..b06a9b4 100644 --- a/Voxt/Meeting/MeetingASRSupport.swift +++ b/Voxt/Meeting/MeetingASRSupport.swift @@ -52,10 +52,12 @@ enum MeetingASRSupport { switch transcriptionEngine { case .whisperKit: return MeetingASREngineContext( - engine: .whisperKit, - historyModelDescription: "\(whisperDisplayTitle(whisperCurrentModelID)) (\(whisperCurrentModelID))", - resolvedMode: .chunk(profile: whisperRealtimeEnabled ? .realtime : .quality), - needsModelInitialization: !whisperIsCurrentModelLoaded && modelStateNeedsInitialization(whisperModelState) + engine: .mlxAudio, + historyModelDescription: "\(mlxDisplayTitle(mlxCurrentModelRepo)) (\(mlxCurrentModelRepo))", + resolvedMode: .chunk( + profile: MLXModelManager.isRealtimeCapableModelRepo(mlxCurrentModelRepo) ? 
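// Sketch (illustrative, not part of the diff): the RecordingSessionSupport
// timeout helper above (its name falls outside the hunk context) now resolves
// per engine. A standalone restatement of the local-engine part of that table
// (hypothetical function, for review only):
func localStopFallbackTimeout(_ engine: TranscriptionEngine) -> TimeInterval? {
    switch engine {
    case .whisperKit:
        return 20   // Whisper finalization can outlive the old 8s window
    case .mlxAudio, .dictation:
        return 8
    case .remote:
        return nil  // resolved per provider; 60 for .openAIWhisper / .glmASR
    }
}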
.realtime : .quality + ), + needsModelInitialization: !mlxIsCurrentModelLoaded && modelStateNeedsInitialization(mlxModelState) ) case .mlxAudio: return MeetingASREngineContext( diff --git a/Voxt/Meeting/MeetingSessionCoordinator.swift b/Voxt/Meeting/MeetingSessionCoordinator.swift index f9c8fb8..e11fe07 100644 --- a/Voxt/Meeting/MeetingSessionCoordinator.swift +++ b/Voxt/Meeting/MeetingSessionCoordinator.swift @@ -726,7 +726,7 @@ final class MeetingSessionCoordinator { private func resolvedEngineContext() -> MeetingASREngineContext { let transcriptionEngine = resolvedTranscriptionEngine() let remoteSelection = resolvedRemoteASRSelection() - let whisperRealtimeEnabled = UserDefaults.standard.object(forKey: AppPreferenceKey.whisperRealtimeEnabled) as? Bool ?? true + let whisperRealtimeEnabled = UserDefaults.standard.object(forKey: AppPreferenceKey.whisperRealtimeEnabled) as? Bool ?? false return MeetingASRSupport.resolveContext( transcriptionEngine: transcriptionEngine, diff --git a/Voxt/Settings/AppPreferenceKey.swift b/Voxt/Settings/AppPreferenceKey.swift index 97c36ad..7b76239 100644 --- a/Voxt/Settings/AppPreferenceKey.swift +++ b/Voxt/Settings/AppPreferenceKey.swift @@ -105,8 +105,8 @@ enum AppPreferenceKey { static let dictionarySuggestionFilterSettings = "dictionarySuggestionFilterSettings" static let dictionarySuggestionIngestModelOptionID = "dictionarySuggestionIngestModelOptionID" static let autoCheckForUpdates = "autoCheckForUpdates" - static let hotkeyDebugLoggingEnabled = "hotkeyDebugLoggingEnabled" - static let llmDebugLoggingEnabled = "llmDebugLoggingEnabled" + nonisolated static let hotkeyDebugLoggingEnabled = "hotkeyDebugLoggingEnabled" + nonisolated static let llmDebugLoggingEnabled = "llmDebugLoggingEnabled" static let llmDebugCustomPrompt = "llmDebugCustomPrompt" static let llmDebugPresetPromptOverrides = "llmDebugPresetPromptOverrides" static let useSystemProxy = "useSystemProxy" diff --git a/Voxt/Settings/FeatureModelCatalogBuilder.swift b/Voxt/Settings/FeatureModelCatalogBuilder.swift index cf2e2c1..be78a16 100644 --- a/Voxt/Settings/FeatureModelCatalogBuilder.swift +++ b/Voxt/Settings/FeatureModelCatalogBuilder.swift @@ -18,7 +18,7 @@ struct FeatureModelCatalogBuilder { func entries(for sheet: FeatureModelSelectorSheet) -> [FeatureModelSelectorEntry] { switch sheet { case .transcriptionASR, .translationASR, .rewriteASR, .meetingASR: - return asrEntries() + return asrEntries(for: sheet) case .transcriptionLLM, .transcriptionNoteTitle, .rewriteLLM, .meetingSummary: return llmEntries(includeAppleIntelligence: true) case .translationModel: @@ -78,7 +78,7 @@ struct FeatureModelCatalogBuilder { } } - private func asrEntries() -> [FeatureModelSelectorEntry] { + private func asrEntries(for sheet: FeatureModelSelectorSheet) -> [FeatureModelSelectorEntry] { var entries = [FeatureModelSelectorEntry]() entries.append( FeatureModelSelectorEntry( @@ -134,7 +134,8 @@ struct FeatureModelCatalogBuilder { ) }) - entries.append(contentsOf: WhisperKitModelManager.availableModels.map { model in + if sheet != .meetingASR { + entries.append(contentsOf: WhisperKitModelManager.availableModels.map { model in let selectionID = FeatureModelSelectionID.whisper(model.id) let isInstalled = whisperModelManager.isModelDownloaded(id: model.id) return FeatureModelSelectorEntry( @@ -164,7 +165,8 @@ struct FeatureModelCatalogBuilder { isSelectable: isInstalled, disabledReason: isInstalled ? 
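// Note (not part of the diff): FeatureModelCatalogBuilder below threads the
// sheet into asrEntries(for:) solely so the WhisperKit entries block can be
// skipped when sheet == .meetingASR, matching the meeting-side remapping in
// FeatureSettingsStore: meetings no longer offer or persist a Whisper ASR
// selection.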
nil : localized("Install this model in Model settings first.") ) - }) + }) + } let remoteConfigurations = RemoteModelConfigurationStore.loadConfigurations( from: remoteASRProviderConfigurationsRaw, diff --git a/Voxt/Settings/FeatureSettingsStore.swift b/Voxt/Settings/FeatureSettingsStore.swift index df8bb0d..36d4a2e 100644 --- a/Voxt/Settings/FeatureSettingsStore.swift +++ b/Voxt/Settings/FeatureSettingsStore.swift @@ -88,7 +88,10 @@ enum FeatureSettingsStore { ), meeting: MeetingFeatureSettings( enabled: defaults.object(forKey: AppPreferenceKey.meetingNotesBetaEnabled) as? Bool ?? false, - asrSelectionID: transcriptionASR, + asrSelectionID: supportedMeetingASRSelection( + transcriptionASR, + defaults: defaults + ), summaryModelSelectionID: meetingSummary, summaryPrompt: AppPromptDefaults.resolvedStoredText( defaults.string(forKey: AppPreferenceKey.meetingSummaryPromptTemplate), @@ -135,9 +138,10 @@ enum FeatureSettingsStore { from settings: FeatureSettings, defaults: UserDefaults = .standard ) { - syncLegacyMeeting(settings.meeting, defaults: defaults) + let sanitizedMeeting = sanitizedMeetingSettings(settings.meeting, defaults: defaults) + syncLegacyMeeting(sanitizedMeeting, defaults: defaults) syncLegacyTranslation(settings.translation, defaults: defaults) - syncLegacyASRSelection(settings.meeting.asrSelectionID, defaults: defaults) + syncLegacyASRSelection(sanitizedMeeting.asrSelectionID, defaults: defaults) } private static func loadRaw(defaults: UserDefaults) -> String? { @@ -289,7 +293,10 @@ enum FeatureSettingsStore { ), meeting: MeetingFeatureSettings( enabled: settings.meeting.enabled, - asrSelectionID: settings.meeting.asrSelectionID.asrSelection == nil ? fallback.meeting.asrSelectionID : settings.meeting.asrSelectionID, + asrSelectionID: supportedMeetingASRSelection( + settings.meeting.asrSelectionID.asrSelection == nil ? fallback.meeting.asrSelectionID : settings.meeting.asrSelectionID, + defaults: defaults + ), summaryModelSelectionID: settings.meeting.summaryModelSelectionID.textSelection == nil ? fallback.meeting.summaryModelSelectionID : settings.meeting.summaryModelSelectionID, summaryPrompt: AppPromptDefaults.resolvedStoredText( sanitizedPrompt(settings.meeting.summaryPrompt), @@ -416,6 +423,39 @@ enum FeatureSettingsStore { ) } + private static func sanitizedMeetingSettings( + _ settings: MeetingFeatureSettings, + defaults: UserDefaults + ) -> MeetingFeatureSettings { + MeetingFeatureSettings( + enabled: settings.enabled, + asrSelectionID: supportedMeetingASRSelection(settings.asrSelectionID, defaults: defaults), + summaryModelSelectionID: settings.summaryModelSelectionID, + summaryPrompt: settings.summaryPrompt, + summaryAutoGenerate: settings.summaryAutoGenerate, + realtimeTranslateEnabled: settings.realtimeTranslateEnabled, + realtimeTargetLanguageRawValue: settings.realtimeTargetLanguageRawValue, + showOverlayInScreenShare: settings.showOverlayInScreenShare + ) + } + + private static func supportedMeetingASRSelection( + _ selectionID: FeatureModelSelectionID, + defaults: UserDefaults + ) -> FeatureModelSelectionID { + let fallbackRepo = MLXModelManager.canonicalModelRepo( + defaults.string(forKey: AppPreferenceKey.mlxModelRepo) ?? 
MLXModelManager.defaultModelRepo + ) + switch selectionID.asrSelection { + case .whisper: + return .mlx(fallbackRepo) + case .none: + return .mlx(fallbackRepo) + case .dictation, .mlx, .remote: + return selectionID + } + } + private static func legacyASRSelection(defaults: UserDefaults) -> FeatureModelSelectionID { let engine = TranscriptionEngine(rawValue: defaults.string(forKey: AppPreferenceKey.transcriptionEngine) ?? "") ?? .mlxAudio switch engine { diff --git a/Voxt/Settings/ModelSettingsView+Lifecycle.swift b/Voxt/Settings/ModelSettingsView+Lifecycle.swift index 6c7cf80..73b35c6 100644 --- a/Voxt/Settings/ModelSettingsView+Lifecycle.swift +++ b/Voxt/Settings/ModelSettingsView+Lifecycle.swift @@ -13,7 +13,7 @@ extension ModelSettingsView { } whisperModelManager.updateModel(id: canonicalWhisperModelID) if UserDefaults.standard.object(forKey: AppPreferenceKey.whisperRealtimeEnabled) == nil { - whisperRealtimeEnabled = true + whisperRealtimeEnabled = false } if UserDefaults.standard.object(forKey: AppPreferenceKey.whisperKeepResidentLoaded) == nil { whisperKeepResidentLoaded = true diff --git a/Voxt/Settings/ModelSettingsView+Sections.swift b/Voxt/Settings/ModelSettingsView+Sections.swift index 5f53cd1..953b269 100644 --- a/Voxt/Settings/ModelSettingsView+Sections.swift +++ b/Voxt/Settings/ModelSettingsView+Sections.swift @@ -528,7 +528,7 @@ private struct WhisperASRConfigurationSheetView: View { .toggleStyle(.switch) Toggle("Enable Timestamps", isOn: $whisperTimestampsEnabled) .toggleStyle(.switch) - Toggle("Realtime", isOn: $whisperRealtimeEnabled) + Toggle("Live Realtime (Experimental)", isOn: $whisperRealtimeEnabled) .toggleStyle(.switch) } @@ -580,7 +580,7 @@ private struct WhisperASRConfigurationSheetView: View { ) } - Text("These settings apply to Whisper transcription sessions. Standard ASR always uses transcribe; Whisper translate is only used by the translation hotkey when Whisper translation is selected.") + Text("These settings apply to Whisper transcription sessions. Live Realtime (Experimental) streams partial text while you speak and does a final correction after stop. Turn it off to use the quality-first non-live path. 
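// Sketch (illustrative, not part of the diff): ModelSettingsView+Lifecycle
// below seeds the flipped default only when the key has never been written,
// so an explicit earlier opt-in survives the update. The read side uses the
// matching idiom:
let realtime = UserDefaults.standard
    .object(forKey: AppPreferenceKey.whisperRealtimeEnabled) as? Bool ?? false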
Whisper translate is only used when Whisper translation is selected.") .font(.caption) .foregroundStyle(.secondary) } @@ -597,7 +597,7 @@ private struct WhisperASRConfigurationSheetView: View { whisperTemperature = 0 whisperVADEnabled = true whisperTimestampsEnabled = false - whisperRealtimeEnabled = true + whisperRealtimeEnabled = false whisperKeepResidentLoaded = true } .buttonStyle(SettingsPillButtonStyle()) diff --git a/Voxt/Settings/ModelSettingsView.swift b/Voxt/Settings/ModelSettingsView.swift index 632b056..5f5cd4b 100644 --- a/Voxt/Settings/ModelSettingsView.swift +++ b/Voxt/Settings/ModelSettingsView.swift @@ -71,7 +71,7 @@ struct ModelSettingsView: View { @AppStorage(AppPreferenceKey.whisperTemperature) var whisperTemperature = 0.0 @AppStorage(AppPreferenceKey.whisperVADEnabled) var whisperVADEnabled = true @AppStorage(AppPreferenceKey.whisperTimestampsEnabled) var whisperTimestampsEnabled = false - @AppStorage(AppPreferenceKey.whisperRealtimeEnabled) var whisperRealtimeEnabled = true + @AppStorage(AppPreferenceKey.whisperRealtimeEnabled) var whisperRealtimeEnabled = false @AppStorage(AppPreferenceKey.whisperKeepResidentLoaded) var whisperKeepResidentLoaded = true @AppStorage(AppPreferenceKey.whisperLocalASRTuningSettings) var whisperLocalASRTuningSettingsRaw = WhisperLocalTuningSettingsStore.defaultStoredValue() @AppStorage(AppPreferenceKey.customLLMModelRepo) var customLLMRepo = CustomLLMModelManager.defaultModelRepo diff --git a/Voxt/Settings/OnboardingComponents.swift b/Voxt/Settings/OnboardingComponents.swift index 41c4863..50eef40 100644 --- a/Voxt/Settings/OnboardingComponents.swift +++ b/Voxt/Settings/OnboardingComponents.swift @@ -169,7 +169,7 @@ struct LocalModelPickerCard: View { .foregroundStyle(.green) Button(openLabel, action: onOpen) .buttonStyle(SettingsPillButtonStyle()) - if let onUninstall { + if onUninstall != nil { Button(isRunningUninstall ? localized("Uninstalling…") : localized("Uninstall"), role: .destructive) { isShowingUninstallConfirmation = true } diff --git a/Voxt/Support/ConfigurationTransferManager.swift b/Voxt/Support/ConfigurationTransferManager.swift index fa237b3..a70a27c 100644 --- a/Voxt/Support/ConfigurationTransferManager.swift +++ b/Voxt/Support/ConfigurationTransferManager.swift @@ -472,7 +472,7 @@ enum ConfigurationTransferManager { whisperTemperature = try container.decodeIfPresent(Double.self, forKey: .whisperTemperature) ?? 0.0 whisperVADEnabled = try container.decodeIfPresent(Bool.self, forKey: .whisperVADEnabled) ?? true whisperTimestampsEnabled = try container.decodeIfPresent(Bool.self, forKey: .whisperTimestampsEnabled) ?? false - whisperRealtimeEnabled = try container.decodeIfPresent(Bool.self, forKey: .whisperRealtimeEnabled) ?? true + whisperRealtimeEnabled = try container.decodeIfPresent(Bool.self, forKey: .whisperRealtimeEnabled) ?? false whisperKeepResidentLoaded = try container.decodeIfPresent(Bool.self, forKey: .whisperKeepResidentLoaded) ?? true customLLMModelRepo = try container.decode(String.self, forKey: .customLLMModelRepo) translationCustomLLMModelRepo = try container.decode(String.self, forKey: .translationCustomLLMModelRepo) @@ -905,7 +905,7 @@ enum ConfigurationTransferManager { whisperTemperature: defaults.object(forKey: AppPreferenceKey.whisperTemperature) as? Double ?? 0.0, whisperVADEnabled: defaults.object(forKey: AppPreferenceKey.whisperVADEnabled) as? Bool ?? true, whisperTimestampsEnabled: defaults.object(forKey: AppPreferenceKey.whisperTimestampsEnabled) as? Bool ?? 
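// Note (not part of the diff): in OnboardingComponents below,
// `if onUninstall != nil` replaces `if let onUninstall` because the bound
// closure was never used inside the branch; the comparison keeps the presence
// check without the unused-binding warning.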
false, - whisperRealtimeEnabled: defaults.object(forKey: AppPreferenceKey.whisperRealtimeEnabled) as? Bool ?? true, + whisperRealtimeEnabled: defaults.object(forKey: AppPreferenceKey.whisperRealtimeEnabled) as? Bool ?? false, whisperKeepResidentLoaded: defaults.object(forKey: AppPreferenceKey.whisperKeepResidentLoaded) as? Bool ?? true, customLLMModelRepo: defaults.string(forKey: AppPreferenceKey.customLLMModelRepo) ?? CustomLLMModelManager.defaultModelRepo, translationCustomLLMModelRepo: defaults.string(forKey: AppPreferenceKey.translationCustomLLMModelRepo) ?? CustomLLMModelManager.defaultModelRepo, diff --git a/Voxt/Support/DictionaryMatchingSupport.swift b/Voxt/Support/DictionaryMatchingSupport.swift index 0ad2900..6812242 100644 --- a/Voxt/Support/DictionaryMatchingSupport.swift +++ b/Voxt/Support/DictionaryMatchingSupport.swift @@ -13,15 +13,17 @@ private struct DictionaryScriptProfile { var containsKana = false var containsHangul = false - var containsCJKLike: Bool { + nonisolated init() {} + + nonisolated var containsCJKLike: Bool { containsHan || containsKana || containsHangul } - var isLatinLike: Bool { + nonisolated var isLatinLike: Bool { (containsLatin || containsDigit) && !containsCJKLike } - var isMixedScript: Bool { + nonisolated var isMixedScript: Bool { containsCJKLike && (containsLatin || containsDigit) } } @@ -36,7 +38,7 @@ private enum DictionaryMatchVariantSource { case replacementTerm case observedVariant - var source: DictionaryMatchSource { + nonisolated var source: DictionaryMatchSource { switch self { case .term: return .term @@ -47,8 +49,13 @@ private enum DictionaryMatchVariantSource { } } - var allowsFuzzyMatch: Bool { - self != .replacementTerm + nonisolated var allowsFuzzyMatch: Bool { + switch self { + case .replacementTerm: + return false + case .term, .observedVariant: + return true + } } } @@ -104,7 +111,7 @@ nonisolated func dictionaryIsHangul(_ scalar: UnicodeScalar) -> Bool { } } -private func dictionaryScriptProfile(for text: String) -> DictionaryScriptProfile { +private nonisolated func dictionaryScriptProfile(for text: String) -> DictionaryScriptProfile { var profile = DictionaryScriptProfile() for scalar in text.unicodeScalars { if CharacterSet.decimalDigits.contains(scalar) { @@ -129,7 +136,7 @@ private func dictionaryScriptProfile(for text: String) -> DictionaryScriptProfil return profile } -private func dictionaryNormalizedMapping(for text: String) -> DictionaryNormalizedMapping { +private nonisolated func dictionaryNormalizedMapping(for text: String) -> DictionaryNormalizedMapping { var output = "" var sourceRanges: [NSRange] = [] var previousWasWhitespace = false @@ -175,7 +182,7 @@ private func dictionaryNormalizedMapping(for text: String) -> DictionaryNormaliz return DictionaryNormalizedMapping(text: output, sourceRanges: sourceRanges) } -private func dictionaryExactNormalizedMatchRanges( +private nonisolated func dictionaryExactNormalizedMatchRanges( in text: String, normalizedNeedle: String, requireTokenBoundaries: Bool @@ -231,12 +238,12 @@ struct DictionaryMatcher { let entries: [DictionaryEntry] let blockedGlobalMatchKeys: Set - func promptContext(for text: String) -> DictionaryPromptContext { + nonisolated func promptContext(for text: String) -> DictionaryPromptContext { let candidates = recallCandidates(in: text) return DictionaryPromptContext(entries: entries, candidates: candidates) } - func recallCandidates(in text: String) -> [DictionaryMatchCandidate] { + nonisolated func recallCandidates(in text: String) -> 
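// Note (not part of the diff): allowsFuzzyMatch below trades
// `self != .replacementTerm` for an exhaustive switch. A future
// DictionaryMatchVariantSource case now fails to compile until its
// fuzzy-match policy is stated explicitly, instead of silently being allowed
// to match fuzzily.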
[DictionaryMatchCandidate] { guard !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { return [] } let rawTokens = tokenize(text) @@ -283,7 +290,7 @@ struct DictionaryMatcher { } } - func applyCorrections(to text: String, automaticReplacementEnabled: Bool) -> DictionaryCorrectionResult { + nonisolated func applyCorrections(to text: String, automaticReplacementEnabled: Bool) -> DictionaryCorrectionResult { let candidates = recallCandidates(in: text) let replacementCandidates = candidates .filter { shouldApplyReplacement(for: $0, automaticReplacementEnabled: automaticReplacementEnabled) } @@ -314,7 +321,7 @@ struct DictionaryMatcher { ) } - private func shouldApplyReplacement( + private nonisolated func shouldApplyReplacement( for candidate: DictionaryMatchCandidate, automaticReplacementEnabled: Bool ) -> Bool { @@ -325,7 +332,7 @@ struct DictionaryMatcher { return candidate.allowsAutomaticReplacement } - private func replacementSortComparator( + private nonisolated func replacementSortComparator( lhs: DictionaryMatchCandidate, rhs: DictionaryMatchCandidate ) -> Bool { @@ -350,7 +357,7 @@ struct DictionaryMatcher { return lhs.score > rhs.score } - private func replacementPriority(for candidate: DictionaryMatchCandidate) -> Int { + private nonisolated func replacementPriority(for candidate: DictionaryMatchCandidate) -> Int { if candidate.source == .replacementTerm { return 4 } @@ -367,7 +374,7 @@ struct DictionaryMatcher { } } - private func matchVariants(for entry: DictionaryEntry) -> [DictionaryMatchVariant] { + private nonisolated func matchVariants(for entry: DictionaryEntry) -> [DictionaryMatchVariant] { var variants = [ DictionaryMatchVariant( text: entry.term, @@ -399,12 +406,12 @@ struct DictionaryMatcher { return variants } - private func shouldBlock(variant: DictionaryMatchVariant, entry: DictionaryEntry) -> Bool { + private nonisolated func shouldBlock(variant: DictionaryMatchVariant, entry: DictionaryEntry) -> Bool { guard entry.groupID == nil else { return false } return blockedGlobalMatchKeys.contains(variant.normalizedText) } - private func exactCandidates( + private nonisolated func exactCandidates( for entry: DictionaryEntry, variant: DictionaryMatchVariant, text: String @@ -435,7 +442,7 @@ struct DictionaryMatcher { } } - private func exactReason( + private nonisolated func exactReason( for entry: DictionaryEntry, variant: DictionaryMatchVariant, matchedText: String @@ -448,7 +455,7 @@ struct DictionaryMatcher { } } - private func bestFuzzyCandidate( + private nonisolated func bestFuzzyCandidate( for entry: DictionaryEntry, variant: DictionaryMatchVariant, text: String, @@ -514,7 +521,7 @@ struct DictionaryMatcher { return best } - private func tokenize(_ text: String) -> [DictionaryToken] { + private nonisolated func tokenize(_ text: String) -> [DictionaryToken] { var tokens: [DictionaryToken] = [] var currentStart: String.Index? var current = "" @@ -560,26 +567,26 @@ struct DictionaryMatcher { return tokens } - private func allowsFuzzyMatch(for profile: DictionaryScriptProfile) -> Bool { + private nonisolated func allowsFuzzyMatch(for profile: DictionaryScriptProfile) -> Bool { profile.isLatinLike || profile.isMixedScript } - private func minimumFuzzyLength(for profile: DictionaryScriptProfile) -> Int { + private nonisolated func minimumFuzzyLength(for profile: DictionaryScriptProfile) -> Int { profile.isMixedScript ? 
5 : 4 } - private func minimumFuzzyScore(for profile: DictionaryScriptProfile) -> Double { + private nonisolated func minimumFuzzyScore(for profile: DictionaryScriptProfile) -> Double { profile.isMixedScript ? 0.96 : 0.90 } - private func fuzzyThreshold(for profile: DictionaryScriptProfile, maxLength: Int) -> Int { + private nonisolated func fuzzyThreshold(for profile: DictionaryScriptProfile, maxLength: Int) -> Int { if profile.isMixedScript { return maxLength >= 10 ? 2 : 1 } return max(1, min(2, maxLength / 6)) } - private func levenshteinDistance(lhs: String, rhs: String) -> Int { + private nonisolated func levenshteinDistance(lhs: String, rhs: String) -> Int { let lhsChars = Array(lhs) let rhsChars = Array(rhs) guard !lhsChars.isEmpty else { return rhsChars.count } diff --git a/Voxt/Support/DictionaryStore.swift b/Voxt/Support/DictionaryStore.swift index 3cb4eb3..113ed29 100644 --- a/Voxt/Support/DictionaryStore.swift +++ b/Voxt/Support/DictionaryStore.swift @@ -199,13 +199,13 @@ struct DictionaryMatchCandidate: Identifiable, Hashable { let source: DictionaryMatchSource let matchRange: NSRange? - var id: String { + nonisolated var id: String { let location = matchRange?.location ?? -1 let length = matchRange?.length ?? 0 return "\(entryID.uuidString)|\(normalizedMatchedText)|\(reason.rawValue)|\(source.rawValue)|\(location)|\(length)" } - var allowsAutomaticReplacement: Bool { + nonisolated var allowsAutomaticReplacement: Bool { if source == .replacementTerm { return true } @@ -222,7 +222,7 @@ struct DictionaryMatchCandidate: Identifiable, Hashable { } } - var shouldPersistObservedVariant: Bool { + nonisolated var shouldPersistObservedVariant: Bool { source != .replacementTerm && reason != .exactTerm } } diff --git a/Voxt/Support/HistoryAudioArchiveSupport.swift b/Voxt/Support/HistoryAudioArchiveSupport.swift index 23472c1..7e6e2f9 100644 --- a/Voxt/Support/HistoryAudioArchiveSupport.swift +++ b/Voxt/Support/HistoryAudioArchiveSupport.swift @@ -1,10 +1,10 @@ import Foundation enum HistoryAudioArchiveSupport { - static let targetSampleRate: Double = 16_000 - static let rewriteJoinGapSeconds: Double = 0.3 + nonisolated static let targetSampleRate: Double = 16_000 + nonisolated static let rewriteJoinGapSeconds: Double = 0.3 - static func exportWAV( + nonisolated static func exportWAV( samples: [Float], sampleRate: Double, to destinationURL: URL @@ -20,7 +20,7 @@ enum HistoryAudioArchiveSupport { return true } - static func mergedRewriteArchive( + nonisolated static func mergedRewriteArchive( existingArchiveURL: URL?, appendedArchiveURL: URL ) throws -> URL { @@ -49,7 +49,7 @@ enum HistoryAudioArchiveSupport { return tempURL } - static func readWAVSamples(from fileURL: URL) throws -> [Float] { + nonisolated static func readWAVSamples(from fileURL: URL) throws -> [Float] { let data = try Data(contentsOf: fileURL) guard data.count >= 44 else { throw NSError( @@ -112,18 +112,18 @@ enum HistoryAudioArchiveSupport { } } - static func silenceSamples(durationSeconds: Double) -> [Float] { + nonisolated static func silenceSamples(durationSeconds: Double) -> [Float] { let count = max(Int((durationSeconds * targetSampleRate).rounded()), 0) return [Float](repeating: 0, count: count) } - static func temporaryArchiveURL(prefix: String) -> URL { + nonisolated static func temporaryArchiveURL(prefix: String) -> URL { FileManager.default.temporaryDirectory .appendingPathComponent("\(prefix)-\(UUID().uuidString)") .appendingPathExtension("wav") } - private static func wavData(for samples: [Float], 
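// Sketch (illustrative, not part of the diff): for the 16 kHz mono 16-bit
// target used by HistoryAudioArchiveSupport above, the WAV byte rate works
// out to:
let archiveByteRate = UInt32(16_000) * UInt32(1) * UInt32(16 / 8) // 32_000 B/s
// so one minute of stashed session audio is roughly 1.9 MB, plus the 44-byte
// header that readWAVSamples requires as a minimum.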
sampleRate: Int) -> Data { + private nonisolated static func wavData(for samples: [Float], sampleRate: Int) -> Data { let channelCount: UInt16 = 1 let bitsPerSample: UInt16 = 16 let byteRate = UInt32(sampleRate) * UInt32(channelCount) * UInt32(bitsPerSample / 8) @@ -157,12 +157,12 @@ enum HistoryAudioArchiveSupport { return data } - private static func bytes(of value: T) -> Data { + private nonisolated static func bytes(of value: T) -> Data { var mutableValue = value return withUnsafeBytes(of: &mutableValue) { Data($0) } } - private static func resample(samples: [Float], from inputRate: Double, to outputRate: Double) -> [Float] { + private nonisolated static func resample(samples: [Float], from inputRate: Double, to outputRate: Double) -> [Float] { guard !samples.isEmpty, inputRate > 0, outputRate > 0 else { return samples } if abs(inputRate - outputRate) <= 1 { return samples @@ -185,13 +185,13 @@ enum HistoryAudioArchiveSupport { return output } - private static func littleEndianUInt16(from data: Data, at offset: Int) -> UInt16 { + private nonisolated static func littleEndianUInt16(from data: Data, at offset: Int) -> UInt16 { data.subdata(in: offset..<(offset + 2)).withUnsafeBytes { rawBuffer in rawBuffer.load(as: UInt16.self).littleEndian } } - private static func littleEndianUInt32(from data: Data, at offset: Int) -> UInt32 { + private nonisolated static func littleEndianUInt32(from data: Data, at offset: Int) -> UInt32 { data.subdata(in: offset..<(offset + 4)).withUnsafeBytes { rawBuffer in rawBuffer.load(as: UInt32.self).littleEndian } diff --git a/Voxt/Support/RemoteModelConfiguration.swift b/Voxt/Support/RemoteModelConfiguration.swift index bb66559..d9f2db3 100644 --- a/Voxt/Support/RemoteModelConfiguration.swift +++ b/Voxt/Support/RemoteModelConfiguration.swift @@ -62,6 +62,9 @@ enum RemoteASRProvider: String, CaseIterable, Identifiable { RemoteModelOption(id: "qwen3-asr-flash-realtime", title: "Qwen3 ASR Flash Realtime"), RemoteModelOption(id: "qwen3-asr-flash-realtime-2026-02-10", title: "Qwen3 ASR Flash Realtime (2026-02-10)"), RemoteModelOption(id: "qwen3-asr-flash-realtime-2025-10-27", title: "Qwen3 ASR Flash Realtime (2025-10-27)"), + RemoteModelOption(id: "qwen3.5-omni-flash-realtime", title: "Qwen3.5 Omni Flash Realtime"), + RemoteModelOption(id: "qwen3.5-omni-plus-realtime", title: "Qwen3.5 Omni Plus Realtime"), + RemoteModelOption(id: "qwen-omni-turbo-realtime", title: "Qwen Omni Turbo Realtime"), RemoteModelOption(id: "fun-asr-realtime", title: "Fun ASR Realtime"), RemoteModelOption(id: "fun-asr-realtime-2026-02-28", title: "Fun ASR Realtime (2026-02-28)"), RemoteModelOption(id: "fun-asr-realtime-2025-11-07", title: "Fun ASR Realtime (2025-11-07)"), diff --git a/Voxt/Support/RemoteProviderConnectivityTester.swift b/Voxt/Support/RemoteProviderConnectivityTester.swift index 5b651c0..ead163a 100644 --- a/Voxt/Support/RemoteProviderConnectivityTester.swift +++ b/Voxt/Support/RemoteProviderConnectivityTester.swift @@ -71,14 +71,15 @@ struct RemoteProviderConnectivityTester { throw NSError(domain: "Voxt.Settings", code: -5, userInfo: [NSLocalizedDescriptionKey: AppLocalization.localizedString("Aliyun Bailian API Key is required for testing.")]) } let model = configuration.model.isEmpty ? 
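// Note (not part of the diff): littleEndianUInt16/32 above copy via subdata
// before load(as:) on purpose. load(as:) requires suitably aligned memory,
// and 2-/4-byte header fields in a WAV blob are not guaranteed to sit at
// aligned offsets, so the small copy avoids an alignment trap that a direct
// load from the original buffer would risk.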
"fun-asr-realtime" : configuration.model - if isAliyunQwenRealtimeModel(model) { + if let kind = RemoteASREndpointSupport.aliyunQwenRealtimeSessionKind(for: model) { let endpoint = RemoteProviderConnectivityTestEndpoints.resolvedAliyunASRQwenRealtimeWebSocketEndpoint( endpoint: configuration.endpoint, model: model ) return try await testAliyunASRQwenRealtimeWebSocketReachability( endpoint: endpoint, - apiKey: configuration.apiKey + apiKey: configuration.apiKey, + kind: kind ) } let endpoint = RemoteProviderConnectivityTestEndpoints.resolvedAliyunASRRealtimeWebSocketEndpoint( @@ -318,7 +319,8 @@ struct RemoteProviderConnectivityTester { private func testAliyunASRQwenRealtimeWebSocketReachability( endpoint: String, - apiKey: String + apiKey: String, + kind: AliyunQwenRealtimeSessionKind ) async throws -> String { guard let url = URL(string: endpoint) else { throw NSError(domain: "Voxt.Settings", code: -53, userInfo: [NSLocalizedDescriptionKey: AppLocalization.localizedString("Invalid WebSocket endpoint URL.")]) @@ -337,23 +339,10 @@ struct RemoteProviderConnectivityTester { ws.cancel(with: .goingAway, reason: nil) } - let updatePayload: [String: Any] = [ - "event_id": UUID().uuidString.lowercased(), - "type": "session.update", - "session": [ - "modalities": ["text"], - "input_audio_format": "pcm", - "sample_rate": 16000, - "input_audio_transcription": [ - "language": "zh" - ], - "turn_detection": [ - "type": "server_vad", - "threshold": 0.0, - "silence_duration_ms": 400 - ] - ] - ] + let updatePayload = AliyunQwenRealtimePayloadSupport.sessionUpdatePayload( + kind: kind, + hintPayload: .init(language: "zh", languageHints: ["zh"]) + ) let finishPayload: [String: Any] = [ "event_id": UUID().uuidString.lowercased(), "type": "session.finish" @@ -1164,9 +1153,4 @@ struct RemoteProviderConnectivityTester { } } - private func isAliyunQwenRealtimeModel(_ model: String) -> Bool { - model.trimmingCharacters(in: .whitespacesAndNewlines) - .lowercased() - .hasPrefix("qwen3-asr-flash-realtime") - } } diff --git a/Voxt/Support/VoxtLog.swift b/Voxt/Support/VoxtLog.swift index 75e69a7..b9defdf 100644 --- a/Voxt/Support/VoxtLog.swift +++ b/Voxt/Support/VoxtLog.swift @@ -12,28 +12,28 @@ enum VoxtLog { case error = "ERROR" } - static var verboseEnabled = false + nonisolated(unsafe) static var verboseEnabled = false - static func info(_ message: @autoclosure () -> String, verbose: Bool = false) { + nonisolated static func info(_ message: @autoclosure () -> String, verbose: Bool = false) { log(message(), level: .info, verbose: verbose) } - static func hotkey(_ message: @autoclosure () -> String) { + nonisolated static func hotkey(_ message: @autoclosure () -> String) { guard UserDefaults.standard.bool(forKey: AppPreferenceKey.hotkeyDebugLoggingEnabled) else { return } log(message(), level: .info) } - static func llm(_ message: @autoclosure () -> String) { + nonisolated static func llm(_ message: @autoclosure () -> String) { guard UserDefaults.standard.bool(forKey: AppPreferenceKey.llmDebugLoggingEnabled) else { return } log(message(), level: .info) } - static func model(_ message: @autoclosure () -> String) { + nonisolated static func model(_ message: @autoclosure () -> String) { guard UserDefaults.standard.bool(forKey: AppPreferenceKey.llmDebugLoggingEnabled) else { return } log(message(), level: .info) } - static func llmPreview(_ text: String, limit: Int = 1200) -> String { + nonisolated static func llmPreview(_ text: String, limit: Int = 1200) -> String { let normalized = text .replacingOccurrences(of: 
"\r\n", with: "\n") .trimmingCharacters(in: .whitespacesAndNewlines) @@ -43,15 +43,15 @@ enum VoxtLog { return "\(normalized[.. String) { + nonisolated static func warning(_ message: @autoclosure () -> String) { log(message(), level: .warning) } - static func error(_ message: @autoclosure () -> String) { + nonisolated static func error(_ message: @autoclosure () -> String) { log(message(), level: .error) } - static func latestLogUpdateDate() -> Date? { + nonisolated static func latestLogUpdateDate() -> Date? { lock.lock() defer { lock.unlock() } do { @@ -62,7 +62,7 @@ enum VoxtLog { } } - static func latestLogExportPayload(limit: Int = 2000) -> ExportPayload { + nonisolated static func latestLogExportPayload(limit: Int = 2000) -> ExportPayload { lock.lock() defer { lock.unlock() } loadCacheIfNeeded() @@ -77,24 +77,24 @@ enum VoxtLog { return ExportPayload(filename: filename, content: content) } - static func exportLatestLogs(limit: Int = 2000) throws -> URL { + nonisolated static func exportLatestLogs(limit: Int = 2000) throws -> URL { let payload = latestLogExportPayload(limit: limit) let url = FileManager.default.temporaryDirectory.appendingPathComponent(payload.filename) try payload.content.write(to: url, atomically: true, encoding: .utf8) return url } - private static let lock = NSLock() - private static let maxStoredLines = 10000 - private static var didLoadCache = false - private static var logLines: [String] = [] - private static let lineDateFormatter: ISO8601DateFormatter = { + private nonisolated static let lock = NSLock() + private nonisolated static let maxStoredLines = 10000 + private nonisolated(unsafe) static var didLoadCache = false + private nonisolated(unsafe) static var logLines: [String] = [] + private nonisolated(unsafe) static let lineDateFormatter: ISO8601DateFormatter = { let formatter = ISO8601DateFormatter() formatter.formatOptions = [.withInternetDateTime, .withFractionalSeconds] return formatter }() - private static var logFileURL: URL { + private nonisolated static var logFileURL: URL { let supportDirectory = try? FileManager.default.url( for: .applicationSupportDirectory, in: .userDomainMask, @@ -108,19 +108,19 @@ enum VoxtLog { .appendingPathComponent("voxt.log") } - private static func log(_ message: String, level: Level, verbose: Bool = false) { + private nonisolated static func log(_ message: String, level: Level, verbose: Bool = false) { guard !verbose || verboseEnabled else { return } let line = formatLine(message: message, level: level) print(line) persist(line: line) } - private static func formatLine(message: String, level: Level) -> String { + private nonisolated static func formatLine(message: String, level: Level) -> String { let dateText = lineDateFormatter.string(from: Date()) return "[Voxt] \(dateText) [\(level.rawValue)] \(message)" } - private static func persist(line: String) { + private nonisolated static func persist(line: String) { lock.lock() defer { lock.unlock() } loadCacheIfNeeded() @@ -129,7 +129,7 @@ enum VoxtLog { writeAllLines() } - private static func loadCacheIfNeeded() { + private nonisolated static func loadCacheIfNeeded() { guard !didLoadCache else { return } didLoadCache = true guard let content = try? 
String(contentsOf: logFileURL, encoding: .utf8), !content.isEmpty else { @@ -142,12 +142,12 @@ enum VoxtLog { trimIfNeeded() } - private static func trimIfNeeded() { + private nonisolated static func trimIfNeeded() { guard logLines.count > maxStoredLines else { return } logLines = Array(logLines.suffix(maxStoredLines)) } - private static func writeAllLines() { + private nonisolated static func writeAllLines() { do { try FileManager.default.createDirectory( at: logFileURL.deletingLastPathComponent(), diff --git a/Voxt/Transcription/RemoteASRStreamingContexts.swift b/Voxt/Transcription/RemoteASRStreamingContexts.swift index b6aaf10..88e6e6a 100644 --- a/Voxt/Transcription/RemoteASRStreamingContexts.swift +++ b/Voxt/Transcription/RemoteASRStreamingContexts.swift @@ -20,6 +20,7 @@ final class AliyunQwenStreamingContext { let ws: URLSessionWebSocketTask let responseState: AliyunQwenResponseState let generationID: UUID + let kind: AliyunQwenRealtimeSessionKind var isClosed = false var didStartAudioStream = false @@ -27,12 +28,14 @@ final class AliyunQwenStreamingContext { session: URLSession, ws: URLSessionWebSocketTask, responseState: AliyunQwenResponseState, - generationID: UUID + generationID: UUID, + kind: AliyunQwenRealtimeSessionKind ) { self.session = session self.ws = ws self.responseState = responseState self.generationID = generationID + self.kind = kind } } @@ -57,6 +60,9 @@ actor AliyunQwenResponseState { } func markCompletedWithError(_ error: Error) { + if sessionFinished { + return + } if completionError == nil { completionError = error onError(error) diff --git a/Voxt/Transcription/RemoteASRSupport.swift b/Voxt/Transcription/RemoteASRSupport.swift index 92a7b2c..7ec95a9 100644 --- a/Voxt/Transcription/RemoteASRSupport.swift +++ b/Voxt/Transcription/RemoteASRSupport.swift @@ -1,5 +1,58 @@ import Foundation +enum AliyunQwenRealtimeSessionKind: Equatable { + case qwenASR + case omniASR + + var transcriptionModel: String? { + switch self { + case .qwenASR: + return nil + case .omniASR: + return "qwen3-asr-flash-realtime" + } + } + + var shouldCommitBeforeFinish: Bool { + switch self { + case .qwenASR: + return false + case .omniASR: + return false + } + } +} + +enum AliyunQwenRealtimePayloadSupport { + static func sessionUpdatePayload( + kind: AliyunQwenRealtimeSessionKind, + hintPayload: ResolvedASRHintPayload + ) -> [String: Any] { + var transcriptionPayload: [String: Any] = [:] + if let transcriptionModel = kind.transcriptionModel { + transcriptionPayload["model"] = transcriptionModel + } + if let language = hintPayload.language?.trimmingCharacters(in: .whitespacesAndNewlines), !language.isEmpty { + transcriptionPayload["language"] = language + } + return [ + "event_id": UUID().uuidString.lowercased(), + "type": "session.update", + "session": [ + "modalities": ["text"], + "input_audio_format": "pcm", + "sample_rate": 16000, + "input_audio_transcription": transcriptionPayload, + "turn_detection": [ + "type": "server_vad", + "threshold": 0.0, + "silence_duration_ms": 400 + ] + ] + ] + } +} + enum RemoteASRTextSupport { static func extractTextFragment(fromLine line: String) -> String? 
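// Sketch (illustrative, not part of the diff): model-string routing through
// the helpers added just below in RemoteASRSupport, per their prefix rules:
func aliyunSessionKindChecks() {
    assert(RemoteASREndpointSupport.aliyunQwenRealtimeSessionKind(
        for: "qwen3-asr-flash-realtime") == .qwenASR)
    assert(RemoteASREndpointSupport.aliyunQwenRealtimeSessionKind(
        for: "qwen3.5-omni-plus-realtime") == .omniASR)
    assert(RemoteASREndpointSupport.aliyunQwenRealtimeSessionKind(
        for: "fun-asr-realtime") == nil)  // Fun ASR keeps its own streaming path
}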
{ let trimmed = line.trimmingCharacters(in: .whitespacesAndNewlines) @@ -273,6 +326,23 @@ enum RemoteASREndpointSupport { return normalized.hasPrefix("qwen3-asr-flash-realtime") } + static func isAliyunOmniRealtimeModel(_ model: String) -> Bool { + let normalized = model.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + return normalized.hasPrefix("qwen3.5-omni-flash-realtime") + || normalized.hasPrefix("qwen3.5-omni-plus-realtime") + || normalized.hasPrefix("qwen-omni-turbo-realtime") + } + + static func aliyunQwenRealtimeSessionKind(for model: String) -> AliyunQwenRealtimeSessionKind? { + if isAliyunQwenRealtimeModel(model) { + return .qwenASR + } + if isAliyunOmniRealtimeModel(model) { + return .omniASR + } + return nil + } + static func isAliyunFileTranscriptionModel(_ model: String) -> Bool { let normalized = model.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() return normalized.hasPrefix("qwen3-asr-flash-filetrans") diff --git a/Voxt/Transcription/RemoteASRTranscriber+AliyunDebug.swift b/Voxt/Transcription/RemoteASRTranscriber+AliyunDebug.swift index 4945a6e..6f6a19b 100644 --- a/Voxt/Transcription/RemoteASRTranscriber+AliyunDebug.swift +++ b/Voxt/Transcription/RemoteASRTranscriber+AliyunDebug.swift @@ -155,6 +155,7 @@ extension RemoteASRTranscriber { let responseState = AliyunQwenResponseState() let startSignal = AsyncGate() + let kind = RemoteASREndpointSupport.aliyunQwenRealtimeSessionKind(for: model) ?? .qwenASR let receiveTask = Task { do { while !Task.isCancelled { @@ -181,7 +182,7 @@ extension RemoteASRTranscriber { } } - sendAliyunQwenSessionUpdate(through: ws, hintPayload: hintPayload) { error in + sendAliyunQwenSessionUpdate(through: ws, hintPayload: hintPayload, kind: kind) { error in Task { if let error { await responseState.markCompletedWithError(error) @@ -293,6 +294,12 @@ extension RemoteASRTranscriber { return } + if type.hasPrefix("response.") + || type.hasPrefix("output_audio.") + || (type.hasPrefix("conversation.item.") && !type.hasPrefix("conversation.item.input_audio_transcription.")) { + return + } + if type == "conversation.item.input_audio_transcription.text" { let partial = (object["text"] as? String ?? "").trimmingCharacters(in: .whitespacesAndNewlines) if !partial.isEmpty { diff --git a/Voxt/Transcription/RemoteASRTranscriber.swift b/Voxt/Transcription/RemoteASRTranscriber.swift index f1a0c15..eade56f 100644 --- a/Voxt/Transcription/RemoteASRTranscriber.swift +++ b/Voxt/Transcription/RemoteASRTranscriber.swift @@ -65,6 +65,7 @@ class RemoteASRTranscriber: NSObject, ObservableObject, TranscriberProtocol { private var didRetryDoubaoCaptureStartup = false private var doubaoCaptureUsesPreferredInputDevice = false private let doubaoCaptureStartupWatchdogDelay: Duration = .seconds(1.2) + private let aliyunRealtimeStopDrainDelay: Duration = .milliseconds(180) func setPreferredInputDevice(_ deviceID: AudioDeviceID?) 
{ preferredInputDeviceID = deviceID @@ -131,8 +132,12 @@ class RemoteASRTranscriber: NSObject, ObservableObject, TranscriberProtocol { if provider == .aliyunBailianASR { do { - if RemoteASREndpointSupport.isAliyunQwenRealtimeModel(configuration.model) { - try startAliyunQwenRealtimeStreaming(configuration: configuration, hintPayload: hintPayload) + if let kind = RemoteASREndpointSupport.aliyunQwenRealtimeSessionKind(for: configuration.model) { + try startAliyunQwenRealtimeStreaming( + configuration: configuration, + hintPayload: hintPayload, + kind: kind + ) } else { try startAliyunFunStreaming(configuration: configuration, hintPayload: hintPayload) } @@ -333,10 +338,29 @@ class RemoteASRTranscriber: NSObject, ObservableObject, TranscriberProtocol { } private func stopAliyunQwenStreaming(_ context: AliyunQwenStreamingContext) { - isRecording = false - stopAliyunAudioCapture() - guard !context.isClosed else { return } + Task { @MainActor [weak self] in + guard let self else { return } + guard self.isCurrentGeneration(context.generationID), + self.aliyunQwenStreamingContext === context, + !context.isClosed + else { return } + + // Keep capture alive briefly so the last queued tap callbacks can append + // trailing speech before we close the realtime session. + try? await Task.sleep(for: self.aliyunRealtimeStopDrainDelay) + guard self.isCurrentGeneration(context.generationID), + self.aliyunQwenStreamingContext === context, + !context.isClosed + else { return } + + self.isRecording = false + self.stopAliyunAudioCapture() + self.sendAliyunQwenFinishEvent(context) + } + } + + private func sendAliyunQwenFinishEvent(_ context: AliyunQwenStreamingContext) { sendAliyunQwenEvent( type: "session.finish", through: context.ws @@ -775,14 +799,14 @@ class RemoteASRTranscriber: NSObject, ObservableObject, TranscriberProtocol { ? 
RemoteASRProvider.aliyunBailianASR.suggestedModel : configuration.model.trimmingCharacters(in: .whitespacesAndNewlines) guard RemoteASREndpointSupport.isAliyunFunRealtimeModel(model) - || RemoteASREndpointSupport.isAliyunQwenRealtimeModel(model) + || RemoteASREndpointSupport.aliyunQwenRealtimeSessionKind(for: model) != nil || RemoteASREndpointSupport.isAliyunFileTranscriptionModel(model) || AliyunMeetingASRConfiguration.routing(for: model) == .compatibleShortAudio else { throw NSError( domain: "Voxt.RemoteASR", code: -33, - userInfo: [NSLocalizedDescriptionKey: "Aliyun ASR in Voxt supports Qwen/Fun/Paraformer transcription models only."] + userInfo: [NSLocalizedDescriptionKey: "Aliyun ASR in Voxt supports Qwen/Omni/Fun/Paraformer transcription models only."] ) } @@ -790,7 +814,7 @@ class RemoteASRTranscriber: NSObject, ObservableObject, TranscriberProtocol { guard !token.isEmpty else { throw NSError(domain: "Voxt.RemoteASR", code: -30, userInfo: [NSLocalizedDescriptionKey: "Aliyun Bailian API key is empty."]) } - if RemoteASREndpointSupport.isAliyunQwenRealtimeModel(model) { + if RemoteASREndpointSupport.aliyunQwenRealtimeSessionKind(for: model) != nil { return try await transcribeAliyunQwenRealtimeFile( fileURL: fileURL, token: token, @@ -1088,7 +1112,8 @@ class RemoteASRTranscriber: NSObject, ObservableObject, TranscriberProtocol { private func startAliyunQwenRealtimeStreaming( configuration: RemoteProviderConfiguration, - hintPayload: ResolvedASRHintPayload + hintPayload: ResolvedASRHintPayload, + kind: AliyunQwenRealtimeSessionKind ) throws { let token = configuration.apiKey.trimmingCharacters(in: .whitespacesAndNewlines) guard !token.isEmpty else { @@ -1119,11 +1144,12 @@ class RemoteASRTranscriber: NSObject, ObservableObject, TranscriberProtocol { session: managedSocket.session, ws: ws, responseState: responseState, - generationID: recordingGenerationID + generationID: recordingGenerationID, + kind: kind ) aliyunQwenStreamingContext = context receiveAliyunQwenMessages(context) - sendAliyunQwenSessionUpdate(through: ws, hintPayload: hintPayload) { error in + sendAliyunQwenSessionUpdate(through: ws, hintPayload: hintPayload, kind: kind) { error in Task { [responseState] in if let error { await responseState.markCompletedWithError(error) @@ -1176,6 +1202,15 @@ class RemoteASRTranscriber: NSObject, ObservableObject, TranscriberProtocol { let type = (object["type"] as? String ?? "").lowercased() if type == "error" { let detail = (object["message"] as? String) ?? "Aliyun Qwen realtime ASR task failed." + if await shouldIgnoreTrailingAliyunQwenGenericError( + detail: detail, + context: context + ) { + context.isClosed = true + await context.responseState.markSessionFinished() + return + } + VoxtLog.info("Aliyun qwen realtime error packet received. detail=\(detail)", verbose: true) throw NSError(domain: "Voxt.RemoteASR", code: -46, userInfo: [NSLocalizedDescriptionKey: detail]) } @@ -1189,6 +1224,12 @@ class RemoteASRTranscriber: NSObject, ObservableObject, TranscriberProtocol { return } + if type.hasPrefix("response.") + || type.hasPrefix("output_audio.") + || (type.hasPrefix("conversation.item.") && !type.hasPrefix("conversation.item.input_audio_transcription.")) { + return + } + if type == "conversation.item.input_audio_transcription.text" { let partial = (object["text"] as? String ?? 
"").trimmingCharacters(in: .whitespacesAndNewlines) if !partial.isEmpty { @@ -1246,30 +1287,31 @@ class RemoteASRTranscriber: NSObject, ObservableObject, TranscriberProtocol { isRecording = true } + private func shouldIgnoreTrailingAliyunQwenGenericError( + detail: String, + context: AliyunQwenStreamingContext + ) async -> Bool { + guard stopRequested else { return false } + let normalized = detail.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + guard normalized.isEmpty + || normalized == "aliyun qwen realtime asr task failed." + || normalized == "aliyun qwen realtime task failed." + || normalized == "task failed" + else { return false } + let currentText = await context.responseState.currentText() + return !currentText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty + } + func sendAliyunQwenSessionUpdate( through ws: URLSessionWebSocketTask, hintPayload: ResolvedASRHintPayload, + kind: AliyunQwenRealtimeSessionKind = .qwenASR, onError: @escaping (Error?) -> Void ) { - var transcriptionPayload: [String: Any] = [:] - if let language = hintPayload.language?.trimmingCharacters(in: .whitespacesAndNewlines), !language.isEmpty { - transcriptionPayload["language"] = language - } - let payload: [String: Any] = [ - "event_id": UUID().uuidString.lowercased(), - "type": "session.update", - "session": [ - "modalities": ["text"], - "input_audio_format": "pcm", - "sample_rate": 16000, - "input_audio_transcription": transcriptionPayload, - "turn_detection": [ - "type": "server_vad", - "threshold": 0.0, - "silence_duration_ms": 400 - ] - ] - ] + let payload = AliyunQwenRealtimePayloadSupport.sessionUpdatePayload( + kind: kind, + hintPayload: hintPayload + ) sendAliyunQwenEvent(payload: payload, through: ws, onError: onError) } @@ -2721,7 +2763,8 @@ class RemoteASRTranscriber: NSObject, ObservableObject, TranscriberProtocol { ) { completedAudioArchiveURL = tempURL VoxtLog.info( - "Remote streaming audio archive staged. samples=\(samples.count), sampleRate=\(Int(streamingInputSampleRate)), file=\(tempURL.lastPathComponent), realtime=\(realtimeSummary)" + "Remote streaming audio archive staged. samples=\(samples.count), sampleRate=\(Int(streamingInputSampleRate)), file=\(tempURL.lastPathComponent), realtime=\(realtimeSummary)", + verbose: true ) } } catch { diff --git a/Voxt/Transcription/WhisperKitTranscriber.swift b/Voxt/Transcription/WhisperKitTranscriber.swift index 489e400..902bba8 100644 --- a/Voxt/Transcription/WhisperKitTranscriber.swift +++ b/Voxt/Transcription/WhisperKitTranscriber.swift @@ -4,6 +4,184 @@ import Combine import AudioToolbox import WhisperKit +public struct WhisperRealtimeEagerState { + private static let stableHoldbackCharacterCount = 4 + private static let minimumNewUtteranceCharacterCount = 4 + + public private(set) var stableCommittedText = "" + public private(set) var currentCommittedText = "" + public private(set) var liveCandidateText = "" + public private(set) var lastRawCandidateText = "" + public private(set) var publishedText = "" + public private(set) var continuesFromCommittedPrefix = false + + public init() {} + + public mutating func reset() { + stableCommittedText = "" + currentCommittedText = "" + liveCandidateText = "" + lastRawCandidateText = "" + publishedText = "" + continuesFromCommittedPrefix = false + } + + public mutating func apply(_ result: TranscriptionResult) -> String? { + apply(hypothesisText: result.text) + } + + mutating func apply( + hypothesisText text: String + ) -> String? 
{ + let normalized = Self.normalize(text) + guard !normalized.isEmpty else { return nil } + let candidate = resolvedCurrentUtteranceText(from: normalized) + guard !candidate.isEmpty else { + continuesFromCommittedPrefix = true + return nil + } + + if continuesFromCommittedPrefix, + liveCandidateText.isEmpty, + lastRawCandidateText.isEmpty, + !stableCommittedText.isEmpty, + candidate.count < Self.minimumNewUtteranceCharacterCount { + return nil + } + + let displayCandidate = resolvedDisplayCandidate( + previousCandidate: lastRawCandidateText, + currentCandidate: candidate + ) + + if !lastRawCandidateText.isEmpty { + let agreedCount = Self.longestCommonPrefixCount( + Array(lastRawCandidateText), + Array(displayCandidate) + ) + let commitCount = max(currentCommittedText.count, max(0, agreedCount - Self.stableHoldbackCharacterCount)) + if commitCount > currentCommittedText.count { + currentCommittedText = String(displayCandidate.prefix(commitCount)) + } + } else { + currentCommittedText = "" + } + + lastRawCandidateText = displayCandidate + liveCandidateText = displayCandidate + continuesFromCommittedPrefix = false + return publish(stableCommittedText + displayCandidate) + } + + public mutating func applyFinal(_ text: String) -> String? { + let normalized = Self.normalize(text) + reset() + return publish(normalized, force: true) + } + + public mutating func sealCurrentPublishedTextForNextUtterance() { + let committed = publishedText + stableCommittedText = committed + currentCommittedText = "" + liveCandidateText = "" + lastRawCandidateText = "" + continuesFromCommittedPrefix = true + } + + var mutablePublishedCharacterCount: Int { + max(0, liveCandidateText.count - currentCommittedText.count) + } + + private mutating func publish(_ text: String, force: Bool = false) -> String? 
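+ // Publishes only when the normalized text actually changed (force is used by
+ // applyFinal after a reset); returns nil on a no-op so callers can skip
+ // redundant partial-text callbacks.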
{ + let normalized = Self.normalize(text) + guard force || normalized != publishedText else { return nil } + publishedText = normalized + return normalized + } + + private func resolvedCurrentUtteranceText(from fullText: String) -> String { + guard !stableCommittedText.isEmpty else { return fullText } + if fullText.hasPrefix(stableCommittedText) { + return String(fullText.dropFirst(stableCommittedText.count)) + } + + if stableCommittedText.contains(fullText) { + return "" + } + + let overlapCount = Self.longestSuffixPrefixOverlapCount( + sourceCharacters: Array(stableCommittedText), + candidateCharacters: Array(fullText) + ) + if overlapCount >= 2 { + return String(fullText.dropFirst(overlapCount)) + } + + return fullText + } + + private func resolvedDisplayCandidate( + previousCandidate: String, + currentCandidate: String + ) -> String { + guard !previousCandidate.isEmpty else { return currentCandidate } + if currentCandidate.hasPrefix(previousCandidate) { + return currentCandidate + } + if previousCandidate.hasPrefix(currentCandidate) { + return previousCandidate + } + + let overlapCount = Self.longestSuffixPrefixOverlapCount( + sourceCharacters: Array(previousCandidate), + candidateCharacters: Array(currentCandidate) + ) + guard overlapCount >= 2 else { return currentCandidate } + return previousCandidate + String(currentCandidate.dropFirst(overlapCount)) + } + + private static func normalize(_ text: String) -> String { + text.trimmingCharacters(in: .whitespacesAndNewlines) + } + + private static func longestCommonPrefixCount(_ lhs: [Character], _ rhs: [Character]) -> Int { + let upperBound = min(lhs.count, rhs.count) + var count = 0 + while count < upperBound, lhs[count] == rhs[count] { + count += 1 + } + return count + } + + private static func longestSuffixPrefixOverlapCount( + sourceCharacters: [Character], + candidateCharacters: [Character] + ) -> Int { + let upperBound = min(sourceCharacters.count, candidateCharacters.count) + guard upperBound > 0 else { return 0 } + for overlap in stride(from: upperBound, through: 1, by: -1) { + if Array(sourceCharacters.suffix(overlap)) == Array(candidateCharacters.prefix(overlap)) { + return overlap + } + } + return 0 + } + +} + +struct WhisperRealtimeReplayEvent: Equatable { + let elapsedSeconds: Double + let text: String + let isFinal: Bool + let source: String + let rawText: String +} + +struct WhisperRealtimeReplayDiagnostics: Equatable { + let events: [WhisperRealtimeReplayEvent] + let trace: [String] +} + @MainActor final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { private final class AudioSampleStore { @@ -49,49 +227,37 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { } } - private actor RealtimeStartGate { - enum Outcome { - case success - case failure(Error) - } - - private var outcome: Outcome? - private var continuation: CheckedContinuation? 
- - func wait() async -> Outcome { - if let outcome { - return outcome - } - return await withCheckedContinuation { continuation in - if let outcome { - continuation.resume(returning: outcome) - return - } - self.continuation = continuation + private enum WhisperInferenceProfile: String { + case offline + case realtimeDraft + case realtimeEager + case realtimeFinal + + var usesLiveDecodingBias: Bool { + switch self { + case .offline: + return false + case .realtimeDraft, .realtimeEager, .realtimeFinal: + return true } } - - func succeed() { - resolve(.success) - } - - func fail(_ error: Error) { - resolve(.failure(error)) - } - - private func resolve(_ outcome: Outcome) { - guard self.outcome == nil else { - return - } - self.outcome = outcome - let continuation = self.continuation - self.continuation = nil - continuation?.resume(returning: outcome) - } } static let offlinePartialPollInterval: Duration = .seconds(6) static let offlineFirstPartialMinimumSeconds: Double = 5.0 + static let realtimeEagerPollInterval: Duration = .milliseconds(250) + static let realtimeEagerFirstPassMinimumSeconds: Double = 0.35 + static let realtimeEagerSteadyStateMinimumSeconds: Double = 0.65 + static let realtimeEagerMinimumNewAudioSeconds: Double = 0.18 + static let realtimeDraftBootstrapSeconds: Double = 2.8 + static let realtimeDraftBootstrapCharacterCount = 18 + static let realtimeDraftWindowSeconds: Double = 1.6 + static let realtimeDraftFallbackStallSeconds: Double = 0.55 + static let realtimeSilenceWindowSeconds: Double = 0.45 + static let realtimeSilenceRMSHoldThreshold: Float = 0.0035 + static let realtimeSilencePeakHoldThreshold: Float = 0.018 + static let realtimeSegmentOverlapSeconds: Double = 0.8 + nonisolated static let realtimeLongFormFinalProfileThresholdSeconds: Double = 30 @Published var isRecording = false @Published var isModelInitializing = false @@ -118,11 +284,18 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { private var partialLoopTask: Task? private var finalizationTask: Task? private var captureWatchdogTask: Task? - private var realtimeTranscriptionTask: Task? - private var audioStreamTranscriber: AudioStreamTranscriber? + private var realtimeEagerTask: Task? + private var realtimeLevelTask: Task? private var activeUseHeld = false private var isInferenceRunning = false private var didRetryCaptureStartup = false + private var realtimeEagerLastSampleCount = 0 + private var realtimeEagerLastPublishedSampleCount = 0 + private var realtimeEagerState = WhisperRealtimeEagerState() + private var realtimeTraceEntries: [String] = [] + private var realtimeCommittedSampleCount = 0 + private var realtimeWasRecentlySpeaking = false + private var realtimeDidFlushCurrentSilence = false private let captureStartupWatchdogDelay: Duration = .seconds(1.2) private let targetSampleRate = Double(WhisperKit.sampleRate) @@ -230,6 +403,14 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { guard isRecording else { return } let revision = sessionRevision + let stopSampleCount = whisperRealtimeEnabled ? snapshotPreparedAudioSamples().count : sampleStore.count() + let stopBufferedSeconds = Double(stopSampleCount) / max(whisperRealtimeEnabled ? targetSampleRate : inputSampleRate, 1) + VoxtLog.info( + """ + Whisper stop requested. 
revision=\(revision), realtime=\(whisperRealtimeEnabled), sampleCount=\(stopSampleCount), bufferedSec=\(String(format: "%.2f", stopBufferedSeconds)), partialChars=\(transcribedText.count) + """, + verbose: true + ) isRecording = false isModelInitializing = false audioLevel = 0 @@ -238,14 +419,14 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { partialLoopTask = nil captureWatchdogTask?.cancel() captureWatchdogTask = nil + realtimeEagerTask?.cancel() + realtimeEagerTask = nil isFinalizingTranscription = true finalizationTask?.cancel() if whisperRealtimeEnabled { - let streamTranscriber = audioStreamTranscriber - audioStreamTranscriber = nil + preparedWhisper?.audioProcessor.stopRecording() finalizationTask = Task { [weak self] in - await streamTranscriber?.stopStreamTranscription() guard let self else { return } await self.runFinalTranscription( revision: revision, @@ -302,6 +483,250 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { return normalizeText(results.map(\.text).joined(separator: " ")) } + func debugReplayRealtimeAudioFile( + _ fileURL: URL, + outputMode: SessionOutputMode = .transcription, + useBuiltInTranslationTask: Bool = false, + stepSeconds: Double = 0.25 + ) async throws -> [WhisperRealtimeReplayEvent] { + try await debugReplayRealtimeAudioFileWithTrace( + fileURL, + outputMode: outputMode, + useBuiltInTranslationTask: useBuiltInTranslationTask, + stepSeconds: stepSeconds + ).events + } + + func debugReplayRealtimeAudioFileWithTrace( + _ fileURL: URL, + outputMode: SessionOutputMode = .transcription, + useBuiltInTranslationTask: Bool = false, + stepSeconds: Double = 0.25 + ) async throws -> WhisperRealtimeReplayDiagnostics { + let loaded = try DebugAudioClipIO.loadMonoSamples(from: fileURL) + preparedOutputMode = outputMode + preparedUseBuiltInTranslationTask = useBuiltInTranslationTask + let whisper = try await modelManager.loadWhisper() + let preparedSamples = prepareInputSamples(loaded.samples, sampleRate: loaded.sampleRate) + let stepSampleCount = max(Int(stepSeconds * targetSampleRate), 1) + var eagerState = WhisperRealtimeEagerState() + var events: [WhisperRealtimeReplayEvent] = [] + var trace: [String] = [] + var lastPublishedEndSample = 0 + var committedSampleCount = 0 + var wasRecentlySpeaking = false + var didFlushCurrentSilence = false + + var endSample = stepSampleCount + while endSample <= preparedSamples.count { + let windowSamples = Array(preparedSamples.prefix(endSample)) + let overlapSampleCount = max(Int(Self.realtimeSegmentOverlapSeconds * targetSampleRate), 0) + let activeSegmentStartSample = max(0, min(committedSampleCount, windowSamples.count) - overlapSampleCount) + let activeSegmentSamples = Array(windowSamples.suffix(windowSamples.count - activeSegmentStartSample)) + let minimumSeconds = eagerState.mutablePublishedCharacterCount == 0 + ? Self.realtimeEagerFirstPassMinimumSeconds + : Self.realtimeEagerSteadyStateMinimumSeconds + if Double(activeSegmentSamples.count) / targetSampleRate >= minimumSeconds { + let bufferedSeconds = Double(activeSegmentSamples.count) / targetSampleRate + let publishedStallSeconds = Double(max(endSample - lastPublishedEndSample, 0)) / targetSampleRate + let published: (text: String, source: String, rawText: String)? 
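+ // Branch order below mirrors the live loop in runRealtimeEagerPassIfNeeded:
+ // (1) speech-to-silence boundary: one higher-quality .realtimeFinal pass, then
+ //     seal the utterance; (2) ongoing silence: hold the published text;
+ // (3) short, sparsely published sessions: fast .realtimeDraft window;
+ // (4) otherwise: the .realtimeEager pass over the active segment.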
+                    let usesBootstrapDraft = Self.shouldUseRealtimeDraftBootstrap(
+                        bufferedSeconds: bufferedSeconds,
+                        publishedCharacterCount: eagerState.mutablePublishedCharacterCount
+                    )
+                    let hasRecentSpeech = Self.hasRecentSpeechActivity(
+                        samples: windowSamples,
+                        targetSampleRate: targetSampleRate
+                    )
+                    if hasRecentSpeech {
+                        wasRecentlySpeaking = true
+                        didFlushCurrentSilence = false
+                    }
+                    let shouldFlushSilenceBoundary = !hasRecentSpeech &&
+                        wasRecentlySpeaking &&
+                        !didFlushCurrentSilence &&
+                        !eagerState.publishedText.isEmpty &&
+                        publishedStallSeconds >= Self.realtimeDraftFallbackStallSeconds
+                    if shouldFlushSilenceBoundary {
+                        let result = try await whisper.transcribe(
+                            audioArray: activeSegmentSamples,
+                            decodeOptions: buildDecodingOptions(
+                                whisper: whisper,
+                                includeWordTimings: false,
+                                profile: .realtimeFinal
+                            )
+                        ).first
+                        let silencePublished = result.flatMap { result in
+                            eagerState.apply(hypothesisText: result.text).map {
+                                (
+                                    text: $0,
+                                    source: "silence-flush",
+                                    rawText: result.text
+                                )
+                            }
+                        }
+                        trace.append(
+                            String(
+                                format: "[%.1fs] silence-flush raw=%@ published=%@",
+                                Double(endSample) / targetSampleRate,
+                                Self.traceQuoted(normalizeText(result?.text ?? "")),
+                                Self.traceQuoted(normalizeText(silencePublished?.text ?? eagerState.publishedText))
+                            )
+                        )
+                        if let silencePublished {
+                            let normalized = normalizeText(silencePublished.text)
+                            if !normalized.isEmpty, events.last?.text != normalized {
+                                lastPublishedEndSample = endSample
+                                events.append(
+                                    WhisperRealtimeReplayEvent(
+                                        elapsedSeconds: Double(endSample) / targetSampleRate,
+                                        text: normalized,
+                                        isFinal: false,
+                                        source: silencePublished.source,
+                                        rawText: normalizeText(silencePublished.rawText)
+                                    )
+                                )
+                            }
+                        }
+                        eagerState.sealCurrentPublishedTextForNextUtterance()
+                        committedSampleCount = endSample
+                        lastPublishedEndSample = endSample
+                        didFlushCurrentSilence = true
+                        wasRecentlySpeaking = false
+                        // End of clip: the flush above already sealed the utterance.
+                        if endSample == preparedSamples.count { break }
+                        endSample = min(endSample + stepSampleCount, preparedSamples.count)
+                        continue
+                    }
+                    let shouldHoldForSilence = !hasRecentSpeech &&
+                        !eagerState.publishedText.isEmpty &&
+                        publishedStallSeconds >= Self.realtimeDraftFallbackStallSeconds
+                    if shouldHoldForSilence {
+                        trace.append(
+                            String(
+                                format: "[%.1fs] hold/silence published=%@",
+                                Double(endSample) / targetSampleRate,
+                                Self.traceQuoted(eagerState.publishedText)
+                            )
+                        )
+                        // End of clip: break instead of holding forever on a silent tail.
+                        if endSample == preparedSamples.count { break }
+                        endSample = min(endSample + stepSampleCount, preparedSamples.count)
+                        continue
+                    }
+                    if usesBootstrapDraft {
+                        let draftWindowSampleCount = max(Int(Self.realtimeDraftWindowSeconds * targetSampleRate), 1)
+                        let draftWindow = Array(activeSegmentSamples.suffix(min(activeSegmentSamples.count, draftWindowSampleCount)))
+                        let result = try await whisper.transcribe(
+                            audioArray: draftWindow,
+                            decodeOptions: buildDecodingOptions(
+                                whisper: whisper,
+                                includeWordTimings: false,
+                                profile: .realtimeDraft
+                            )
+                        ).first
+                        published = result.flatMap { result in
+                            eagerState.apply(hypothesisText: result.text).map {
+                                (
+                                    text: $0,
+                                    source: "draft-bootstrap",
+                                    rawText: result.text
+                                )
+                            }
+                        }
+                        trace.append(
+                            String(
+                                format: "[%.1fs] draft-bootstrap raw=%@ published=%@",
+                                Double(endSample) / targetSampleRate,
+                                Self.traceQuoted(normalizeText(result?.text ?? "")),
+                                Self.traceQuoted(normalizeText(published?.text ?? 
eagerState.publishedText)) + ) + ) + } else { + let result = try await whisper.transcribe( + audioArray: activeSegmentSamples, + decodeOptions: buildDecodingOptions( + whisper: whisper, + includeWordTimings: false, + profile: .realtimeEager + ) + ).first + published = result.flatMap { result in + eagerState.apply(result).map { + ( + text: $0, + source: "eager", + rawText: result.text + ) + } + } + trace.append( + String( + format: "[%.1fs] eager raw=%@ published=%@", + Double(endSample) / targetSampleRate, + Self.traceQuoted(normalizeText(result?.text ?? "")), + Self.traceQuoted(normalizeText(published?.text ?? eagerState.publishedText)) + ) + ) + } + + if let published { + let normalized = normalizeText(published.text) + if !normalized.isEmpty, events.last?.text != normalized { + lastPublishedEndSample = endSample + events.append( + WhisperRealtimeReplayEvent( + elapsedSeconds: Double(endSample) / targetSampleRate, + text: normalized, + isFinal: false, + source: published.source, + rawText: normalizeText(published.rawText) + ) + ) + } + } + } + + if endSample == preparedSamples.count { + break + } + endSample = min(endSample + stepSampleCount, preparedSamples.count) + } + + let bufferedSeconds = Double(preparedSamples.count) / targetSampleRate + let useOfflineFinalProfile = Self.shouldUseOfflineFinalProfileForStop( + realtimeEnabled: true, + bufferedSeconds: bufferedSeconds + ) + let finalResults = try await whisper.transcribe( + audioArray: preparedSamples, + decodeOptions: buildDecodingOptions( + whisper: whisper, + includeWordTimings: false, + profile: useOfflineFinalProfile ? .offline : .realtimeFinal + ) + ) + let finalText = normalizeText(finalResults.map(\.text).joined(separator: " ")) + if !finalText.isEmpty { + events.append( + WhisperRealtimeReplayEvent( + elapsedSeconds: Double(preparedSamples.count) / targetSampleRate, + text: finalText, + isFinal: true, + source: "final", + rawText: finalText + ) + ) + trace.append( + String( + format: "[%.1fs] final/%@ raw=%@", + Double(preparedSamples.count) / targetSampleRate, + useOfflineFinalProfile ? 
"offline" : "realtime", + Self.traceQuoted(finalText) + ) + ) + } + + return WhisperRealtimeReplayDiagnostics(events: events, trace: trace) + } + func restartCaptureForPreferredInputDevice() throws { guard isRecording else { return } guard !whisperRealtimeEnabled else { @@ -321,34 +746,21 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { } do { - let startGate = RealtimeStartGate() - let streamTranscriber = try makeAudioStreamTranscriber( - whisper: whisper, - revision: revision, - startGate: startGate - ) - audioStreamTranscriber = streamTranscriber - realtimeTranscriptionTask = Task { [weak self] in - do { - try await streamTranscriber.startStreamTranscription() - } catch { - await startGate.fail(error) - await self?.handleRealtimeTranscriptionError(error, revision: revision) + inputSampleRate = targetSampleRate + try whisper.audioProcessor.startRecordingLive(inputDeviceID: preferredInputDeviceID) { [weak self] _ in + Task { @MainActor [weak self] in + self?.handleRealtimeAudioBuffer(revision: revision) } - self?.handleRealtimeTranscriptionTaskExit(revision: revision) - } - - let outcome = await startGate.wait() - switch outcome { - case .success: - return nil - case .failure(let error): - let message = String(localized: "Whisper failed to start recording.") - lastStartFailureMessage = message - VoxtLog.error("Whisper realtime start failed: \(error)") - cleanupPreparedWhisperIfNeeded() - return message } + isRecording = true + isModelInitializing = false + startRealtimeLevelUpdates(revision: revision) + startRealtimeEagerLoop(revision: revision) + VoxtLog.info( + "Whisper audio capture started. sampleRate=\(Int(targetSampleRate)), deviceID=\(preferredInputDeviceID.map(String.init(describing:)) ?? "default"), mode=realtime-eager", + verbose: true + ) + return nil } catch { let message = String(localized: "Whisper failed to start recording.") lastStartFailureMessage = message @@ -384,6 +796,18 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { } private func runFinalTranscription(revision: Int, samples: [Float], sampleRate: Double) async { + let bufferedSeconds = Double(samples.count) / max(sampleRate, 1) + let useOfflineFinalProfile = Self.shouldUseOfflineFinalProfileForStop( + realtimeEnabled: whisperRealtimeEnabled, + bufferedSeconds: bufferedSeconds + ) + let finalProfile: WhisperInferenceProfile = useOfflineFinalProfile ? .offline : .realtimeFinal + if whisperRealtimeEnabled, useOfflineFinalProfile { + VoxtLog.info( + "Whisper realtime finalization promoted to offline long-form profile. 
bufferedSec=\(String(format: "%.2f", bufferedSeconds))", + verbose: true + ) + } defer { if revision == sessionRevision { isFinalizingTranscription = false @@ -395,19 +819,54 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { samples: samples, sampleRate: sampleRate, includeWordTimings: whisperTimestampsEnabled, - publishFinalResult: true + publishFinalResult: true, + profile: finalProfile ) } + nonisolated static func shouldUseOfflineFinalProfileForStop( + realtimeEnabled: Bool, + bufferedSeconds: Double + ) -> Bool { + guard realtimeEnabled else { return true } + return bufferedSeconds >= Self.realtimeLongFormFinalProfileThresholdSeconds + } + + nonisolated static func reconcileRealtimeFinalText( + finalText: String, + latestPublishedText: String + ) -> String { + let normalizedFinal = finalText.trimmingCharacters(in: .whitespacesAndNewlines) + let normalizedLive = latestPublishedText.trimmingCharacters(in: .whitespacesAndNewlines) + + guard !normalizedLive.isEmpty else { return normalizedFinal } + guard !normalizedFinal.isEmpty else { return normalizedLive } + guard normalizedLive != normalizedFinal else { return normalizedFinal } + + if normalizedLive.hasPrefix(normalizedFinal) { + let finalCount = normalizedFinal.count + let liveCount = normalizedLive.count + let delta = liveCount - finalCount + let minimumExtraCharacters = max(8, Int(Double(liveCount) * 0.12)) + if delta >= minimumExtraCharacters { + return normalizedLive + } + } + + return normalizedFinal + } + private func runInference( revision: Int, samples: [Float], sampleRate: Double, includeWordTimings: Bool, - publishFinalResult: Bool + publishFinalResult: Bool, + profile: WhisperInferenceProfile = .offline ) async { guard !samples.isEmpty else { if publishFinalResult { + VoxtLog.warning("Whisper finalization produced an empty audio snapshot; finishing with empty transcription.") cleanupPreparedWhisperIfNeeded() onTranscriptionFinished?("") } @@ -425,36 +884,55 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { guard revision == sessionRevision else { return } guard let whisper = preparedWhisper else { if publishFinalResult { + VoxtLog.warning("Whisper finalization aborted because preparedWhisper was already released.") cleanupPreparedWhisperIfNeeded() onTranscriptionFinished?("") } return } - - isInferenceRunning = true defer { - isInferenceRunning = false if publishFinalResult { cleanupPreparedWhisperIfNeeded() } } do { - let preparedSamples = prepareInputSamples(samples, sampleRate: sampleRate) - let decodeOptions = buildDecodingOptions( + let inferenceStartedAt = Date() + let transcription = try await transcribePreparedSamples( whisper: whisper, - includeWordTimings: includeWordTimings + samples: samples, + sampleRate: sampleRate, + includeWordTimings: includeWordTimings, + profile: profile, + revision: revision ) - let results = try await whisper.transcribe(audioArray: preparedSamples, decodeOptions: decodeOptions) - guard revision == sessionRevision else { return } - - let text = normalizeText(results.map(\.text).joined(separator: " ")) + let preparedSamples = transcription.preparedSamples + let results = transcription.results + let text = transcription.text if publishFinalResult { + let latestPublishedText = transcribedText + let resolvedFinalText = Self.reconcileRealtimeFinalText( + finalText: text, + latestPublishedText: latestPublishedText + ) + let elapsedMs = max(Int(Date().timeIntervalSince(inferenceStartedAt) * 1000), 0) stageCompletedAudioArchive(samples: 
preparedSamples, sampleRate: targetSampleRate) latestWordTimings = includeWordTimings ? buildWordTimings(from: results) : [] - transcribedText = text - onPartialTranscription?(text) - onTranscriptionFinished?(text) + VoxtLog.info( + """ + Whisper final transcription ready. revision=\(revision), chars=\(resolvedFinalText.count), preparedSampleCount=\(preparedSamples.count), segmentCount=\(results.count), elapsedMs=\(elapsedMs) + """ + ) + if resolvedFinalText != text { + VoxtLog.info( + """ + Whisper final transcription preserved longer live hypothesis tail. revision=\(revision), finalChars=\(text.count), liveChars=\(latestPublishedText.trimmingCharacters(in: .whitespacesAndNewlines).count), resolvedChars=\(resolvedFinalText.count) + """, + verbose: true + ) + } + publishWhisperFinalText(resolvedFinalText) + onTranscriptionFinished?(resolvedFinalText) } else { transcribedText = text onPartialTranscription?(text) @@ -465,6 +943,11 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { let preparedSamples = prepareInputSamples(samples, sampleRate: sampleRate) stageCompletedAudioArchive(samples: preparedSamples, sampleRate: targetSampleRate) latestWordTimings = [] + VoxtLog.warning( + """ + Whisper final inference failed; falling back to latest partial text. revision=\(revision), fallbackChars=\(transcribedText.trimmingCharacters(in: .whitespacesAndNewlines).count), preparedSampleCount=\(preparedSamples.count) + """ + ) onTranscriptionFinished?(transcribedText.trimmingCharacters(in: .whitespacesAndNewlines)) } } @@ -472,12 +955,47 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { private func buildDecodingOptions( whisper: WhisperKit, - includeWordTimings: Bool + includeWordTimings: Bool, + profile: WhisperInferenceProfile = .offline ) -> DecodingOptions { let hintPayload = resolvedHintPayload() let tuningSettings = resolvedLocalTuningSettings() let resolvedTask = resolvedDecodingTask() - let detectLanguage = hintPayload.language == nil + let resolvedLanguage = resolvedWhisperLanguage(for: profile, hintPayload: hintPayload) + let detectLanguage = resolvedLanguage == nil + let temperature: Float + let temperatureIncrementOnFallback: Float + let temperatureFallbackCount: Int + let noSpeechThreshold: Float + let chunkingStrategy: ChunkingStrategy? + + switch profile { + case .offline: + temperature = whisperTemperature + temperatureIncrementOnFallback = Float(tuningSettings.temperatureIncrementOnFallback) + temperatureFallbackCount = tuningSettings.temperatureFallbackCount + noSpeechThreshold = Float(tuningSettings.noSpeechThreshold) + chunkingStrategy = whisperVADEnabled ? .vad : nil + case .realtimeDraft: + temperature = 0 + temperatureIncrementOnFallback = 0 + temperatureFallbackCount = 1 + noSpeechThreshold = Float(min(tuningSettings.noSpeechThreshold, 0.35)) + chunkingStrategy = nil + case .realtimeEager: + temperature = 0 + temperatureIncrementOnFallback = Float(min(tuningSettings.temperatureIncrementOnFallback, 0.1)) + temperatureFallbackCount = min(max(tuningSettings.temperatureFallbackCount, 1), 2) + noSpeechThreshold = Float(min(tuningSettings.noSpeechThreshold, 0.45)) + chunkingStrategy = nil + case .realtimeFinal: + temperature = 0 + temperatureIncrementOnFallback = Float(min(tuningSettings.temperatureIncrementOnFallback, 0.15)) + temperatureFallbackCount = max(tuningSettings.temperatureFallbackCount, 2) + noSpeechThreshold = Float(min(tuningSettings.noSpeechThreshold, 0.45)) + chunkingStrategy = nil + } + let promptTokens: [Int]? 
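+ // Summary of the switch above: .offline keeps the user's temperature, fallback
+ // ladder, and optional VAD chunking; the realtime profiles pin temperature to 0,
+ // bound the fallback ladder, cap noSpeechThreshold (0.35 draft, 0.45 eager/final),
+ // and disable chunking so each pass decodes one short window.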
if let prompt = hintPayload.prompt?.trimmingCharacters(in: .whitespacesAndNewlines), !prompt.isEmpty, @@ -488,18 +1006,20 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { promptTokens = nil } - VoxtLog.info( - "Whisper decode options. task=\(resolvedTask.description), language=\(hintPayload.language ?? "auto"), detectLanguage=\(detectLanguage), promptChars=\(hintPayload.prompt?.count ?? 0), promptTokens=\(promptTokens?.count ?? 0), realtime=\(whisperRealtimeEnabled)", - verbose: true - ) + if profile == .offline { + VoxtLog.info( + "Whisper decode options. profile=\(profile.rawValue), task=\(resolvedTask.description), language=\(resolvedLanguage ?? "auto"), detectLanguage=\(detectLanguage), promptChars=\(hintPayload.prompt?.count ?? 0), promptTokens=\(promptTokens?.count ?? 0), realtime=\(whisperRealtimeEnabled)", + verbose: true + ) + } return DecodingOptions( verbose: false, task: resolvedTask, - language: hintPayload.language, - temperature: whisperTemperature, - temperatureIncrementOnFallback: Float(tuningSettings.temperatureIncrementOnFallback), - temperatureFallbackCount: tuningSettings.temperatureFallbackCount, + language: resolvedLanguage, + temperature: temperature, + temperatureIncrementOnFallback: temperatureIncrementOnFallback, + temperatureFallbackCount: temperatureFallbackCount, usePrefillPrompt: true, detectLanguage: detectLanguage, skipSpecialTokens: true, @@ -508,8 +1028,8 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { promptTokens: promptTokens, compressionRatioThreshold: Float(tuningSettings.compressionRatioThreshold), logProbThreshold: Float(tuningSettings.logProbThreshold), - noSpeechThreshold: Float(tuningSettings.noSpeechThreshold), - chunkingStrategy: whisperVADEnabled ? 
.vad : nil + noSpeechThreshold: noSpeechThreshold, + chunkingStrategy: chunkingStrategy ) } @@ -569,7 +1089,13 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { isModelInitializing = false isFinalizingTranscription = false latestWordTimings = [] - audioStreamTranscriber = nil + realtimeEagerLastSampleCount = 0 + realtimeEagerLastPublishedSampleCount = 0 + realtimeEagerState.reset() + realtimeTraceEntries.removeAll(keepingCapacity: false) + realtimeCommittedSampleCount = 0 + realtimeWasRecentlySpeaking = false + realtimeDidFlushCurrentSilence = false preparedWhisper?.audioProcessor.stopRecording() preparedWhisper?.audioProcessor.purgeAudioSamples(keepingLast: 0) } @@ -602,8 +1128,16 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { isFinalizingTranscription = false captureWatchdogTask?.cancel() captureWatchdogTask = nil - realtimeTranscriptionTask?.cancel() - realtimeTranscriptionTask = nil + realtimeEagerTask?.cancel() + realtimeEagerTask = nil + realtimeLevelTask?.cancel() + realtimeLevelTask = nil + realtimeEagerLastSampleCount = 0 + realtimeEagerLastPublishedSampleCount = 0 + realtimeTraceEntries.removeAll(keepingCapacity: false) + realtimeCommittedSampleCount = 0 + realtimeWasRecentlySpeaking = false + realtimeDidFlushCurrentSilence = false } private func stopAudioEngine() { @@ -616,7 +1150,17 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { private func cleanupPreparedWhisperIfNeeded() { preparedWhisper?.audioProcessor.stopRecording() preparedWhisper?.audioProcessor.purgeAudioSamples(keepingLast: 0) - audioStreamTranscriber = nil + realtimeEagerTask?.cancel() + realtimeEagerTask = nil + realtimeLevelTask?.cancel() + realtimeLevelTask = nil + realtimeEagerLastSampleCount = 0 + realtimeEagerLastPublishedSampleCount = 0 + realtimeEagerState.reset() + realtimeTraceEntries.removeAll(keepingCapacity: false) + realtimeCommittedSampleCount = 0 + realtimeWasRecentlySpeaking = false + realtimeDidFlushCurrentSilence = false if activeUseHeld { modelManager.endActiveUse() activeUseHeld = false @@ -698,86 +1242,398 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { } } - private func makeAudioStreamTranscriber( - whisper: WhisperKit, - revision: Int, - startGate: RealtimeStartGate - ) throws -> AudioStreamTranscriber { - guard let tokenizer = whisper.tokenizer else { - throw NSError(domain: "Voxt.WhisperKitTranscriber", code: 1, userInfo: [ - NSLocalizedDescriptionKey: "Whisper tokenizer is unavailable." - ]) - } - - let decodeOptions = buildDecodingOptions(whisper: whisper, includeWordTimings: false) - inputSampleRate = targetSampleRate - return AudioStreamTranscriber( - audioEncoder: whisper.audioEncoder, - featureExtractor: whisper.featureExtractor, - segmentSeeker: whisper.segmentSeeker, - textDecoder: whisper.textDecoder, - tokenizer: tokenizer, - audioProcessor: whisper.audioProcessor, - decodingOptions: decodeOptions, - useVAD: whisperVADEnabled, - stateChangeCallback: { [weak self] oldState, newState in - Task { @MainActor [weak self] in - self?.handleRealtimeStateChange( - oldState: oldState, - newState: newState, - revision: revision, - startGate: startGate - ) + private func handleRealtimeAudioBuffer(revision: Int) { + guard revision == sessionRevision, isRecording, whisperRealtimeEnabled else { return } + audioLevel = resolvedRealtimeAudioLevel(from: preparedWhisper?.audioProcessor.relativeEnergy ?? 
[]) + } + + private func startRealtimeEagerLoop(revision: Int) { + realtimeEagerTask?.cancel() + realtimeEagerLastSampleCount = 0 + realtimeEagerLastPublishedSampleCount = 0 + realtimeEagerTask = Task { [weak self] in + while !Task.isCancelled { + do { + try await Task.sleep(for: Self.realtimeEagerPollInterval) + } catch { + return } + await self?.runRealtimeEagerPassIfNeeded(revision: revision) } + } + } + + private func runRealtimeEagerPassIfNeeded(revision: Int) async { + guard revision == sessionRevision, isRecording, whisperRealtimeEnabled else { return } + let samples = snapshotPreparedAudioSamples() + let sampleCount = samples.count + guard sampleCount > 0 else { return } + + let activeSegmentSamples = resolvedRealtimeInferenceSamples(from: samples) + let activeSegmentSampleCount = activeSegmentSamples.count + guard activeSegmentSampleCount > 0 else { return } + + let bufferedSeconds = Double(activeSegmentSampleCount) / targetSampleRate + let minimumSeconds = realtimeEagerState.mutablePublishedCharacterCount == 0 + ? Self.realtimeEagerFirstPassMinimumSeconds + : Self.realtimeEagerSteadyStateMinimumSeconds + guard bufferedSeconds >= minimumSeconds else { return } + let hasRecentSpeech = Self.hasRecentSpeechActivity( + samples: samples, + targetSampleRate: targetSampleRate ) + let publishedStallSeconds = Double(max(sampleCount - realtimeEagerLastPublishedSampleCount, 0)) / targetSampleRate + if hasRecentSpeech { + realtimeWasRecentlySpeaking = true + realtimeDidFlushCurrentSilence = false + } + let shouldFlushSilenceBoundary = !hasRecentSpeech && + realtimeWasRecentlySpeaking && + !realtimeDidFlushCurrentSilence && + !realtimeEagerState.publishedText.isEmpty && + publishedStallSeconds >= Self.realtimeDraftFallbackStallSeconds + if shouldFlushSilenceBoundary { + await reconcileRealtimeSilenceBoundary( + revision: revision, + samples: activeSegmentSamples, + sampleCount: sampleCount + ) + realtimeDidFlushCurrentSilence = true + realtimeWasRecentlySpeaking = false + return + } + let shouldHoldForSilence = !hasRecentSpeech && + !realtimeEagerState.publishedText.isEmpty && + publishedStallSeconds >= Self.realtimeDraftFallbackStallSeconds + if shouldHoldForSilence { + recordRealtimeTrace( + "hold", + sampleCount: sampleCount, + rawText: "", + publishedText: realtimeEagerState.publishedText, + note: "silence-hold" + ) + return + } + + if realtimeEagerLastSampleCount > 0 { + let newAudioSeconds = Double(max(sampleCount - realtimeEagerLastSampleCount, 0)) / targetSampleRate + guard newAudioSeconds >= Self.realtimeEagerMinimumNewAudioSeconds else { return } + } + realtimeEagerLastSampleCount = sampleCount + + let usesBootstrapDraft = shouldUseRealtimeDraftBootstrap(bufferedSeconds: bufferedSeconds) + if usesBootstrapDraft, + let draftCandidate = await makeRealtimeDraftCandidate(revision: revision, samples: activeSegmentSamples), + publishRealtimeDraftCandidate( + draftCandidate, + sampleCount: sampleCount, + source: "draft-bootstrap" + ) { + return + } + + guard let candidate = await makeRealtimeEagerCandidate(revision: revision, samples: activeSegmentSamples) else { return } + _ = publishRealtimeEagerCandidate(candidate, sampleCount: sampleCount) } - private func handleRealtimeStateChange( - oldState: AudioStreamTranscriber.State, - newState: AudioStreamTranscriber.State, + private func reconcileRealtimeSilenceBoundary( revision: Int, - startGate: RealtimeStartGate - ) { - guard revision == sessionRevision else { return } + samples: [Float], + sampleCount: Int + ) async { + if let candidate = 
await makeRealtimeSilenceReconcileCandidate( + revision: revision, + samples: samples + ) { + _ = publishRealtimeDraftCandidate( + candidate, + sampleCount: sampleCount, + source: "silence-flush" + ) + } else { + recordRealtimeTrace( + "silence-flush", + sampleCount: sampleCount, + rawText: "", + publishedText: realtimeEagerState.publishedText, + note: "reconcile-miss" + ) + } + realtimeEagerState.sealCurrentPublishedTextForNextUtterance() + realtimeCommittedSampleCount = sampleCount + realtimeEagerLastPublishedSampleCount = sampleCount + } - if newState.isRecording && !oldState.isRecording { - isRecording = true - Task { - await startGate.succeed() - } - VoxtLog.info( - "Whisper audio capture started. sampleRate=\(Int(targetSampleRate)), deviceID=\(preferredInputDeviceID.map(String.init(describing:)) ?? "default"), mode=realtime", - verbose: true + private func makeRealtimeDraftCandidate(revision: Int, samples: [Float]) async -> TranscriptionResult? { + guard let whisper = preparedWhisper else { return nil } + let windowSampleCount = max(Int(Self.realtimeDraftWindowSeconds * targetSampleRate), 1) + let draftSamples = Array(samples.suffix(min(samples.count, windowSampleCount))) + do { + let transcription = try await transcribePreparedSamples( + whisper: whisper, + samples: draftSamples, + sampleRate: targetSampleRate, + includeWordTimings: false, + profile: .realtimeDraft, + revision: revision ) + return transcription.results.first + } catch is CancellationError { + return nil + } catch { + VoxtLog.warning("Whisper realtime draft pass failed: \(error.localizedDescription)") + return nil } + } - audioLevel = max(newState.bufferEnergy.max() ?? 0, 0) - let mergedText = mergedRealtimeText(from: newState) - if !mergedText.isEmpty { - transcribedText = mergedText - onPartialTranscription?(mergedText) + private func makeRealtimeEagerCandidate(revision: Int, samples: [Float]) async -> TranscriptionResult? { + guard let whisper = preparedWhisper else { return nil } + do { + let transcription = try await transcribePreparedSamples( + whisper: whisper, + samples: samples, + sampleRate: targetSampleRate, + includeWordTimings: false, + profile: .realtimeEager, + revision: revision + ) + return transcription.results.first + } catch is CancellationError { + return nil + } catch { + VoxtLog.warning("Whisper realtime eager pass failed: \(error.localizedDescription)") + return nil } } - private func handleRealtimeTranscriptionError(_ error: Error, revision: Int) async { - guard revision == sessionRevision else { return } - VoxtLog.error("Whisper realtime transcription failed: \(error)") - guard isRecording else { return } - stopRecording() + private func makeRealtimeSilenceReconcileCandidate(revision: Int, samples: [Float]) async -> TranscriptionResult? 
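+ // Runs one .realtimeFinal-profile pass over the active segment at a
+ // speech-to-silence boundary so the utterance is sealed from the higher-quality
+ // hypothesis rather than the last eager draft. Cancellation yields nil; any
+ // other failure is logged and swallowed so the live loop keeps running.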
{ + guard let whisper = preparedWhisper else { return nil } + do { + let transcription = try await transcribePreparedSamples( + whisper: whisper, + samples: samples, + sampleRate: targetSampleRate, + includeWordTimings: false, + profile: .realtimeFinal, + revision: revision + ) + return transcription.results.first + } catch is CancellationError { + return nil + } catch { + VoxtLog.warning("Whisper realtime silence reconcile failed: \(error.localizedDescription)") + return nil + } } - private func handleRealtimeTranscriptionTaskExit(revision: Int) { - guard revision == sessionRevision else { return } - realtimeTranscriptionTask = nil + private func startRealtimeLevelUpdates(revision: Int) { + realtimeLevelTask?.cancel() + realtimeLevelTask = Task { [weak self] in + while !Task.isCancelled { + do { + try await Task.sleep(for: .milliseconds(50)) + } catch { + return + } + self?.publishRealtimeAudioLevelIfNeeded(revision: revision) + } + } + } + + private func publishRealtimeAudioLevelIfNeeded(revision: Int) { + guard revision == sessionRevision, isRecording, whisperRealtimeEnabled else { return } + let energy = preparedWhisper?.audioProcessor.relativeEnergy ?? [] + audioLevel = resolvedRealtimeAudioLevel(from: energy) + } + + private func resolvedRealtimeAudioLevel(from energy: [Float]) -> Float { + guard !energy.isEmpty else { return 0 } + let recentEnergy = Array(energy.suffix(4)) + guard !recentEnergy.isEmpty else { return 0 } + let peak = recentEnergy.max() ?? 0 + let average = recentEnergy.reduce(0, +) / Float(recentEnergy.count) + return min(max((peak * 0.7) + (average * 0.3), 0), 1) + } + + @discardableResult + private func publishRealtimeDraftCandidate( + _ result: TranscriptionResult, + sampleCount: Int, + source: String + ) -> Bool { + guard let published = realtimeEagerState.apply(hypothesisText: result.text) else { return false } + let normalized = normalizeText(published) + guard !normalized.isEmpty else { return false } + transcribedText = normalized + onPartialTranscription?(normalized) + realtimeEagerLastPublishedSampleCount = sampleCount + recordRealtimeTrace( + source, + sampleCount: sampleCount, + rawText: result.text, + publishedText: normalized + ) + return true } - private func mergedRealtimeText(from state: AudioStreamTranscriber.State) -> String { - let confirmed = state.confirmedSegments.map(\.text).joined() - let unconfirmedSegments = state.unconfirmedSegments.map(\.text).joined() - let fallback = state.currentText.isEmpty ? state.unconfirmedText.last ?? "" : state.currentText - let merged = normalizeText(confirmed + (unconfirmedSegments.isEmpty ? fallback : unconfirmedSegments)) - return merged == "Waiting for speech..." ? 
"" : merged + @discardableResult + private func publishRealtimeEagerCandidate(_ result: TranscriptionResult, sampleCount: Int) -> Bool { + guard let published = realtimeEagerState.apply(result) else { return false } + let normalized = normalizeText(published) + guard !normalized.isEmpty else { return false } + transcribedText = normalized + onPartialTranscription?(normalized) + realtimeEagerLastPublishedSampleCount = sampleCount + recordRealtimeTrace( + "eager", + sampleCount: sampleCount, + rawText: result.text, + publishedText: normalized + ) + return true + } + + private func shouldUseRealtimeDraftBootstrap(bufferedSeconds: Double) -> Bool { + Self.shouldUseRealtimeDraftBootstrap( + bufferedSeconds: bufferedSeconds, + publishedCharacterCount: realtimeEagerState.mutablePublishedCharacterCount + ) + } + + private static func shouldUseRealtimeDraftBootstrap( + bufferedSeconds: Double, + publishedCharacterCount: Int + ) -> Bool { + bufferedSeconds <= Self.realtimeDraftBootstrapSeconds && + publishedCharacterCount < Self.realtimeDraftBootstrapCharacterCount + } + + private func publishWhisperFinalText(_ text: String) { + if whisperRealtimeEnabled { + let normalized = normalizeText(text) + if let published = realtimeEagerState.applyFinal(normalized) { + transcribedText = published + onPartialTranscription?(published) + recordRealtimeTrace( + "final", + sampleCount: snapshotPreparedAudioSamples().count, + rawText: normalized, + publishedText: published + ) + } + return + } + + transcribedText = text + onPartialTranscription?(text) + } + + func consumeRealtimeTraceEntries() -> [String] { + let entries = realtimeTraceEntries + realtimeTraceEntries.removeAll(keepingCapacity: false) + return entries + } + + func debugCaptureStopSummary() -> String { + let sampleCount = whisperRealtimeEnabled ? snapshotPreparedAudioSamples().count : sampleStore.count() + let sampleRate = whisperRealtimeEnabled ? targetSampleRate : inputSampleRate + let bufferedSeconds = Double(sampleCount) / max(sampleRate, 1) + return """ + realtime=\(whisperRealtimeEnabled), sampleCount=\(sampleCount), bufferedSec=\(String(format: "%.2f", bufferedSeconds)), callbacks=\(sampleStore.callbacksReceived()), partialChars=\(transcribedText.count), finalizing=\(isFinalizingTranscription) + """ + } + + private func recordRealtimeTrace( + _ source: String, + sampleCount: Int, + rawText: String, + publishedText: String, + note: String? = nil + ) { + let seconds = Double(sampleCount) / targetSampleRate + let trace = String( + format: "[%.2fs] %@ raw=%@ published=%@%@", + seconds, + source, + Self.traceQuoted(normalizeText(rawText)), + Self.traceQuoted(normalizeText(publishedText)), + note.map { " note=\($0)" } ?? 
"" + ) + realtimeTraceEntries.append(trace) + if realtimeTraceEntries.count > 200 { + realtimeTraceEntries.removeFirst(realtimeTraceEntries.count - 200) + } + } + + private static func traceQuoted(_ text: String) -> String { + "\"\(text.replacingOccurrences(of: "\"", with: "\\\""))\"" + } + + private static func hasRecentSpeechActivity( + samples: [Float], + targetSampleRate: Double + ) -> Bool { + guard !samples.isEmpty else { return false } + let windowSampleCount = max(Int(Self.realtimeSilenceWindowSeconds * targetSampleRate), 1) + let recentSamples = samples.suffix(min(samples.count, windowSampleCount)) + guard !recentSamples.isEmpty else { return false } + + var sumSquares: Float = 0 + var peak: Float = 0 + for sample in recentSamples { + let magnitude = abs(sample) + sumSquares += magnitude * magnitude + peak = max(peak, magnitude) + } + + let rms = sqrt(sumSquares / Float(recentSamples.count)) + return rms >= Self.realtimeSilenceRMSHoldThreshold || peak >= Self.realtimeSilencePeakHoldThreshold + } + + private func resolvedRealtimeInferenceSamples(from samples: [Float]) -> [Float] { + guard !samples.isEmpty else { return [] } + let overlapSampleCount = max(Int(Self.realtimeSegmentOverlapSeconds * targetSampleRate), 0) + let startSample = max(0, min(realtimeCommittedSampleCount, samples.count) - overlapSampleCount) + guard startSample > 0 else { return samples } + return Array(samples[startSample...]) + } + + private func transcribePreparedSamples( + whisper: WhisperKit, + samples: [Float], + sampleRate: Double, + includeWordTimings: Bool, + profile: WhisperInferenceProfile, + revision: Int + ) async throws -> (preparedSamples: [Float], results: [TranscriptionResult], text: String) { + while isInferenceRunning { + do { + try await Task.sleep(for: .milliseconds(80)) + } catch { + throw CancellationError() + } + } + + guard revision == sessionRevision else { + throw CancellationError() + } + + isInferenceRunning = true + defer { isInferenceRunning = false } + + let preparedSamples = prepareInputSamples(samples, sampleRate: sampleRate) + let decodeOptions = buildDecodingOptions( + whisper: whisper, + includeWordTimings: includeWordTimings, + profile: profile + ) + let results = try await whisper.transcribe(audioArray: preparedSamples, decodeOptions: decodeOptions) + guard revision == sessionRevision else { + throw CancellationError() + } + let text = normalizeText(results.map(\.text).joined(separator: " ")) + return (preparedSamples, results, text) } private func snapshotPreparedAudioSamples() -> [Float] { @@ -807,7 +1663,7 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { } private var whisperRealtimeEnabled: Bool { - UserDefaults.standard.object(forKey: AppPreferenceKey.whisperRealtimeEnabled) as? Bool ?? true + UserDefaults.standard.object(forKey: AppPreferenceKey.whisperRealtimeEnabled) as? Bool ?? false } private var preferredMainLanguage: UserMainLanguageOption { @@ -821,6 +1677,27 @@ final class WhisperKitTranscriber: ObservableObject, TranscriberProtocol { return UserMainLanguageOption.fallbackOption() } + private func resolvedWhisperLanguage( + for profile: WhisperInferenceProfile, + hintPayload: ResolvedASRHintPayload + ) -> String? 
{ + if !profile.usesLiveDecodingBias { + return hintPayload.language + } + if let language = hintPayload.language { + return language + } + + let selectedCodes = UserMainLanguageOption.storedSelection( + from: UserDefaults.standard.string(forKey: AppPreferenceKey.userMainLanguageCodes) + ) + let selectedOptions = selectedCodes.compactMap(UserMainLanguageOption.option(for:)) + guard selectedOptions.count == 1, let mainLanguage = selectedOptions.first else { + return nil + } + return mainLanguage.baseLanguageCode + } + private func applyPreferredInputDeviceIfNeeded(inputNode: AVAudioInputNode) { guard let preferredInputDeviceID else { return } guard let audioUnit = inputNode.audioUnit else { return } diff --git a/Voxt/UI/ModelDebugCore.swift b/Voxt/UI/ModelDebugCore.swift index 809b6e0..f842112 100644 --- a/Voxt/UI/ModelDebugCore.swift +++ b/Voxt/UI/ModelDebugCore.swift @@ -40,6 +40,17 @@ struct ASRDebugResult: Identifiable, Equatable { var isError: Bool { errorText != nil } } +private func formatWhisperRealtimeReplay(_ events: [WhisperRealtimeReplayEvent]) -> String { + guard !events.isEmpty else { return modelDebugLocalized("No realtime replay output.") } + return events + .map { event in + let time = String(format: "%.1fs", event.elapsedSeconds) + let phase = event.isFinal ? "final" : "live" + return "[\(time)] \(phase): \(event.text)" + } + .joined(separator: "\n") +} + struct ASRDebugClipItem: Identifiable, Equatable { let id: UUID let clip: DebugAudioClip @@ -401,6 +412,13 @@ final class ASRDebugViewModel: ObservableObject { errorText: nil ) results.insert(result, at: 0) + if case .whisper = option.selection { + await appendWhisperRealtimeReplayResult( + option: option, + clipItem: clipItem, + source: source + ) + } updateClipTitleIfNeeded(clipID: clipItem.id, transcript: output) statusMessage = AppLocalization.format("Completed %@", option.title) } catch { @@ -453,6 +471,54 @@ final class ASRDebugViewModel: ObservableObject { } } + private func appendWhisperRealtimeReplayResult( + option: ASRDebugModelOption, + clipItem: ASRDebugClipItem, + source: ASRDebugResult.Source + ) async { + guard case .whisper = option.selection else { return } + let startedAt = Date() + do { + let events = try await whisperTranscriber.debugReplayRealtimeAudioFile(clipItem.clip.fileURL) + let replayText = formatWhisperRealtimeReplay(events) + let elapsed = Date().timeIntervalSince(startedAt) + results.insert( + ASRDebugResult( + id: UUID(), + clipID: clipItem.id, + clipTitle: clipItem.displayTitle, + modelTitle: "\(option.title) · Realtime Replay", + source: source, + audioDurationText: String(format: "%.1fs", clipItem.clip.durationSeconds), + runtimeText: String(format: "%.2fs", elapsed), + characterCount: replayText.count, + createdAt: Date(), + outputText: replayText, + errorText: nil + ), + at: 0 + ) + } catch { + let elapsed = Date().timeIntervalSince(startedAt) + results.insert( + ASRDebugResult( + id: UUID(), + clipID: clipItem.id, + clipTitle: clipItem.displayTitle, + modelTitle: "\(option.title) · Realtime Replay", + source: source, + audioDurationText: String(format: "%.1fs", clipItem.clip.durationSeconds), + runtimeText: String(format: "%.2fs", elapsed), + characterCount: 0, + createdAt: Date(), + outputText: "", + errorText: error.localizedDescription + ), + at: 0 + ) + } + } + private func updateClipTitleIfNeeded(clipID: UUID, transcript: String) { guard let index = clips.firstIndex(where: { $0.id == clipID }), let preview = modelDebugClipTitlePreview(transcript) diff --git 
a/Voxt/en.lproj/Localizable.strings b/Voxt/en.lproj/Localizable.strings index 8cbe48c..3fd7f4b 100644 --- a/Voxt/en.lproj/Localizable.strings +++ b/Voxt/en.lproj/Localizable.strings @@ -793,6 +793,9 @@ "Applies only to rewrite. When disabled, the answer card appears only if no writable input is focused. When enabled, rewrite always shows the answer card." = "Applies only to rewrite. When disabled, the answer card appears only if no writable input is focused. When enabled, rewrite always shows the answer card."; "Whisper" = "Whisper"; "Realtime" = "Realtime"; +"Live Realtime (Experimental)" = "Live Realtime (Experimental)"; +"These settings apply to Whisper transcription sessions. Live Realtime (Experimental) streams partial text while you speak and does a final correction after stop. Turn it off to use the quality-first non-live path. Whisper translate is only used when Whisper translation is selected." = "These settings apply to Whisper transcription sessions. Live Realtime (Experimental) streams partial text while you speak and does a final correction after stop. Turn it off to use the quality-first non-live path. Whisper translate is only used when Whisper translation is selected."; +"Meeting Notes currently supports MLX Audio and Remote ASR. Direct Dictation is not available for meetings." = "Meeting Notes currently supports MLX Audio and Remote ASR. Direct Dictation is not available for meetings."; "Realtime On" = "Realtime On"; "Quality Mode" = "Quality Mode"; "Whisper translation works only when the ASR engine is Whisper. Voxt will fall back to your saved LLM provider." = "Whisper translation works only when the ASR engine is Whisper. Voxt will fall back to your saved LLM provider."; diff --git a/Voxt/ja.lproj/Localizable.strings b/Voxt/ja.lproj/Localizable.strings index 9d12987..9561720 100644 --- a/Voxt/ja.lproj/Localizable.strings +++ b/Voxt/ja.lproj/Localizable.strings @@ -792,6 +792,9 @@ "Applies only to rewrite. When disabled, the answer card appears only if no writable input is focused. When enabled, rewrite always shows the answer card." = "書き換えにのみ適用されます。無効時は、書き込み可能な入力欄にフォーカスがない場合のみ回答カードを表示します。有効時は、フォーカス状態に関係なく常に回答カードを表示します。"; "Whisper" = "Whisper"; "Realtime" = "リアルタイム"; +"Live Realtime (Experimental)" = "ライブリアルタイム(実験的)"; +"These settings apply to Whisper transcription sessions. Live Realtime (Experimental) streams partial text while you speak and does a final correction after stop. Turn it off to use the quality-first non-live path. Whisper translate is only used when Whisper translation is selected." = "これらの設定は Whisper の文字起こしセッションに適用されます。ライブリアルタイム(実験的)では話している間に partial テキストを継続表示し、停止後に最終補正を行います。オフにすると品質優先の非リアルタイム経路を使います。Whisper translate は Whisper 翻訳を選択した場合にのみ使われます。"; +"Meeting Notes currently supports MLX Audio and Remote ASR. Direct Dictation is not available for meetings." = "Meeting Notes は現在 MLX Audio と Remote ASR のみをサポートしており、Direct Dictation は会議では利用できません。"; "Realtime On" = "リアルタイム"; "Quality Mode" = "高品質モード"; "Whisper translation works only when the ASR engine is Whisper. Voxt will fall back to your saved LLM provider." = "Whisper 翻訳は ASR エンジンが Whisper のときのみ使えます。それ以外では保存済みの LLM プロバイダにフォールバックします。"; diff --git a/Voxt/zh-Hans.lproj/Localizable.strings b/Voxt/zh-Hans.lproj/Localizable.strings index 8274aff..fa86855 100644 --- a/Voxt/zh-Hans.lproj/Localizable.strings +++ b/Voxt/zh-Hans.lproj/Localizable.strings @@ -793,6 +793,9 @@ "Applies only to rewrite. When disabled, the answer card appears only if no writable input is focused. 
When enabled, rewrite always shows the answer card." = "仅适用于转写。关闭时,只有在未聚焦可写输入框时才显示答案卡片;开启后,转写无论是否聚焦输入框都会显示答案卡片。"; "Whisper" = "Whisper"; "Realtime" = "实时"; +"Live Realtime (Experimental)" = "实时转写(实验性)"; +"These settings apply to Whisper transcription sessions. Live Realtime (Experimental) streams partial text while you speak and does a final correction after stop. Turn it off to use the quality-first non-live path. Whisper translate is only used when Whisper translation is selected." = "这些设置适用于 Whisper 转录会话。实时转写(实验性)会在你说话时持续输出 partial 文本,并在停止后再做一次最终修正。关闭后会回到质量优先的非实时路径。只有在选择了 Whisper 翻译时,才会使用 Whisper translate。"; +"Meeting Notes currently supports MLX Audio and Remote ASR. Direct Dictation is not available for meetings." = "会议记录当前只支持 MLX Audio 和 Remote ASR,不支持 Direct Dictation。"; "Realtime On" = "实时模式"; "Quality Mode" = "质量模式"; "Whisper translation works only when the ASR engine is Whisper. Voxt will fall back to your saved LLM provider." = "Whisper 翻译仅在 ASR 引擎为 Whisper 时可用。否则 Voxt 会回退到你保存的 LLM 提供者。"; diff --git a/VoxtTests/ConfigurationTransferManagerTests.swift b/VoxtTests/ConfigurationTransferManagerTests.swift index 707448a..55759ac 100644 --- a/VoxtTests/ConfigurationTransferManagerTests.swift +++ b/VoxtTests/ConfigurationTransferManagerTests.swift @@ -408,7 +408,7 @@ final class ConfigurationTransferManagerTests: XCTestCase { XCTAssertEqual(decoded.whisperTemperature, 0.0, accuracy: 0.0001) XCTAssertTrue(decoded.whisperVADEnabled) XCTAssertFalse(decoded.whisperTimestampsEnabled) - XCTAssertTrue(decoded.whisperRealtimeEnabled) + XCTAssertFalse(decoded.whisperRealtimeEnabled) XCTAssertEqual( WhisperLocalTuningSettingsStore.resolvedSettings(from: decoded.whisperLocalASRTuningSettings), WhisperLocalTuningSettings.defaults(for: .balanced) diff --git a/VoxtTests/FeatureModelCatalogBuilderTests.swift b/VoxtTests/FeatureModelCatalogBuilderTests.swift index c826ca3..258e8e4 100644 --- a/VoxtTests/FeatureModelCatalogBuilderTests.swift +++ b/VoxtTests/FeatureModelCatalogBuilderTests.swift @@ -98,6 +98,23 @@ final class FeatureModelCatalogBuilderTests: XCTestCase { ) } + func testMeetingASRSelectorExcludesWhisperEntries() { + let builder = makeBuilder( + featureSettings: makeFeatureSettings(meetingASR: .mlx(MLXModelManager.defaultModelRepo)) + ) + + let meetingEntries = builder.entries(for: .meetingASR) + + XCTAssertFalse( + meetingEntries.contains { entry in + if case .whisper = entry.selectionID.asrSelection { + return true + } + return false + } + ) + } + func testUnconfiguredRemoteLLMEntryRemainsNotConfiguredInSelector() throws { let builder = makeBuilder( featureSettings: makeFeatureSettings(translationModel: .remoteLLM(.openAI)) diff --git a/VoxtTests/FeatureSettingsStoreTests.swift b/VoxtTests/FeatureSettingsStoreTests.swift index 343902b..e030739 100644 --- a/VoxtTests/FeatureSettingsStoreTests.swift +++ b/VoxtTests/FeatureSettingsStoreTests.swift @@ -145,6 +145,66 @@ final class FeatureSettingsStoreTests: XCTestCase { XCTAssertTrue(defaults.bool(forKey: AppPreferenceKey.meetingRealtimeTranslateEnabled)) } + func testPrepareLegacyMeetingFallsBackFromWhisperToMLX() { + let defaults = TestDoubles.makeUserDefaults() + defaults.set("mlx-community/Voxtral-Mini-4B-Realtime-2602-4bit", forKey: AppPreferenceKey.mlxModelRepo) + let settings = FeatureSettings( + transcription: .init( + asrSelectionID: .whisper("large-v3"), + llmEnabled: false, + llmSelectionID: .localLLM(CustomLLMModelManager.defaultModelRepo), + prompt: AppPreferenceKey.defaultEnhancementPrompt + ), + translation: .init( + 
asrSelectionID: .whisper("large-v3"), + modelSelectionID: .localLLM(CustomLLMModelManager.defaultModelRepo), + targetLanguageRawValue: TranslationTargetLanguage.english.rawValue, + prompt: AppPreferenceKey.defaultTranslationPrompt, + replaceSelectedText: true + ), + rewrite: .init( + asrSelectionID: .whisper("large-v3"), + llmSelectionID: .localLLM(CustomLLMModelManager.defaultModelRepo), + prompt: AppPreferenceKey.defaultRewritePrompt, + appEnhancementEnabled: false + ), + meeting: .init( + enabled: true, + asrSelectionID: .whisper("large-v3"), + summaryModelSelectionID: .localLLM(CustomLLMModelManager.defaultModelRepo), + summaryPrompt: AppPreferenceKey.defaultMeetingSummaryPrompt, + summaryAutoGenerate: true, + realtimeTranslateEnabled: false, + realtimeTargetLanguageRawValue: "", + showOverlayInScreenShare: false + ) + ) + + FeatureSettingsStore.prepareLegacyMeeting(from: settings, defaults: defaults) + + XCTAssertEqual(defaults.string(forKey: AppPreferenceKey.transcriptionEngine), TranscriptionEngine.mlxAudio.rawValue) + XCTAssertEqual( + defaults.string(forKey: AppPreferenceKey.mlxModelRepo), + "mlx-community/Voxtral-Mini-4B-Realtime-2602-4bit" + ) + } + + func testLoadFallsBackMeetingWhisperSelectionToMLX() { + let defaults = TestDoubles.makeUserDefaults() + defaults.set("mlx-community/Voxtral-Mini-4B-Realtime-2602-4bit", forKey: AppPreferenceKey.mlxModelRepo) + defaults.set( + """ + {"transcription":{"asrSelectionID":"whisper:large-v3","llmEnabled":false,"llmSelectionID":"local-llm:\(CustomLLMModelManager.defaultModelRepo)","prompt":"prompt","notes":{"enabled":false,"triggerShortcut":{"keyCode":49,"modifiersRawValue":0,"sidedModifiersRawValue":0},"titleModelSelectionID":"local-llm:\(CustomLLMModelManager.defaultModelRepo)","soundEnabled":false,"soundPreset":"soft"}},"translation":{"asrSelectionID":"whisper:large-v3","modelSelectionID":"local-llm:\(CustomLLMModelManager.defaultModelRepo)","targetLanguageRawValue":"english","prompt":"translation","replaceSelectedText":true,"showResultWindow":true},"rewrite":{"asrSelectionID":"whisper:large-v3","llmSelectionID":"local-llm:\(CustomLLMModelManager.defaultModelRepo)","prompt":"rewrite","appEnhancementEnabled":false,"continueShortcut":{"keyCode":49,"modifiersRawValue":0,"sidedModifiersRawValue":0}},"meeting":{"enabled":true,"asrSelectionID":"whisper:large-v3","summaryModelSelectionID":"local-llm:\(CustomLLMModelManager.defaultModelRepo)","summaryPrompt":"meeting","summaryAutoGenerate":true,"realtimeTranslateEnabled":false,"realtimeTargetLanguageRawValue":"","showOverlayInScreenShare":false}} + """, + forKey: AppPreferenceKey.featureSettings + ) + + let settings = FeatureSettingsStore.load(defaults: defaults) + + XCTAssertEqual(settings.meeting.asrSelectionID, .mlx("mlx-community/Voxtral-Mini-4B-Realtime-2602-4bit")) + XCTAssertEqual(settings.transcription.asrSelectionID, .whisper("large-v3")) + } + func testPrepareLegacyMeetingMirrorsDisabledMeetingState() { let defaults = TestDoubles.makeUserDefaults() let settings = FeatureSettings( diff --git a/VoxtTests/MeetingASRSupportTests.swift b/VoxtTests/MeetingASRSupportTests.swift index f8e7034..54db609 100644 --- a/VoxtTests/MeetingASRSupportTests.swift +++ b/VoxtTests/MeetingASRSupportTests.swift @@ -31,7 +31,7 @@ final class MeetingASRSupportTests: XCTestCase { } } - func testWhisperRealtimeUsesRealtimeProfile() { + func testWhisperMeetingFallsBackToMLXContext() { let context = MeetingASRSupport.resolveContext( transcriptionEngine: .whisperKit, whisperModelState: .ready, @@ -39,17 +39,21 @@ 
final class MeetingASRSupportTests: XCTestCase { whisperRealtimeEnabled: true, whisperIsCurrentModelLoaded: true, whisperDisplayTitle: { _ in "Whisper Base" }, - mlxModelState: .notDownloaded, - mlxCurrentModelRepo: MLXModelManager.defaultModelRepo, - mlxIsCurrentModelLoaded: false, - mlxDisplayTitle: { _ in "" }, + mlxModelState: .ready, + mlxCurrentModelRepo: "mlx-community/Voxtral-Mini-4B-Realtime-2602-fp16", + mlxIsCurrentModelLoaded: true, + mlxDisplayTitle: { _ in "Voxtral Realtime Mini 4B" }, remoteProvider: .openAIWhisper, remoteConfiguration: .init(providerID: RemoteASRProvider.openAIWhisper.rawValue, model: "whisper-1", endpoint: "", apiKey: "") ) - XCTAssertEqual(context.engine, .whisperKit) + XCTAssertEqual(context.engine, .mlxAudio) XCTAssertEqual(context.chunkingProfile, .realtime) XCTAssertFalse(context.needsModelInitialization) + XCTAssertEqual( + context.historyModelDescription, + "Voxtral Realtime Mini 4B (mlx-community/Voxtral-Mini-4B-Realtime-2602-fp16)" + ) } func testMLXRealtimeModelUsesRealtimeProfile() { diff --git a/VoxtTests/MeetingStartPlannerTests.swift b/VoxtTests/MeetingStartPlannerTests.swift index 8b280b0..5dc5380 100644 --- a/VoxtTests/MeetingStartPlannerTests.swift +++ b/VoxtTests/MeetingStartPlannerTests.swift @@ -3,7 +3,19 @@ import XCTest @MainActor final class MeetingStartPlannerTests: XCTestCase { - func testWhisperMeetingFollowsRecordingPlanner() { + func testWhisperMeetingFallsBackToMLXWhenAvailable() { + let decision = MeetingStartPlanner.resolve( + selectedEngine: .whisperKit, + mlxModelState: .ready, + whisperModelState: .ready, + remoteASRProvider: .openAIWhisper, + remoteASRConfiguration: .init(providerID: RemoteASRProvider.openAIWhisper.rawValue, model: "", endpoint: "", apiKey: "") + ) + + XCTAssertEqual(decision, .start(.mlxAudio)) + } + + func testWhisperMeetingUsesMLXAvailabilityRules() { let decision = MeetingStartPlanner.resolve( selectedEngine: .whisperKit, mlxModelState: .notDownloaded, @@ -12,7 +24,7 @@ final class MeetingStartPlannerTests: XCTestCase { remoteASRConfiguration: .init(providerID: RemoteASRProvider.openAIWhisper.rawValue, model: "", endpoint: "", apiKey: "") ) - XCTAssertEqual(decision, .start(.whisperKit)) + XCTAssertEqual(decision, .blocked(.recording(.mlxModelNotInstalled))) } func testMLXMeetingFollowsRecordingPlanner() { diff --git a/VoxtTests/RecordingSessionSupportTests.swift b/VoxtTests/RecordingSessionSupportTests.swift index 35c6c85..773d106 100644 --- a/VoxtTests/RecordingSessionSupportTests.swift +++ b/VoxtTests/RecordingSessionSupportTests.swift @@ -19,6 +19,13 @@ final class RecordingSessionSupportTests: XCTestCase { } func testStopRecordingFallbackTimeoutUsesProviderSpecificRemoteBudget() { + XCTAssertEqual( + RecordingSessionSupport.stopRecordingFallbackTimeoutSeconds( + transcriptionEngine: .whisperKit, + remoteProvider: .openAIWhisper + ), + 20 + ) XCTAssertEqual( RecordingSessionSupport.stopRecordingFallbackTimeoutSeconds( transcriptionEngine: .remote, diff --git a/VoxtTests/RemoteModelConfigurationTests.swift b/VoxtTests/RemoteModelConfigurationTests.swift index 9ba3eb7..9a98a22 100644 --- a/VoxtTests/RemoteModelConfigurationTests.swift +++ b/VoxtTests/RemoteModelConfigurationTests.swift @@ -142,6 +142,71 @@ final class RemoteModelConfigurationTests: XCTestCase { ) } + func testAliyunASRModelOptionsIncludeOmniRealtimeModels() { + let ids = Set(RemoteASRProvider.aliyunBailianASR.modelOptions.map(\.id)) + XCTAssertTrue(ids.contains("qwen3.5-omni-flash-realtime")) + 
XCTAssertTrue(ids.contains("qwen3.5-omni-plus-realtime")) + XCTAssertTrue(ids.contains("qwen-omni-turbo-realtime")) + } + + func testAliyunRealtimeModelFamilyDetectionSeparatesQwenAndOmni() { + XCTAssertEqual( + RemoteASREndpointSupport.aliyunQwenRealtimeSessionKind(for: "qwen3-asr-flash-realtime"), + .qwenASR + ) + XCTAssertEqual( + RemoteASREndpointSupport.aliyunQwenRealtimeSessionKind(for: "qwen3.5-omni-flash-realtime"), + .omniASR + ) + XCTAssertEqual( + RemoteASREndpointSupport.aliyunQwenRealtimeSessionKind(for: "qwen3.5-omni-plus-realtime"), + .omniASR + ) + XCTAssertEqual( + RemoteASREndpointSupport.aliyunQwenRealtimeSessionKind(for: "qwen-omni-turbo-realtime"), + .omniASR + ) + XCTAssertNil(RemoteASREndpointSupport.aliyunQwenRealtimeSessionKind(for: "fun-asr-realtime")) + } + + func testAliyunOmniSessionUpdatePayloadUsesExplicitInputTranscriptionModel() throws { + let payload = AliyunQwenRealtimePayloadSupport.sessionUpdatePayload( + kind: .omniASR, + hintPayload: ResolvedASRHintPayload(language: "zh", languageHints: ["zh"]) + ) + + let session = try XCTUnwrap(payload["session"] as? [String: Any]) + let transcription = try XCTUnwrap(session["input_audio_transcription"] as? [String: Any]) + let turnDetection = try XCTUnwrap(session["turn_detection"] as? [String: Any]) + + XCTAssertEqual(payload["type"] as? String, "session.update") + XCTAssertEqual(session["modalities"] as? [String], ["text"]) + XCTAssertEqual(session["input_audio_format"] as? String, "pcm") + XCTAssertEqual(session["sample_rate"] as? Int, 16000) + XCTAssertEqual(transcription["model"] as? String, "qwen3-asr-flash-realtime") + XCTAssertEqual(transcription["language"] as? String, "zh") + XCTAssertEqual(turnDetection["type"] as? String, "server_vad") + XCTAssertEqual(turnDetection["threshold"] as? Double, 0.0) + XCTAssertEqual(turnDetection["silence_duration_ms"] as? Int, 400) + } + + func testAliyunOmniRealtimeDoesNotRequireManualCommitWhenUsingServerVAD() { + XCTAssertFalse(AliyunQwenRealtimeSessionKind.omniASR.shouldCommitBeforeFinish) + } + + func testAliyunQwenSessionUpdatePayloadLeavesTranscriptionModelUnset() throws { + let payload = AliyunQwenRealtimePayloadSupport.sessionUpdatePayload( + kind: .qwenASR, + hintPayload: ResolvedASRHintPayload(language: nil, languageHints: []) + ) + + let session = try XCTUnwrap(payload["session"] as? [String: Any]) + let transcription = try XCTUnwrap(session["input_audio_transcription"] as? 
[String: Any]) + + XCTAssertNil(transcription["model"]) + XCTAssertNil(transcription["language"]) + } + func testLoadSaveRoundTripPreservesConfigurations() { let stored: [String: RemoteProviderConfiguration] = [ RemoteASRProvider.openAIWhisper.rawValue: TestFactories.makeRemoteConfiguration( diff --git a/VoxtTests/RemoteProviderConnectivityTestEndpointsTests.swift b/VoxtTests/RemoteProviderConnectivityTestEndpointsTests.swift index d21db4e..8dd5b4c 100644 --- a/VoxtTests/RemoteProviderConnectivityTestEndpointsTests.swift +++ b/VoxtTests/RemoteProviderConnectivityTestEndpointsTests.swift @@ -41,4 +41,14 @@ final class RemoteProviderConnectivityTestEndpointsTests: XCTestCase { "wss://dashscope.aliyuncs.com/api-ws/v1/realtime?model=qwen3-asr-flash-realtime" ) } + + func testResolvedAliyunOmniRealtimeEndpointAddsMissingModelQuery() { + XCTAssertEqual( + RemoteProviderConnectivityTestEndpoints.resolvedAliyunASRQwenRealtimeWebSocketEndpoint( + endpoint: "wss://dashscope.aliyuncs.com/api-ws/v1/realtime", + model: "qwen3.5-omni-flash-realtime" + ), + "wss://dashscope.aliyuncs.com/api-ws/v1/realtime?model=qwen3.5-omni-flash-realtime" + ) + } } diff --git a/VoxtTests/SessionEndFlowTests.swift b/VoxtTests/SessionEndFlowTests.swift index 3487c1c..dc58786 100644 --- a/VoxtTests/SessionEndFlowTests.swift +++ b/VoxtTests/SessionEndFlowTests.swift @@ -2,6 +2,43 @@ import XCTest @testable import Voxt final class SessionEndFlowTests: XCTestCase { + func testSessionCallbackHandlingDecisionAcceptsActiveNonCancelledSession() { + let sessionID = UUID() + + XCTAssertEqual( + AppDelegate.sessionCallbackHandlingDecision( + requestedSessionID: sessionID, + activeSessionID: sessionID, + isSessionCancellationRequested: false + ), + .accept + ) + } + + func testSessionCallbackHandlingDecisionRejectsStaleSession() { + XCTAssertEqual( + AppDelegate.sessionCallbackHandlingDecision( + requestedSessionID: UUID(), + activeSessionID: UUID(), + isSessionCancellationRequested: false + ), + .rejectStale + ) + } + + func testSessionCallbackHandlingDecisionRejectsCancelledSession() { + let sessionID = UUID() + + XCTAssertEqual( + AppDelegate.sessionCallbackHandlingDecision( + requestedSessionID: sessionID, + activeSessionID: sessionID, + isSessionCancellationRequested: true + ), + .rejectCancelled + ) + } + func testSessionEndExecutionDecisionAllowsFreshSession() { let sessionID = UUID() @@ -40,4 +77,61 @@ final class SessionEndFlowTests: XCTestCase { .skipAlreadyCompleted ) } + + func testStopRecordingFallbackDecisionExtendsGraceForWhisperFinalizationWithoutResult() { + XCTAssertEqual( + AppDelegate.stopRecordingFallbackDecision( + transcriptionEngine: .whisperKit, + isWhisperFinalizing: true, + transcriptionResultReceived: false, + isExtendedGrace: false + ), + .extendGrace(seconds: 12) + ) + } + + func testStopRecordingFallbackDecisionFinishesWhenWhisperAlreadyProducedResult() { + XCTAssertEqual( + AppDelegate.stopRecordingFallbackDecision( + transcriptionEngine: .whisperKit, + isWhisperFinalizing: true, + transcriptionResultReceived: true, + isExtendedGrace: false + ), + .finishNow + ) + } + + func testStopRecordingFallbackDecisionFinishesForExtendedGraceTimeout() { + XCTAssertEqual( + AppDelegate.stopRecordingFallbackDecision( + transcriptionEngine: .whisperKit, + isWhisperFinalizing: true, + transcriptionResultReceived: false, + isExtendedGrace: true + ), + .finishNow + ) + } + + func testStopRecordingFallbackDecisionFinishesForNonWhisperEngines() { + XCTAssertEqual( + AppDelegate.stopRecordingFallbackDecision( + 
transcriptionEngine: .mlxAudio, + isWhisperFinalizing: true, + transcriptionResultReceived: false, + isExtendedGrace: false + ), + .finishNow + ) + XCTAssertEqual( + AppDelegate.stopRecordingFallbackDecision( + transcriptionEngine: .remote, + isWhisperFinalizing: true, + transcriptionResultReceived: false, + isExtendedGrace: false + ), + .finishNow + ) + } } diff --git a/VoxtTests/WhisperLongFormReplayIntegrationTests.swift b/VoxtTests/WhisperLongFormReplayIntegrationTests.swift new file mode 100644 index 0000000..59f3e65 --- /dev/null +++ b/VoxtTests/WhisperLongFormReplayIntegrationTests.swift @@ -0,0 +1,205 @@ +import XCTest +@testable import Voxt + +@MainActor +final class WhisperLongFormReplayIntegrationTests: XCTestCase { + private func resolvedCandidateClipPaths() -> [String] { + let overridePathFile = "/tmp/voxt-longform-replay-clip-path.txt" + let overridePath = try? String(contentsOfFile: overridePathFile, encoding: .utf8) + .trimmingCharacters(in: .whitespacesAndNewlines) + let candidates = [ + ProcessInfo.processInfo.environment["VOXT_LONGFORM_REPLAY_CLIP"], + overridePath, + "/Users/guanwei/Library/Application Support/Voxt/transcription-history-audio/transcription/transcription-FD3C99FC-822F-45DB-8734-FFADEF6DC6EE.wav", + "/Users/guanwei/Library/Application Support/Voxt/transcription-history-audio/transcription/transcription-6247D986-B2EC-4758-AB40-7C1030296D7A.wav", + "/Users/guanwei/Downloads/transcription/20260505-104918-transcription-5F2FAD9F-D22E-4D0E-BA36-E1D95A53197D.wav", + "/Users/guanwei/Downloads/transcription/transcription-0A0E87B1-7C9A-4BB6-8469-E18485A63103.wav", + "/Users/guanwei/Downloads/transcription/20260507-123725-transcription-CF5D4F69-31F4-4F86-ADCA-18029BDB0EE8.wav" + ] + .compactMap { $0?.trimmingCharacters(in: .whitespacesAndNewlines) } + .filter { !$0.isEmpty && FileManager.default.fileExists(atPath: $0) } + + var seen = Set<String>() + return candidates.filter { seen.insert($0).inserted } + } + + private func knownGoodBaselineClipPaths() -> [String] { + let preferred = [ + ProcessInfo.processInfo.environment["VOXT_LONGFORM_REPLAY_CLIP"], + try? String(contentsOfFile: "/tmp/voxt-longform-replay-clip-path.txt", encoding: .utf8) + .trimmingCharacters(in: .whitespacesAndNewlines), + "/Users/guanwei/Library/Application Support/Voxt/transcription-history-audio/transcription/transcription-6247D986-B2EC-4758-AB40-7C1030296D7A.wav", + "/Users/guanwei/Library/Application Support/Voxt/transcription-history-audio/transcription/transcription-FD3C99FC-822F-45DB-8734-FFADEF6DC6EE.wav", + "/Users/guanwei/Downloads/transcription/transcription-0A0E87B1-7C9A-4BB6-8469-E18485A63103.wav", + "/Users/guanwei/Downloads/transcription/20260507-123725-transcription-CF5D4F69-31F4-4F86-ADCA-18029BDB0EE8.wav" + ] + .compactMap { $0?.trimmingCharacters(in: .whitespacesAndNewlines) } + .filter { !$0.isEmpty && FileManager.default.fileExists(atPath: $0) } + + var seen = Set<String>() + return preferred.filter { seen.insert($0).inserted } + } + + private func resolvedModelIDAndHubURL() throws -> (modelID: String, hubURL: URL) { + let defaults = UserDefaults.standard + defaults.set("/Users/guanwei/x/models", forKey: AppPreferenceKey.modelStorageRootPath) + defaults.removeObject(forKey: AppPreferenceKey.modelStorageRootBookmark) + let hubURL = defaults.bool(forKey: AppPreferenceKey.useHfMirror) + ? MLXModelManager.mirrorHubBaseURL + : MLXModelManager.defaultHubBaseURL + let preferredModelID = defaults.string(forKey: AppPreferenceKey.whisperModelID) ?? 
WhisperKitModelManager.defaultModelID + let candidateModelIDs = [preferredModelID, "large-v3", "small", "base"] + WhisperKitModelManager.availableModels.map(\.id) + let probeManager = WhisperKitModelManager(modelID: preferredModelID, hubBaseURL: hubURL) + guard let chosenModelID = candidateModelIDs + .map(WhisperKitModelManager.canonicalModelID(_:)) + .first(where: { probeManager.isModelDownloaded(id: $0) }) else { + throw XCTSkip("No downloaded Whisper model is available for long-form replay.") + } + return (chosenModelID, hubURL) + } + + func testOfflineTranscriptionConfirmsModelSupportsProvidedLongFormClip() async throws { + let existingClipPaths = resolvedCandidateClipPaths() + guard let clipPath = existingClipPaths.first else { + throw XCTSkip("No long-form replay clip is available.") + } + let clipURL = URL(fileURLWithPath: clipPath) + let resolved = try resolvedModelIDAndHubURL() + let transcriber = WhisperKitTranscriber( + modelManager: WhisperKitModelManager(modelID: resolved.modelID, hubBaseURL: resolved.hubURL) + ) + + let text = try await transcriber.transcribeAudioFile(clipURL) + XCTAssertFalse(text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty) + XCTAssertGreaterThan(text.trimmingCharacters(in: .whitespacesAndNewlines).count, 20) + } + + func testReplayProvidedLongFormClipProducesFinalTranscript() async throws { + let existingClipPaths = resolvedCandidateClipPaths() + guard let clipPath = existingClipPaths.first else { + throw XCTSkip("No long-form replay clip is available.") + } + let clipURL = URL(fileURLWithPath: clipPath) + let longFormReplayStepSeconds = 4.0 + + let resolved = try resolvedModelIDAndHubURL() + let transcriber = WhisperKitTranscriber( + modelManager: WhisperKitModelManager(modelID: resolved.modelID, hubBaseURL: resolved.hubURL) + ) + let offlineText = try await transcriber.transcribeAudioFile(clipURL) + let offlineCount = offlineText.trimmingCharacters(in: .whitespacesAndNewlines).count + + let diagnostics = try await transcriber.debugReplayRealtimeAudioFileWithTrace( + clipURL, + stepSeconds: longFormReplayStepSeconds + ) + let finalEvent = try XCTUnwrap(diagnostics.events.last(where: { $0.isFinal })) + + XCTAssertFalse(offlineText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty) + XCTAssertFalse(finalEvent.text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty) + XCTAssertTrue(diagnostics.events.contains(where: { $0.isFinal })) + + let finalCount = finalEvent.text.trimmingCharacters(in: .whitespacesAndNewlines).count + if finalEvent.elapsedSeconds >= 60, offlineCount >= 60 { + XCTAssertGreaterThanOrEqual( + finalCount, + Int(Double(offlineCount) * 0.6), + "Realtime long-form final transcript collapsed too far below the offline baseline." + ) + } + + let strongestLiveTextCount = diagnostics.events + .filter { !$0.isFinal } + .map { $0.text.trimmingCharacters(in: .whitespacesAndNewlines).count } + .max() ?? 0 + XCTAssertGreaterThanOrEqual( + finalCount, + min(strongestLiveTextCount, 2), + "Final transcript should not collapse below the strongest live hypothesis baseline." + ) + + let clipDurationSeconds = finalEvent.elapsedSeconds + if clipDurationSeconds >= WhisperKitTranscriber.realtimeLongFormFinalProfileThresholdSeconds { + XCTAssertTrue( + diagnostics.trace.contains(where: { $0.contains("final/offline") }), + "Long-form replay should use the same offline-biased final profile as runtime stop finalization." 
+ ) + } + if clipDurationSeconds >= 30 { + let latestLiveEventTime = diagnostics.events + .filter { !$0.isFinal } + .map(\.elapsedSeconds) + .max() ?? 0 + XCTAssertGreaterThanOrEqual( + latestLiveEventTime, + clipDurationSeconds * 0.8, + "Long-form realtime replay should keep publishing updates into the later portion of the clip." + ) + } + } + + func testReplayAllAvailableLongFormClipsProduceNonEmptyFinalTranscript() async throws { + let candidateClipPaths = resolvedCandidateClipPaths() + guard !candidateClipPaths.isEmpty else { + throw XCTSkip("No available long-form clips found.") + } + let resolved = try resolvedModelIDAndHubURL() + let transcriber = WhisperKitTranscriber( + modelManager: WhisperKitModelManager(modelID: resolved.modelID, hubBaseURL: resolved.hubURL) + ) + for clipPath in candidateClipPaths { + let diagnostics = try await transcriber.debugReplayRealtimeAudioFileWithTrace( + URL(fileURLWithPath: clipPath), + stepSeconds: 4.0 + ) + let finalEvent = try XCTUnwrap(diagnostics.events.last(where: { $0.isFinal })) + XCTAssertFalse( + finalEvent.text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty, + "Expected non-empty final transcript for \(clipPath)" + ) + } + } + + func testReplayAllAvailableLongFormClipsTrackOfflineBaseline() async throws { + let candidateClipPaths = knownGoodBaselineClipPaths() + guard !candidateClipPaths.isEmpty else { + throw XCTSkip("No known-good long-form baseline clips found.") + } + + let resolved = try resolvedModelIDAndHubURL() + let transcriber = WhisperKitTranscriber( + modelManager: WhisperKitModelManager(modelID: resolved.modelID, hubBaseURL: resolved.hubURL) + ) + + for clipPath in candidateClipPaths { + let clipURL = URL(fileURLWithPath: clipPath) + let offlineText = try await transcriber.transcribeAudioFile(clipURL) + let offlineCount = offlineText.trimmingCharacters(in: .whitespacesAndNewlines).count + + let diagnostics = try await transcriber.debugReplayRealtimeAudioFileWithTrace( + clipURL, + stepSeconds: 4.0 + ) + let finalEvent = try XCTUnwrap(diagnostics.events.last(where: { $0.isFinal })) + let finalCount = finalEvent.text.trimmingCharacters(in: .whitespacesAndNewlines).count + + XCTAssertFalse( + finalEvent.text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty, + "Expected non-empty final transcript for \(clipPath)" + ) + XCTAssertGreaterThan(offlineCount, 20, "Offline baseline should be meaningfully non-empty for \(clipPath)") + XCTAssertGreaterThanOrEqual( + finalCount, + Int(Double(offlineCount) * 0.6), + "Realtime final transcript collapsed too far below the offline baseline for \(clipPath)" + ) + + if finalEvent.elapsedSeconds >= WhisperKitTranscriber.realtimeLongFormFinalProfileThresholdSeconds { + XCTAssertTrue( + diagnostics.trace.contains(where: { $0.contains("final/offline") }), + "Long-form replay should use the offline final profile for \(clipPath)" + ) + } + } + } +} diff --git a/VoxtTests/WhisperRealtimeEagerStateTests.swift b/VoxtTests/WhisperRealtimeEagerStateTests.swift new file mode 100644 index 0000000..a09bad3 --- /dev/null +++ b/VoxtTests/WhisperRealtimeEagerStateTests.swift @@ -0,0 +1,126 @@ +import Testing +import WhisperKit +@testable import Voxt + +struct WhisperRealtimeEagerStateTests { + @Test + func firstPassPublishesImmediateHypothesis() { + var state = WhisperRealtimeEagerState() + + let published = state.apply(hypothesisText: "hello world") + + #expect(published == "hello world") + #expect(state.publishedText == "hello world") + } + + @Test + func 
overlapExtensionKeepsDroppedPrefixVisible() { + var state = WhisperRealtimeEagerState() + + _ = state.apply(hypothesisText: "你好这是一个最小的") + let published = state.apply(hypothesisText: "这是一个最小的回归") + + #expect(published == "你好这是一个最小的回归") + #expect(state.publishedText == "你好这是一个最小的回归") + } + + @Test + func punctuationCandidateReplacesCleanlyWithoutPrefixCorruption() { + var state = WhisperRealtimeEagerState() + + _ = state.apply(hypothesisText: "你好这是一个最小的回归测试测试已经") + let published = state.apply(hypothesisText: "你好,这是一个最小的回归测试测试已经通过") + + #expect(published == "你好,这是一个最小的回归测试测试已经通过") + #expect(state.publishedText == "你好,这是一个最小的回归测试测试已经通过") + } + + @Test + func shortNewUtteranceAfterSealDoesNotAppendImmediately() { + var state = WhisperRealtimeEagerState() + + _ = state.apply(hypothesisText: "你好") + state.sealCurrentPublishedTextForNextUtterance() + + let shortCandidate = state.apply(hypothesisText: "美国") + let longerCandidate = state.apply(hypothesisText: "谢谢大家") + + #expect(shortCandidate == nil) + #expect(longerCandidate == "你好谢谢大家") + #expect(state.publishedText == "你好谢谢大家") + } + + @Test + func sealingCurrentPublishedTextStartsANewUtteranceWithoutDuplicatingBoundary() { + var state = WhisperRealtimeEagerState() + + _ = state.apply(hypothesisText: "你好这是一个最小的回归测试") + state.sealCurrentPublishedTextForNextUtterance() + let published = state.apply(hypothesisText: "回归测试相比上一版") + + #expect(published == "你好这是一个最小的回归测试相比上一版") + #expect(state.publishedText == "你好这是一个最小的回归测试相比上一版") + } + + @Test + func finalOverridesRealtimeState() { + var state = WhisperRealtimeEagerState() + + _ = state.apply(hypothesisText: "draft text") + let final = state.applyFinal("final corrected text") + + #expect(final == "final corrected text") + #expect(state.publishedText == "final corrected text") + #expect(state.liveCandidateText.isEmpty) + } + + @Test + func shortRealtimeStopKeepsRealtimeFinalProfile() { + let useOffline = WhisperKitTranscriber.shouldUseOfflineFinalProfileForStop( + realtimeEnabled: true, + bufferedSeconds: 12 + ) + + #expect(useOffline == false) + } + + @Test + func longRealtimeStopPromotesToOfflineProfile() { + let useOffline = WhisperKitTranscriber.shouldUseOfflineFinalProfileForStop( + realtimeEnabled: true, + bufferedSeconds: 123 + ) + + #expect(useOffline) + } + + @Test + func realtimeFinalPreservesLongerLiveTailWhenModelFinalCollapsesToPrefix() { + let resolved = WhisperKitTranscriber.reconcileRealtimeFinalText( + finalText: "你好这是一个最小的回归测试", + latestPublishedText: "你好这是一个最小的回归测试相比上一版长时间已经继续输出有问题了" + ) + + #expect(resolved == "你好这是一个最小的回归测试相比上一版长时间已经继续输出有问题了") + } + + @Test + func realtimeFinalKeepsModelFinalWhenLiveTextDoesNotExtendItAsPrefix() { + let resolved = WhisperKitTranscriber.reconcileRealtimeFinalText( + finalText: "你好这是一个最小的回归测试已经通过", + latestPublishedText: "你好这是一个最小的回归测试相比上一版" + ) + + #expect(resolved == "你好这是一个最小的回归测试已经通过") + } + + @Test + func realtimeFinalDoesNotPreferSlightlyLongerLiveTail() { + let resolved = WhisperKitTranscriber.reconcileRealtimeFinalText( + finalText: "你好这是一个最小的回归测试", + latestPublishedText: "你好这是一个最小的回归测试呀" + ) + + #expect(resolved == "你好这是一个最小的回归测试") + } +} diff --git a/VoxtTests/WhisperRealtimeReplayIntegrationTests.swift b/VoxtTests/WhisperRealtimeReplayIntegrationTests.swift new file mode 100644 index 0000000..bc5b927 --- /dev/null +++ b/VoxtTests/WhisperRealtimeReplayIntegrationTests.swift @@ -0,0 +1,103 @@ +import XCTest +@testable import Voxt + +@MainActor +final class WhisperRealtimeReplayIntegrationTests: XCTestCase { + private func 
resolvedReplayClipPath() -> String { + let overridePathFile = "/tmp/voxt-realtime-replay-clip-path.txt" + let overridePath = try? String(contentsOfFile: overridePathFile, encoding: .utf8) + .trimmingCharacters(in: .whitespacesAndNewlines) + return ProcessInfo.processInfo.environment["VOXT_REALTIME_REPLAY_CLIP"] + ?? overridePath + ?? "/Users/guanwei/Downloads/transcription/20260507-123725-transcription-CF5D4F69-31F4-4F86-ADCA-18029BDB0EE8.wav" + } + + private func resolvedPauseAwareClipPath() throws -> String { + let candidates = [ + ProcessInfo.processInfo.environment["VOXT_REALTIME_PAUSE_REPLAY_CLIP"], + "/Users/guanwei/Downloads/transcription/transcription-0A0E87B1-7C9A-4BB6-8469-E18485A63103.wav", + "/Users/guanwei/Downloads/transcription/20260507-123725-transcription-CF5D4F69-31F4-4F86-ADCA-18029BDB0EE8.wav" + ] + .compactMap { $0?.trimmingCharacters(in: .whitespacesAndNewlines) } + .filter { !$0.isEmpty && FileManager.default.fileExists(atPath: $0) } + + guard let clipPath = candidates.first else { + throw XCTSkip("No pause-aware realtime replay clip is available.") + } + return clipPath + } + + private func resolvedModelManager() throws -> (modelID: String, manager: WhisperKitModelManager) { + let defaults = UserDefaults.standard + defaults.set("/Users/guanwei/x/models", forKey: AppPreferenceKey.modelStorageRootPath) + defaults.removeObject(forKey: AppPreferenceKey.modelStorageRootBookmark) + let hubURL = defaults.bool(forKey: AppPreferenceKey.useHfMirror) + ? MLXModelManager.mirrorHubBaseURL + : MLXModelManager.defaultHubBaseURL + let preferredModelID = defaults.string(forKey: AppPreferenceKey.whisperModelID) ?? WhisperKitModelManager.defaultModelID + let candidateModelIDs = [preferredModelID, "large-v3", "small", "base"] + WhisperKitModelManager.availableModels.map(\.id) + let probeManager = WhisperKitModelManager(modelID: preferredModelID, hubBaseURL: hubURL) + guard let chosenModelID = candidateModelIDs + .map(WhisperKitModelManager.canonicalModelID(_:)) + .first(where: { probeManager.isModelDownloaded(id: $0) }) else { + throw XCTSkip("No downloaded Whisper model is available for realtime replay.") + } + return (chosenModelID, WhisperKitModelManager(modelID: chosenModelID, hubBaseURL: hubURL)) + } + + func testReplayProvidedClipProducesLiveAndFinalEvents() async throws { + let clipPath = resolvedReplayClipPath() + let clipURL = URL(fileURLWithPath: clipPath) + try XCTSkipUnless( + FileManager.default.fileExists(atPath: clipURL.path), + "Replay clip is missing: \(clipURL.path)" + ) + + let resolved = try resolvedModelManager() + + let transcriber = WhisperKitTranscriber(modelManager: resolved.manager) + let diagnostics = try await transcriber.debugReplayRealtimeAudioFileWithTrace(clipURL) + let events = diagnostics.events + + XCTAssertFalse(events.isEmpty) + XCTAssertTrue(events.contains(where: { !$0.isFinal })) + XCTAssertTrue(events.contains(where: \.isFinal)) + } + + func testReplayPauseAwareClipKeepsPublishingIntoLaterPortion() async throws { + let clipPath = try resolvedPauseAwareClipPath() + let clipURL = URL(fileURLWithPath: clipPath) + let resolved = try resolvedModelManager() + let transcriber = WhisperKitTranscriber(modelManager: resolved.manager) + + let diagnostics = try await transcriber.debugReplayRealtimeAudioFileWithTrace(clipURL) + let events = diagnostics.events + let finalEvent = try XCTUnwrap(events.last(where: { $0.isFinal })) + let liveEvents = events.filter { !$0.isFinal } + let liveTexts = liveEvents + .map { $0.text.trimmingCharacters(in: 
.whitespacesAndNewlines) } + .filter { !$0.isEmpty } + + XCTAssertGreaterThanOrEqual(liveTexts.count, 3, "Expected several progressive live updates.") + + let latestLiveTime = liveEvents.map(\.elapsedSeconds).max() ?? 0 + XCTAssertGreaterThanOrEqual( + latestLiveTime, + finalEvent.elapsedSeconds * 0.75, + "Realtime replay should keep publishing updates into the later portion of the clip, including after pauses." + ) + + if let silenceFlushIndex = liveEvents.firstIndex(where: { $0.source == "silence-flush" }), + silenceFlushIndex > 0 { + let previousEvent = liveEvents[silenceFlushIndex - 1] + let silenceFlushEvent = liveEvents[silenceFlushIndex] + XCTAssertLessThanOrEqual( + silenceFlushEvent.elapsedSeconds - previousEvent.elapsedSeconds, + 1.0, + "Pause handling should flush the previous phrase quickly instead of leaving the last words pending for more than a second." + ) + } + + XCTAssertFalse(finalEvent.text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty) + } +}
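Aside for reviewers (illustrative sketch, not part of the patch): the new SessionEndFlowTests pin down AppDelegate.stopRecordingFallbackDecision purely through its expected outputs, and the implementation itself does not appear in this diff. A minimal decision function consistent with those four expectations might look like the following; Engine here is a stand-in for the app's TranscriptionEngine, and the real method may consult state the tests do not exercise:

    // Stand-in for the app's TranscriptionEngine; only the cases the tests use.
    enum Engine { case whisperKit, mlxAudio, remote }

    enum FallbackDecision: Equatable {
        case finishNow
        case extendGrace(seconds: Int)
    }

    // Extend the stop-recording grace window only when Whisper is still
    // finalizing, has produced no transcription result yet, and the grace
    // window has not already been extended once; every other combination
    // (including all non-Whisper engines) finishes immediately.
    func stopRecordingFallbackDecision(
        transcriptionEngine: Engine,
        isWhisperFinalizing: Bool,
        transcriptionResultReceived: Bool,
        isExtendedGrace: Bool
    ) -> FallbackDecision {
        guard transcriptionEngine == .whisperKit,
              isWhisperFinalizing,
              !transcriptionResultReceived,
              !isExtendedGrace
        else { return .finishNow }
        return .extendGrace(seconds: 12)
    }

Making the extension one-shot (isExtendedGrace forces .finishNow on the second pass) bounds the total wait at the base timeout plus a single 12-second grace period, which is what the extended-grace-timeout test asserts.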