-
-
Notifications
You must be signed in to change notification settings - Fork 1.2k
added 3 more vars for LLM post-processing transcript #704
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
2657c8e
cbc7494
1d1d1ac
b8ea76a
ed669c6
8db875c
d1a11d6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,23 +3,28 @@ use crate::apple_intelligence; | |
| use crate::audio_feedback::{play_feedback_sound, play_feedback_sound_blocking, SoundType}; | ||
| use crate::managers::audio::AudioRecordingManager; | ||
| use crate::managers::history::HistoryManager; | ||
| use crate::managers::model::{EngineType, ModelManager}; | ||
| use crate::managers::transcription::TranscriptionManager; | ||
| use crate::settings::{get_settings, AppSettings, APPLE_INTELLIGENCE_PROVIDER_ID}; | ||
| use crate::shortcut; | ||
| use crate::tray::{change_tray_icon, TrayIconState}; | ||
| use crate::utils::{ | ||
| self, show_processing_overlay, show_recording_overlay, show_transcribing_overlay, | ||
| }; | ||
| use crate::TranscriptionCoordinator; | ||
| use crate::{active_app, transcript_context, TranscriptionCoordinator}; | ||
| use ferrous_opencc::{config::BuiltinConfig, OpenCC}; | ||
| use log::{debug, error, warn}; | ||
| use once_cell::sync::Lazy; | ||
| use std::collections::HashMap; | ||
| use std::sync::Arc; | ||
| use std::sync::{Arc, Mutex}; | ||
| use std::time::Instant; | ||
| use tauri::AppHandle; | ||
| use tauri::Manager; | ||
|
|
||
| /// Tracks the frontmost application captured at recording start, keyed by binding_id | ||
| static RECORDING_APP_CONTEXT: Lazy<Mutex<HashMap<String, String>>> = | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: entries are inserted in |
||
| Lazy::new(|| Mutex::new(HashMap::new())); | ||
|
|
||
| /// Drop guard that notifies the [`TranscriptionCoordinator`] when the | ||
| /// transcription pipeline finishes — whether it completes normally or panics. | ||
| struct FinishGuard(AppHandle); | ||
|
|
@@ -42,6 +47,20 @@ struct TranscribeAction { | |
| post_process: bool, | ||
| } | ||
|
|
||
| /// Context information for LLM post-processing prompts. | ||
| /// These fields are available as variables in the prompt template. | ||
| #[derive(Clone, Debug, Default)] | ||
| pub struct PostProcessContext { | ||
| /// The name of the frontmost application when transcription started | ||
| pub current_app: String, | ||
| /// Short excerpt from previous transcript in the same app (last 200 words, expires after 5 min) | ||
| pub short_prev_transcript: String, | ||
| /// Current local time formatted as "Tuesday, February 3, 2026 10:33:39 AM" | ||
| pub time_local: String, | ||
| /// The selected language for transcription (e.g., "en", "zh-Hans"), or "auto" if not specified | ||
| pub language: String, | ||
| } | ||
|
|
||
| /// Field name for structured output JSON schema | ||
| const TRANSCRIPTION_FIELD: &str = "transcription"; | ||
|
|
||
|
|
@@ -52,11 +71,23 @@ fn strip_invisible_chars(s: &str) -> String { | |
|
|
||
| /// Build a system prompt from the user's prompt template. | ||
| /// Removes `${output}` placeholder since the transcription is sent as the user message. | ||
| fn build_system_prompt(prompt_template: &str) -> String { | ||
| prompt_template.replace("${output}", "").trim().to_string() | ||
| /// Substitutes all context variables so the LLM receives actual values. | ||
| fn build_system_prompt(prompt_template: &str, context: &PostProcessContext) -> String { | ||
| prompt_template | ||
| .replace("${output}", "") | ||
| .replace("${current_app}", &context.current_app) | ||
| .replace("${short_prev_transcript}", &context.short_prev_transcript) | ||
| .replace("${time_local}", &context.time_local) | ||
| .replace("${language}", &context.language) | ||
| .trim() | ||
| .to_string() | ||
| } | ||
|
|
||
| async fn post_process_transcription(settings: &AppSettings, transcription: &str) -> Option<String> { | ||
| async fn post_process_transcription( | ||
| settings: &AppSettings, | ||
| transcription: &str, | ||
| context: &PostProcessContext, | ||
| ) -> Option<String> { | ||
| let provider = match settings.active_post_process_provider().cloned() { | ||
| Some(provider) => provider, | ||
| None => { | ||
|
|
@@ -121,7 +152,7 @@ async fn post_process_transcription(settings: &AppSettings, transcription: &str) | |
| if provider.supports_structured_output { | ||
| debug!("Using structured outputs for provider '{}'", provider.id); | ||
|
|
||
| let system_prompt = build_system_prompt(&prompt); | ||
| let system_prompt = build_system_prompt(&prompt, context); | ||
| let user_content = transcription.to_string(); | ||
|
|
||
| // Handle Apple Intelligence separately since it uses native Swift APIs | ||
|
|
@@ -233,8 +264,13 @@ async fn post_process_transcription(settings: &AppSettings, transcription: &str) | |
| } | ||
| } | ||
|
|
||
| // Legacy mode: Replace ${output} variable in the prompt with the actual text | ||
| let processed_prompt = prompt.replace("${output}", transcription); | ||
| // Legacy mode: Replace all variables in the prompt with actual values | ||
| let processed_prompt = prompt | ||
| .replace("${output}", transcription) | ||
| .replace("${current_app}", &context.current_app) | ||
| .replace("${short_prev_transcript}", &context.short_prev_transcript) | ||
| .replace("${time_local}", &context.time_local) | ||
| .replace("${language}", &context.language); | ||
| debug!("Processed prompt length: {} chars", processed_prompt.len()); | ||
|
|
||
| match crate::llm_client::send_chat_completion(&provider, api_key, &model, processed_prompt) | ||
|
|
@@ -313,6 +349,16 @@ impl ShortcutAction for TranscribeAction { | |
| let start_time = Instant::now(); | ||
| debug!("TranscribeAction::start called for binding: {}", binding_id); | ||
|
|
||
| // Capture the frontmost application name for LLM context | ||
| // This is done early before any UI changes that might affect focus | ||
| let frontmost_app = active_app::get_frontmost_app_name().unwrap_or_default(); | ||
| debug!("Captured frontmost app: '{}'", frontmost_app); | ||
|
|
||
| // Store the captured app name for use when transcription completes | ||
| if let Ok(mut context) = RECORDING_APP_CONTEXT.lock() { | ||
| context.insert(binding_id.to_string(), frontmost_app); | ||
| } | ||
|
|
||
| // Load model in the background | ||
| let tm = app.state::<Arc<TranscriptionManager>>(); | ||
| tm.initiate_model_load(); | ||
|
|
@@ -385,6 +431,17 @@ impl ShortcutAction for TranscribeAction { | |
| let stop_time = Instant::now(); | ||
| debug!("TranscribeAction::stop called for binding: {}", binding_id); | ||
|
|
||
| // Retrieve the captured frontmost app name from recording start | ||
| let current_app = RECORDING_APP_CONTEXT | ||
| .lock() | ||
| .ok() | ||
| .and_then(|mut ctx| ctx.remove(binding_id)) | ||
| .unwrap_or_default(); | ||
| debug!( | ||
| "Retrieved frontmost app for binding '{}': '{}'", | ||
| binding_id, current_app | ||
| ); | ||
|
|
||
| let ah = app.clone(); | ||
| let rm = Arc::clone(&app.state::<Arc<AudioRecordingManager>>()); | ||
| let tm = Arc::clone(&app.state::<Arc<TranscriptionManager>>()); | ||
|
|
@@ -440,13 +497,51 @@ impl ShortcutAction for TranscribeAction { | |
| final_text = converted_text; | ||
| } | ||
|
|
||
| // Build context for LLM post-processing | ||
| // Get previous transcript from same app (last 200 words, 5 min expiry) | ||
| let short_prev_transcript = | ||
| transcript_context::get_short_prev_transcript(¤t_app); | ||
|
|
||
| // Generate formatted local time: "Tuesday, February 3, 2026 10:33:39 AM" | ||
| let time_local = chrono::Local::now() | ||
| .format("%A, %B %-d, %Y %-I:%M:%S %p") | ||
| .to_string(); | ||
|
|
||
| let pp_context = PostProcessContext { | ||
| current_app: current_app.clone(), | ||
| short_prev_transcript, | ||
| time_local, | ||
| language: { | ||
| // Only use selected_language for Whisper models, 'auto' for other models (Parakeet, Moonshine) | ||
| let mm = ah.state::<Arc<ModelManager>>(); | ||
| let is_whisper = mm | ||
| .get_model_info(&settings.selected_model) | ||
| .map(|m| matches!(m.engine_type, EngineType::Whisper)) | ||
| .unwrap_or(false); | ||
|
|
||
| if is_whisper && !settings.selected_language.is_empty() { | ||
| settings.selected_language.clone() | ||
| } else { | ||
| "auto".to_string() | ||
| } | ||
| }, | ||
| }; | ||
| debug!( | ||
| "Post-process context: app='{}', prev_transcript_len={}, time='{}', language='{}'", | ||
| pp_context.current_app, | ||
| pp_context.short_prev_transcript.len(), | ||
| pp_context.time_local, | ||
| pp_context.language | ||
| ); | ||
|
|
||
| // Then apply LLM post-processing if this is the post-process hotkey | ||
| // Uses final_text which may already have Chinese conversion applied | ||
| if post_process { | ||
| show_processing_overlay(&ah); | ||
| } | ||
| let processed = if post_process { | ||
| post_process_transcription(&settings, &final_text).await | ||
| post_process_transcription(&settings, &final_text, &pp_context) | ||
| .await | ||
| } else { | ||
| None | ||
| }; | ||
|
|
@@ -469,6 +564,13 @@ impl ShortcutAction for TranscribeAction { | |
| post_processed_text = Some(final_text.clone()); | ||
| } | ||
|
|
||
| // Update the transcript context for this app | ||
| // Use the original transcription (before post-processing) for context | ||
| transcript_context::update_transcript_context( | ||
| ¤t_app, | ||
| &transcription, | ||
| ); | ||
|
|
||
| // Save to history with post-processed text and prompt | ||
| let hm_clone = Arc::clone(&hm); | ||
| let transcription_for_history = transcription.clone(); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,122 @@ | ||
| //! Module for getting the frontmost/active application name. | ||
| //! This is platform-specific and returns the name of the application | ||
| //! that has keyboard focus when the user starts transcribing. | ||
|
|
||
| #[cfg(target_os = "macos")] | ||
| #[allow(unexpected_cfgs)] | ||
| pub fn get_frontmost_app_name() -> Option<String> { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm so macOS impl returns app name via |
||
| use objc::{msg_send, sel, sel_impl}; | ||
| use std::ffi::CStr; | ||
|
|
||
| unsafe { | ||
| // Get NSWorkspace shared instance | ||
| let workspace: *mut objc::runtime::Object = | ||
| msg_send![objc::class!(NSWorkspace), sharedWorkspace]; | ||
| if workspace.is_null() { | ||
| return None; | ||
| } | ||
|
|
||
| // Get frontmost application (NSRunningApplication) | ||
| let frontmost_app: *mut objc::runtime::Object = msg_send![workspace, frontmostApplication]; | ||
| if frontmost_app.is_null() { | ||
| return None; | ||
| } | ||
|
|
||
| // Get localized name of the application | ||
| let name: *mut objc::runtime::Object = msg_send![frontmost_app, localizedName]; | ||
| if name.is_null() { | ||
| return None; | ||
| } | ||
|
|
||
| // Convert NSString to Rust String | ||
| let utf8_ptr: *const i8 = msg_send![name, UTF8String]; | ||
| if utf8_ptr.is_null() { | ||
| return None; | ||
| } | ||
|
|
||
| let c_str = CStr::from_ptr(utf8_ptr); | ||
| match c_str.to_str() { | ||
| Ok(s) if !s.is_empty() => Some(s.to_string()), | ||
| _ => None, | ||
| } | ||
| } | ||
| } | ||
|
|
||
| #[cfg(target_os = "windows")] | ||
| pub fn get_frontmost_app_name() -> Option<String> { | ||
| use std::ffi::OsString; | ||
| use std::os::windows::ffi::OsStringExt; | ||
| use windows::Win32::Foundation::HWND; | ||
| use windows::Win32::UI::WindowsAndMessaging::{ | ||
| GetForegroundWindow, GetWindowTextLengthW, GetWindowTextW, | ||
| }; | ||
|
|
||
| unsafe { | ||
| let hwnd: HWND = GetForegroundWindow(); | ||
| if hwnd.0.is_null() { | ||
| return None; | ||
| } | ||
|
|
||
| let length = GetWindowTextLengthW(hwnd); | ||
| if length == 0 { | ||
| return None; | ||
| } | ||
|
|
||
| let mut buffer: Vec<u16> = vec![0; (length + 1) as usize]; | ||
| let chars_copied = GetWindowTextW(hwnd, &mut buffer); | ||
|
|
||
| if chars_copied > 0 { | ||
| buffer.truncate(chars_copied as usize); | ||
| let title = OsString::from_wide(&buffer).to_string_lossy().into_owned(); | ||
| if !title.is_empty() { | ||
| return Some(title); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| None | ||
| } | ||
|
|
||
| #[cfg(target_os = "linux")] | ||
| pub fn get_frontmost_app_name() -> Option<String> { | ||
| use std::process::Command; | ||
|
|
||
| // Try xdotool first (X11) | ||
| if let Ok(output) = Command::new("xdotool") | ||
| .args(["getactivewindow", "getwindowname"]) | ||
| .output() | ||
| { | ||
| if output.status.success() { | ||
| let name = String::from_utf8_lossy(&output.stdout).trim().to_string(); | ||
| if !name.is_empty() { | ||
| return Some(name); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Fallback for Wayland - try to get from environment or use a generic name | ||
| // Most Wayland compositors don't expose window info to external tools | ||
| if std::env::var("WAYLAND_DISPLAY").is_ok() { | ||
| // On Wayland, we can't easily get the active window name | ||
| // Return None and let the caller handle it | ||
| return None; | ||
| } | ||
|
|
||
| None | ||
| } | ||
|
|
||
| #[cfg(not(any(target_os = "macos", target_os = "windows", target_os = "linux")))] | ||
| pub fn get_frontmost_app_name() -> Option<String> { | ||
| None | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
| mod tests { | ||
| use super::*; | ||
|
|
||
| #[test] | ||
| fn test_get_frontmost_app_returns_something_or_none() { | ||
| // This test just ensures the function doesn't panic | ||
| let _result = get_frontmost_app_name(); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `objc` crate is deprecated in favor of
`objc2`, which is already a transitive dep in the project. Maybe we look into `objc2-app-kit`, or a simpler approach for the macOS active app detection