From 161c7fc9355896d2c163f7b6174e83ce3672d2d5 Mon Sep 17 00:00:00 2001 From: Cindy Fu Date: Sat, 8 Feb 2025 11:06:56 -0500 Subject: [PATCH 1/3] some audio recognition done --- Backend/utils/audioStore.py | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/Backend/utils/audioStore.py b/Backend/utils/audioStore.py index f09d848..1490be0 100644 --- a/Backend/utils/audioStore.py +++ b/Backend/utils/audioStore.py @@ -1,2 +1,33 @@ +import speech_recognition as sr +import time + +# TODO: +# store text locally for redundancy +# make this function also send to api + +listen_len = 20 # may change later + +# transcribes at most listen_len seconds of speech, then sends it to the perplexity api def transcribeAudio(): - pass \ No newline at end of file + # Initialize recognizer class (for recognizing the speech) + r = sr.Recognizer() + # list_of_phrases = [] + list_of_phrases_raw = [] + # arr_of_phrases = [None] * 5 + # i = 0 # index into arr_of_phrases + + init_time = time.time() + while (time.time() - init_time < listen_len): + with sr.Microphone() as source: + print("Talk") # keeping these print statements here for debugging purposes + r.pause_threshold = 0.8 + list_of_phrases_raw.append(r.listen(source, phrase_time_limit=10.0)) #good? + print("Time over, thanks") + + list_of_phrases_text = [] + for raw in list_of_phrases_raw: + try: + list_of_phrases_text.append(r.recognize_google(raw)) + except: + print("Sorry, I did not get that") + From f9574b0910fbe818444143d1f0a780d022c872cf Mon Sep 17 00:00:00 2001 From: Cindy Fu Date: Sat, 8 Feb 2025 14:44:15 -0500 Subject: [PATCH 2/3] speech to text from .wav file --- Backend/.DS_Store | Bin 6148 -> 6148 bytes Backend/speechAPI.py | 2 +- Backend/utils/audioStore.py | 148 +++++++++++++++++++++++++++++++----- Backend/utils/textGen.py | 20 ++--- 4 files changed, 140 insertions(+), 30 deletions(-) diff --git a/Backend/.DS_Store b/Backend/.DS_Store index 269ada770cc65da7a4e1e2a167ed658af354d17b..7e5f689d40a146189199a2c38068113551aa52e9 100644 GIT binary patch delta 428 zcmYjNO;5r=5S{YXZH%Fbm3YK>1#N;&NIZ%Ez!E*EAzj$r$u6C4>SATyv&=K+jIN1Rwx*Yr`qE9aIXRY8h{BPX`({PC0(xHwP}i% z;T(qG1N-L?{h}dfDz@!!txznLD>~814!Ls&GI3)s9;%KP-K4*+l$>sBk6y7L>6zAr z6tTxd5M=`92iS6Z&4e$LjvNYKW!kB`VTeJDo_R1HJ5H-nZ#wp*Q6INkPNVLewkH#V z>>r(+UG^UMqma*N4nxCrve^C0+w_7(ZY5vcbBY$z_8dHNBf+IB`Ut0#4jd%(@i7q4 z&#QC@VE_?$pdbL1r^CJzu~2NHo}wrt0|NsP3otM!Fc>lDG2}87Fc?oPR5t}la50oHq%sr( zNhC?noc!dZoctu9JWv+{KM4Pag3XD{%UCuSFfU}<%+A5j0kj4v@|}4yzlbg;$iM?Y P%rM!8M|yLN$P#7%W#A#S diff --git a/Backend/speechAPI.py b/Backend/speechAPI.py index ecee1cf..9b7bea5 100644 --- a/Backend/speechAPI.py +++ b/Backend/speechAPI.py @@ -25,7 +25,7 @@ async def generate_conversation(config: ConvoConfig)->str: input: enviornment configuration returns: AudioFile (.wav type) ''' - questionText = textGen.generateText( + questionText = textGen.generateRequestText( config.agentRole, config.agentTone, config.userRole, diff --git a/Backend/utils/audioStore.py b/Backend/utils/audioStore.py index 1490be0..61884ee 100644 --- a/Backend/utils/audioStore.py +++ b/Backend/utils/audioStore.py @@ -1,33 +1,141 @@ import speech_recognition as sr import time +from pydub import AudioSegment +from pydub import silence +# from textGen import generateRequestText -# TODO: -# store text locally for redundancy -# make this function also send to api +import os +import glob -listen_len = 20 # may change later +listen_len = 2 # amount of speech to collect before sending to chatgpt api. measured in seconds. may change the value later +phrase_time_limit = 10.0 # max amount of time a "phrase" can be +max_audiofile_time = 60 # max amount of time an audio file can be # transcribes at most listen_len seconds of speech, then sends it to the perplexity api def transcribeAudio(): - # Initialize recognizer class (for recognizing the speech) r = sr.Recognizer() - # list_of_phrases = [] list_of_phrases_raw = [] - # arr_of_phrases = [None] * 5 - # i = 0 # index into arr_of_phrases - - init_time = time.time() - while (time.time() - init_time < listen_len): - with sr.Microphone() as source: - print("Talk") # keeping these print statements here for debugging purposes - r.pause_threshold = 0.8 - list_of_phrases_raw.append(r.listen(source, phrase_time_limit=10.0)) #good? - print("Time over, thanks") - - list_of_phrases_text = [] + + # each wav is like at most 60 sec long + + audio_file_name = "Jane_eyre_enunciated.wav" + # split the wav file + audio = AudioSegment.from_wav(f"../audio/{audio_file_name}") + phrases = silence.split_on_silence(audio, min_silence_len=600, silence_thresh=-32) + + # save all phrases into wav files + i = 0 + # delete existing wav phrase files + files = glob.glob('../phrases/*') + for f in files: + os.remove(f) + + # TODO: move these next 3 lines into the fn that calls transcribeAudio() + # speech_trans_file = open("speech_transcription.txt", "w") + # speech_trans_file.write("") + # speech_trans_file.close() + + speech_trans_file = open("speech_transcription.txt", "a") + + for phrase in phrases: + print(f"saving phrase{i}.wav") + + # Create 0.5 seconds silence chunk + chunk_silent = AudioSegment.silent(duration = 10) + + # add 0.5 sec silence to beginning and end of audio chunk. This is done so that + # it doesn't seem abruptly sliced. + phrase = chunk_silent + phrase + chunk_silent + + # the name of the newly created chunk + phrase_audio_file_path = f"../phrases/phrase{i}.wav" # file path relative to utils folder + phrase.export(phrase_audio_file_path, bitrate='256k', format="wav") + + print("Processing phrase " + str(i)) + with sr.AudioFile(f"{phrase_audio_file_path}") as source: + # generate text for the phrase audio file + list_of_phrases_raw.append(r.record(source, duration=phrase_time_limit)) + + i = i + 1 + + combined_phrases = "" + j = 0 for raw in list_of_phrases_raw: try: - list_of_phrases_text.append(r.recognize_google(raw)) + combined_phrases += (" " + r.recognize_google(raw) + ",") except: - print("Sorry, I did not get that") + print(f"Phrase {j} not recognized") + j = j + 1 + + print(combined_phrases) + + # TODO: eventually uncomment the next 2 lines + # gpt_response = generateRequestText(userQuery=combined_phrases) #need to forward this to unity/frontend + # gpt_response_text = gpt_response["text"] + gpt_response_text = "response" + # TODO: if performance is bad, may want to try to write each phrase directly to the txt file instead of first appending to the combined phrases string + speech_trans_file.write(f"audio transcription: {combined_phrases}\ngpt response: {gpt_response_text}\n\n") + speech_trans_file.close() + + + + + + + +# # transcribes at most listen_len seconds of speech, then sends it to the perplexity api +# def transcribeAudio(): +# # Initialize recognizer class (for recognizing the speech) +# r = sr.Recognizer() +# # list_of_phrases = [] +# # arr_of_phrases = [None] * 5 +# list_of_phrases_raw = [] + +# # Using microphone +# init_time = time.time() +# # each wav is like at most 60 sec long +# # while (time.time() - init_time < listen_len): +# # with sr.Microphone() as source: +# # print("Talk") # keeping these print statements here for debugging purposes for now +# # r.pause_threshold = 0.8 +# # list_of_phrases_raw.append(r.listen(source, phrase_time_limit=phrase_time_limit)) #good? +# # print("Time over, thanks") + +# # using audio file from unity +# # audio_file_name = "user_audio_file.wav" +# # audio_file_name = "Jane_eyre.wav" +# # with sr.AudioFile(f"../audio/{audio_file_name}") as source: #TODO: Check with abby about audio file name +# # print("Talk") # keeping these print statements here for debugging purposes for now +# # r.pause_threshold = 0.8 +# # list_of_phrases_raw.append(r.record(source, duration=phrase_time_limit)) #good? +# # print("Time over, thanks") + +# for i in range(int(max_audiofile_time / phrase_time_limit) + 1): +# audio_file_name = "Jane_eyre.wav" +# with sr.AudioFile(f"../audio/{audio_file_name}") as source: #TODO: Check with abby about audio file name +# print("Talk") # keeping these print statements here for debugging purposes for now +# r.pause_threshold = 0.8 +# list_of_phrases_raw.append(r.listen(source, phrase_time_limit=phrase_time_limit)) #good? +# print("Time over, thanks") + +# # list_of_phrases_text = [] +# combined_phrases = "" +# for raw in list_of_phrases_raw: +# try: +# # list_of_phrases_text.append(r.recognize_google(raw)) +# combined_phrases += (" " + r.recognize_google(raw) + ",") +# except: +# print("Sorry, I did not get that") + +# print(combined_phrases) +# # gpt_response = generateRequestText(userQuery=combined_phrases) #need to forward this to unity/frontend + +# # add to text file as well +# # gpt_response_text = gpt_response["text"] +# gpt_response_text = "response" +# speech_trans_file = open("speech_transcription.txt", "a") +# speech_trans_file.write(f"audio transcription: {combined_phrases}\ngpt response: {gpt_response_text}\n\n") +# speech_trans_file.close() + +# transcribeAudio() diff --git a/Backend/utils/textGen.py b/Backend/utils/textGen.py index dc1d113..7021b45 100644 --- a/Backend/utils/textGen.py +++ b/Backend/utils/textGen.py @@ -16,7 +16,7 @@ def constructMsg( userQuery = None ): if not userQuery: - return {"ok":False,"error":"no user string provided","messages":[]} + return {"ok":False, "error":"no user string provided", "messages":[]} systemMsg = { "role": "system", "content": ( @@ -30,21 +30,23 @@ def constructMsg( userQuery ), } - return {"ok":True,"error":"","messages":[systemMsg,userMsg]} + return {"ok":True, "error":"", "messages":[systemMsg,userMsg]} -def generateText( - agentRole, - agentTone, - userRole, - userQuery +def generateRequestText( + agentRole = "interviewer", + agentTone = "confused", + userRole = "interviewee", + userQuery = None ): + if not userQuery: + return {"ok":False, "error":"no user string provided", "text":""} logging.basicConfig(filename='./logging/textGen.log', level=logging.INFO) ok,error,messages = constructMsg(agentRole,agentTone,userRole, userQuery).values() if not ok: logger.info(f"error with message generation: \n {error} \n") - return {"ok": False, "error": f"error with message generation: \n {error}","text":""} + return {"ok": False, "error": f"error with message generation: \n {error}", "text":""} response = client.chat.completions.create( @@ -64,5 +66,5 @@ def generateText( logger.info(choice) res = response.choices[0].message.content - return {"ok": True, "error": "","text":res} + return {"ok": True, "error": "", "text":res} \ No newline at end of file From 121dd842418c28243de19f8e157e8ae6e4c13bc9 Mon Sep 17 00:00:00 2001 From: Cindy Fu Date: Sat, 8 Feb 2025 14:52:00 -0500 Subject: [PATCH 3/3] a comment --- Backend/utils/audioStore.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Backend/utils/audioStore.py b/Backend/utils/audioStore.py index 61884ee..21b6ff1 100644 --- a/Backend/utils/audioStore.py +++ b/Backend/utils/audioStore.py @@ -1,3 +1,4 @@ +# adapted from https://www.geeksforgeeks.org/python-speech-recognition-on-large-audio-files/ import speech_recognition as sr import time from pydub import AudioSegment