diff --git a/Backend/.DS_Store b/Backend/.DS_Store index 269ada7..7e5f689 100644 Binary files a/Backend/.DS_Store and b/Backend/.DS_Store differ diff --git a/Backend/speechAPI.py b/Backend/speechAPI.py index ecee1cf..9b7bea5 100644 --- a/Backend/speechAPI.py +++ b/Backend/speechAPI.py @@ -25,7 +25,7 @@ async def generate_conversation(config: ConvoConfig)->str: input: enviornment configuration returns: AudioFile (.wav type) ''' - questionText = textGen.generateText( + questionText = textGen.generateRequestText( config.agentRole, config.agentTone, config.userRole, diff --git a/Backend/utils/audioStore.py b/Backend/utils/audioStore.py index f09d848..21b6ff1 100644 --- a/Backend/utils/audioStore.py +++ b/Backend/utils/audioStore.py @@ -1,2 +1,142 @@ +# adapted from https://www.geeksforgeeks.org/python-speech-recognition-on-large-audio-files/ +import speech_recognition as sr +import time +from pydub import AudioSegment +from pydub import silence +# from textGen import generateRequestText + +import os +import glob + +listen_len = 2 # amount of speech to collect before sending to chatgpt api. measured in seconds. 
phrase_time_limit = 10.0  # max length, in seconds, of audio read from one phrase file
max_audiofile_time = 60  # max length, in seconds, an input audio file is expected to have


def transcribeAudio(audio_file_name="Jane_eyre_enunciated.wav"):
    """Transcribe a WAV file phrase by phrase and log the transcription.

    The input file is split on silence into phrases, each phrase is written
    to its own WAV file, read back (capped at ``phrase_time_limit`` seconds)
    and passed to ``Recognizer.recognize_google``. The recognized phrases are
    joined into one string, printed, and appended to
    ``speech_transcription.txt`` together with a placeholder response.

    All paths are relative to the current working directory; the layout
    (``../audio``, ``../phrases``) assumes the process runs from the
    ``utils`` folder.

    Args:
        audio_file_name: Name of the WAV file inside ``../audio`` to
            transcribe. Defaults to the sample file used during development.

    Returns:
        None. Results are printed and appended to the transcription file.
    """
    recognizer = sr.Recognizer()

    # Split the recording wherever there is at least 600 ms of audio quieter
    # than -32 dBFS.
    audio = AudioSegment.from_wav(f"../audio/{audio_file_name}")
    phrases = silence.split_on_silence(audio, min_silence_len=600, silence_thresh=-32)

    # Start from an empty phrase directory so files from a previous run are
    # never mixed in. Create it if missing, and skip anything that is not a
    # regular file so os.remove cannot fail on a sub-directory.
    os.makedirs("../phrases", exist_ok=True)
    for stale_path in glob.glob('../phrases/*'):
        if os.path.isfile(stale_path):
            os.remove(stale_path)

    # 10 ms of silence added to both ends of every phrase so that it does not
    # sound abruptly sliced (duration is given in milliseconds).
    padding = AudioSegment.silent(duration=10)

    list_of_phrases_raw = []
    for i, phrase in enumerate(phrases):
        print(f"saving phrase{i}.wav")
        phrase_audio_file_path = f"../phrases/phrase{i}.wav"  # relative to utils folder
        padded_phrase = padding + phrase + padding
        padded_phrase.export(phrase_audio_file_path, bitrate='256k', format="wav")

        print("Processing phrase " + str(i))
        with sr.AudioFile(phrase_audio_file_path) as source:
            # Read at most phrase_time_limit seconds of the phrase file.
            list_of_phrases_raw.append(
                recognizer.record(source, duration=phrase_time_limit)
            )

    recognized = []
    for j, raw in enumerate(list_of_phrases_raw):
        try:
            recognized.append(recognizer.recognize_google(raw))
        except sr.UnknownValueError:
            # The recognizer could not make sense of this phrase; skip it.
            print(f"Phrase {j} not recognized")
        except sr.RequestError as err:
            # The recognition request itself failed; skip this phrase but
            # say why instead of hiding the cause.
            print(f"Phrase {j} not recognized (recognition request failed: {err})")

    # Same layout as before: every phrase is preceded by a space and followed
    # by a comma.
    combined_phrases = "".join(f" {text}," for text in recognized)
    print(combined_phrases)

    # TODO: eventually uncomment the next 2 lines
    # gpt_response = generateRequestText(userQuery=combined_phrases) #need to forward this to unity/frontend
    # gpt_response_text = gpt_response["text"]
    gpt_response_text = "response"

    # TODO: move truncating speech_transcription.txt into the fn that calls transcribeAudio()
    # TODO: if performance is bad, may want to try to write each phrase directly to the txt file instead of first appending to the combined phrases string
    # Open the log only once there is something to write, and let the context
    # manager close it even if the write fails.
    with open("speech_transcription.txt", "a") as speech_trans_file:
        speech_trans_file.write(
            f"audio transcription: {combined_phrases}\ngpt response: {gpt_response_text}\n\n"
        )
+# # print("Time over, thanks") + +# # using audio file from unity +# # audio_file_name = "user_audio_file.wav" +# # audio_file_name = "Jane_eyre.wav" +# # with sr.AudioFile(f"../audio/{audio_file_name}") as source: #TODO: Check with abby about audio file name +# # print("Talk") # keeping these print statements here for debugging purposes for now +# # r.pause_threshold = 0.8 +# # list_of_phrases_raw.append(r.record(source, duration=phrase_time_limit)) #good? +# # print("Time over, thanks") + +# for i in range(int(max_audiofile_time / phrase_time_limit) + 1): +# audio_file_name = "Jane_eyre.wav" +# with sr.AudioFile(f"../audio/{audio_file_name}") as source: #TODO: Check with abby about audio file name +# print("Talk") # keeping these print statements here for debugging purposes for now +# r.pause_threshold = 0.8 +# list_of_phrases_raw.append(r.listen(source, phrase_time_limit=phrase_time_limit)) #good? +# print("Time over, thanks") + +# # list_of_phrases_text = [] +# combined_phrases = "" +# for raw in list_of_phrases_raw: +# try: +# # list_of_phrases_text.append(r.recognize_google(raw)) +# combined_phrases += (" " + r.recognize_google(raw) + ",") +# except: +# print("Sorry, I did not get that") + +# print(combined_phrases) +# # gpt_response = generateRequestText(userQuery=combined_phrases) #need to forward this to unity/frontend + +# # add to text file as well +# # gpt_response_text = gpt_response["text"] +# gpt_response_text = "response" +# speech_trans_file = open("speech_transcription.txt", "a") +# speech_trans_file.write(f"audio transcription: {combined_phrases}\ngpt response: {gpt_response_text}\n\n") +# speech_trans_file.close() + +# transcribeAudio() + diff --git a/Backend/utils/textGen.py b/Backend/utils/textGen.py index dc1d113..7021b45 100644 --- a/Backend/utils/textGen.py +++ b/Backend/utils/textGen.py @@ -16,7 +16,7 @@ def constructMsg( userQuery = None ): if not userQuery: - return {"ok":False,"error":"no user string provided","messages":[]} + return 
{"ok":False, "error":"no user string provided", "messages":[]} systemMsg = { "role": "system", "content": ( @@ -30,21 +30,23 @@ def constructMsg( userQuery ), } - return {"ok":True,"error":"","messages":[systemMsg,userMsg]} + return {"ok":True, "error":"", "messages":[systemMsg,userMsg]} -def generateText( - agentRole, - agentTone, - userRole, - userQuery +def generateRequestText( + agentRole = "interviewer", + agentTone = "confused", + userRole = "interviewee", + userQuery = None ): + if not userQuery: + return {"ok":False, "error":"no user string provided", "text":""} logging.basicConfig(filename='./logging/textGen.log', level=logging.INFO) ok,error,messages = constructMsg(agentRole,agentTone,userRole, userQuery).values() if not ok: logger.info(f"error with message generation: \n {error} \n") - return {"ok": False, "error": f"error with message generation: \n {error}","text":""} + return {"ok": False, "error": f"error with message generation: \n {error}", "text":""} response = client.chat.completions.create( @@ -64,5 +66,5 @@ def generateText( logger.info(choice) res = response.choices[0].message.content - return {"ok": True, "error": "","text":res} + return {"ok": True, "error": "", "text":res} \ No newline at end of file