Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified Backend/.DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion Backend/speechAPI.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ async def generate_conversation(config: ConvoConfig)->str:
input: enviornment configuration
returns: AudioFile (.wav type)
'''
questionText = textGen.generateText(
questionText = textGen.generateRequestText(
config.agentRole,
config.agentTone,
config.userRole,
Expand Down
142 changes: 141 additions & 1 deletion Backend/utils/audioStore.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,142 @@
# adapted from https://www.geeksforgeeks.org/python-speech-recognition-on-large-audio-files/
import speech_recognition as sr
import time
from pydub import AudioSegment
from pydub import silence
# from textGen import generateRequestText

import os
import glob

# Seconds of speech to collect before forwarding a transcript to the LLM API.
# NOTE(review): only the commented-out microphone prototype references this; the value may change.
listen_len = 2
# Maximum length of a single "phrase", in seconds; used as the `duration` cap for Recognizer.record().
phrase_time_limit = 10.0
# Maximum expected length of an input audio file, in seconds.
# NOTE(review): only the commented-out prototype at the bottom of this file references this.
max_audiofile_time = 60

# Splits a WAV file into phrases on silence, transcribes each phrase, and appends the
# combined transcript to a text file. Forwarding the transcript to the LLM API is still a TODO.
def transcribeAudio(
    audio_file_name="Jane_eyre_enunciated.wav",
    audio_dir="../audio",
    phrases_dir="../phrases",
    transcript_path="speech_transcription.txt",
):
    """Split a WAV file on silence, transcribe each phrase, and log the transcript.

    The source file is cut into phrases wherever pydub detects silence. Each
    phrase is written to ``phrases_dir`` as ``phrase<N>.wav``, read back with
    ``speech_recognition`` and sent to Google's recognizer. The recognized
    phrases are joined into one string, printed, and appended to
    ``transcript_path`` together with a placeholder response.

    Args:
        audio_file_name: Name of the WAV file to transcribe, inside ``audio_dir``.
        audio_dir: Directory holding the source audio. Like every path here it
            is resolved against the current working directory.
        phrases_dir: Scratch directory for the per-phrase WAV files. Created if
            missing; any regular files already in it are deleted first.
        transcript_path: Text file the transcript is appended to. It is never
            truncated here, so callers wanting a fresh file must clear it first.

    Returns:
        None. Results are reported via stdout and ``transcript_path``.

    Errors from reading the source WAV or from writing files are not caught.
    """
    recognizer = sr.Recognizer()

    # Cut the recording at pauses of >= 600 ms that are quieter than -32 dBFS.
    audio = AudioSegment.from_wav(os.path.join(audio_dir, audio_file_name))
    phrases = silence.split_on_silence(audio, min_silence_len=600, silence_thresh=-32)

    # Start from an empty scratch directory so stale phrases from a previous run
    # cannot be mistaken for output of this one. Sub-directories are left alone
    # because os.remove() cannot delete them.
    os.makedirs(phrases_dir, exist_ok=True)
    for stale_path in glob.glob(os.path.join(phrases_dir, "*")):
        if os.path.isfile(stale_path):
            os.remove(stale_path)

    # 10 ms of silence (pydub durations are in milliseconds) padded onto both
    # ends of every phrase so it doesn't sound abruptly sliced. Built once
    # because it is identical for every phrase.
    chunk_silent = AudioSegment.silent(duration=10)

    list_of_phrases_raw = []
    for i, phrase in enumerate(phrases):
        print(f"saving phrase{i}.wav")
        phrase_audio_file_path = os.path.join(phrases_dir, f"phrase{i}.wav")
        padded_phrase = chunk_silent + phrase + chunk_silent
        padded_phrase.export(phrase_audio_file_path, bitrate='256k', format="wav")

        print("Processing phrase " + str(i))
        with sr.AudioFile(phrase_audio_file_path) as source:
            # Read at most phrase_time_limit seconds of the phrase into memory.
            list_of_phrases_raw.append(recognizer.record(source, duration=phrase_time_limit))

    # Recognition is best-effort: a phrase that cannot be transcribed is reported
    # and skipped. Only the recognizer's own failure modes are caught, so
    # programming errors and Ctrl-C still propagate.
    recognized = []
    for j, raw in enumerate(list_of_phrases_raw):
        try:
            text = recognizer.recognize_google(raw)
        except sr.UnknownValueError:
            print(f"Phrase {j} not recognized")
        except sr.RequestError as err:
            print(f"Phrase {j} not recognized (speech service request failed: {err})")
        else:
            recognized.append(f" {text},")
    combined_phrases = "".join(recognized)

    print(combined_phrases)

    # TODO: eventually uncomment the next 2 lines
    # gpt_response = generateRequestText(userQuery=combined_phrases) #need to forward this to unity/frontend
    # gpt_response_text = gpt_response["text"]
    gpt_response_text = "response"

    # TODO: if performance is bad, may want to write each phrase directly to the
    # txt file instead of first collecting the combined phrases string.
    # Opened only for the write, inside `with`, so the handle is always closed.
    with open(transcript_path, "a") as speech_trans_file:
        speech_trans_file.write(
            f"audio transcription: {combined_phrases}\ngpt response: {gpt_response_text}\n\n"
        )







# # transcribes at most listen_len seconds of speech, then sends it to the perplexity api
# def transcribeAudio():
# # Initialize recognizer class (for recognizing the speech)
# r = sr.Recognizer()
# # list_of_phrases = []
# # arr_of_phrases = [None] * 5
# list_of_phrases_raw = []

# # Using microphone
# init_time = time.time()
# # each wav is like at most 60 sec long
# # while (time.time() - init_time < listen_len):
# # with sr.Microphone() as source:
# # print("Talk") # keeping these print statements here for debugging purposes for now
# # r.pause_threshold = 0.8
# # list_of_phrases_raw.append(r.listen(source, phrase_time_limit=phrase_time_limit)) #good?
# # print("Time over, thanks")

# # using audio file from unity
# # audio_file_name = "user_audio_file.wav"
# # audio_file_name = "Jane_eyre.wav"
# # with sr.AudioFile(f"../audio/{audio_file_name}") as source: #TODO: Check with abby about audio file name
# # print("Talk") # keeping these print statements here for debugging purposes for now
# # r.pause_threshold = 0.8
# # list_of_phrases_raw.append(r.record(source, duration=phrase_time_limit)) #good?
# # print("Time over, thanks")

# for i in range(int(max_audiofile_time / phrase_time_limit) + 1):
# audio_file_name = "Jane_eyre.wav"
# with sr.AudioFile(f"../audio/{audio_file_name}") as source: #TODO: Check with abby about audio file name
# print("Talk") # keeping these print statements here for debugging purposes for now
# r.pause_threshold = 0.8
# list_of_phrases_raw.append(r.listen(source, phrase_time_limit=phrase_time_limit)) #good?
# print("Time over, thanks")

# # list_of_phrases_text = []
# combined_phrases = ""
# for raw in list_of_phrases_raw:
# try:
# # list_of_phrases_text.append(r.recognize_google(raw))
# combined_phrases += (" " + r.recognize_google(raw) + ",")
# except:
# print("Sorry, I did not get that")

# print(combined_phrases)
# # gpt_response = generateRequestText(userQuery=combined_phrases) #need to forward this to unity/frontend

# # add to text file as well
# # gpt_response_text = gpt_response["text"]
# gpt_response_text = "response"
# speech_trans_file = open("speech_transcription.txt", "a")
# speech_trans_file.write(f"audio transcription: {combined_phrases}\ngpt response: {gpt_response_text}\n\n")
# speech_trans_file.close()

# transcribeAudio()

20 changes: 11 additions & 9 deletions Backend/utils/textGen.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def constructMsg(
userQuery = None
):
if not userQuery:
return {"ok":False,"error":"no user string provided","messages":[]}
return {"ok":False, "error":"no user string provided", "messages":[]}
systemMsg = {
"role": "system",
"content": (
Expand All @@ -30,21 +30,23 @@ def constructMsg(
userQuery
),
}
return {"ok":True,"error":"","messages":[systemMsg,userMsg]}
return {"ok":True, "error":"", "messages":[systemMsg,userMsg]}

def generateText(
agentRole,
agentTone,
userRole,
userQuery
def generateRequestText(
agentRole = "interviewer",
agentTone = "confused",
userRole = "interviewee",
userQuery = None
):
if not userQuery:
return {"ok":False, "error":"no user string provided", "text":""}
logging.basicConfig(filename='./logging/textGen.log', level=logging.INFO)

ok,error,messages = constructMsg(agentRole,agentTone,userRole, userQuery).values()

if not ok:
logger.info(f"error with message generation: \n {error} \n")
return {"ok": False, "error": f"error with message generation: \n {error}","text":""}
return {"ok": False, "error": f"error with message generation: \n {error}", "text":""}


response = client.chat.completions.create(
Expand All @@ -64,5 +66,5 @@ def generateText(
logger.info(choice)

res = response.choices[0].message.content
return {"ok": True, "error": "","text":res}
return {"ok": True, "error": "", "text":res}