sentiment_audio.py
#!/usr/bin/env python3
import os
import pyaudio
import librosa
import wave
import numpy as np
from tensorflow.keras.models import model_from_json

# Default values for PyAudio
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
class Microphone():
    def __init__(self, rate=RATE, chunk=CHUNK):
        """
        Initialize all the values for using PyAudio
        correctly.
        """
        self.mic = None
        self.audio = None
        self.chunk = chunk
        self.rate = rate
        # We write the file into RAM instead of
        # wasting time writing it to disk.
        self.wav_file = "/dev/shm/tmp_out.wav"
        self.open()

    def open(self):
        """
        Opens the audio stream from the default device.
        """
        if self.audio is None:
            self.audio = pyaudio.PyAudio()
            # Open a stream from the default device
            self.mic = self.audio.open(format=FORMAT,
                                       channels=CHANNELS,
                                       rate=self.rate,
                                       input=True,
                                       frames_per_buffer=self.chunk)

    def close(self):
        """
        Handles cleaning up all the objects and streams
        when we are done with them, as well as the temp file
        used for getting the data into something librosa can
        read easily (a .wav file).
        """
        self.mic.close()
        self.audio.terminate()
        self.mic = None
        self.audio = None
        try:
            os.remove(self.wav_file)
        except OSError:
            pass

    def _mic_to_wav_file(self, seconds):
        """
        Converts a `seconds` long clip of audio from the
        default microphone to a .wav file for processing by
        librosa.
        """
        data = []
        if self.mic is not None:
            # https://www.youtube.com/watch?v=SlL7VYYaTGA
            for idx in range(int(self.rate / self.chunk * seconds)):
                data.append(self.mic.read(self.chunk))
            with wave.open(self.wav_file, "wb") as wf:
                wf.setnchannels(CHANNELS)
                wf.setsampwidth(self.audio.get_sample_size(FORMAT))
                wf.setframerate(self.rate)
                wf.writeframes(b''.join(data))
        else:
            print("Microphone not opened.")

    def get_sample(self, seconds):
        """
        Public function which returns an array of the last
        <seconds> samples at <framerate>.
        """
        self._mic_to_wav_file(seconds)
        data, framerate = librosa.load(self.wav_file)
        return (data, framerate)
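# A minimal usage sketch for Microphone (assumes a working default input
# device and a writable /dev/shm, e.g. on Linux):
#
#   mic = Microphone()
#   samples, sr = mic.get_sample(3)  # ~3 seconds of mono audio, loaded via librosa
#   mic.close()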
class AudioCNN():
    def __init__(self):
        """
        Initialize the microphone and any other needed objects.
        """
        self.mic = Microphone()

    def noise(self, data):
        noise_amp = 0.035 * np.random.uniform() * np.amax(data)
        data = data + noise_amp * np.random.normal(size=data.shape[0])
        return data

    def stretch(self, data):  ## rate = 0.8
        return librosa.effects.time_stretch(data, rate=0.8)

    def shift(self, data):
        shift_range = int(np.random.uniform(low=-5, high=5) * 1000)
        return np.roll(data, shift_range)

    def pitch(self, data, sampling_rate, pitch_factor=0.7):
        return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)

    def extract_features_vanilla(self, data, sample_rate):
        # ZCR
        result = np.array([])
        zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
        result = np.hstack((result, zcr))  # stacking horizontally
        # Chroma_stft
        stft = np.abs(librosa.stft(data))
        chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma_stft))  # stacking horizontally
        # MFCC
        mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mfcc))  # stacking horizontally
        # Root Mean Square Value
        rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
        result = np.hstack((result, rms))  # stacking horizontally
        # MelSpectrogram
        mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel))  # stacking horizontally
        return result
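    # With librosa's default settings the feature vector above has
    # 1 (ZCR) + 12 (chroma) + 20 (MFCC) + 1 (RMS) + 128 (mel bands) = 162 values,
    # which is the per-sample input size the CNN model is assumed to expect.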
    def get_features_vanilla_datafile(self, data, sample_rate):
        # Duration and offset would be used to skip the silence at the start and
        # end of each audio file when loading from disk.
        # without augmentation
        res1 = self.extract_features_vanilla(data, sample_rate)
        result = np.array(res1)
        # data with noise
        noise_data = self.noise(data)
        res2 = self.extract_features_vanilla(noise_data, sample_rate)
        result = np.vstack((result, res2))  # stacking vertically
        # data with stretching and pitching
        new_data = self.stretch(data)
        data_stretch_pitch = self.pitch(new_data, sample_rate)
        res3 = self.extract_features_vanilla(data_stretch_pitch, sample_rate)
        result = np.vstack((result, res3))  # stacking vertically
        return result
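    # With librosa defaults the result is a (3, 162) array: one feature row for
    # the raw clip, one for the noise-augmented clip, and one for the
    # stretched-and-pitched clip.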
    def emotion_score_from_sound(self, sound_data, sample_rate):
        ## input the sound data as a two channel numpy array with raw audio data
        ## this is a local file, so ensure that these parameters are changed if the filepath is modified
        abs_path = "C:/Users/noahv/OneDrive/NDSU Research/Coding Projects/ML 677 Project Testings/sentiment-analysis/"
        filename = abs_path + "CNN_audio_model_v1.h5"
        with open(abs_path + "CNN_audio_model_v1.json", "r") as json_file:
            model = model_from_json(json_file.read())
        model.load_weights(filename)
        ## emotion list data for output
        emotion_list = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fear', 'disgust', 'surprise']
        emotion_value = [5, 5, 10, 0, 3, 3, 3, 7]  ## happy = 10, sad = 0, neutral = calm = 5; disgust = angry = fear = 3; surprise = 7
        # sound_data = data[:,0]
        sound_features = self.get_features_vanilla_datafile(sound_data, sample_rate)
        emotion_pred = model.predict(sound_features)
        ## getting the total prediction from each of the output feature vectors
        reg_sound = emotion_pred[0]
        noise_sound = emotion_pred[1]
        stretch_sound = emotion_pred[2]
        total_sound = (reg_sound + noise_sound + stretch_sound) / 3
        ## emotion score of the entire array
        score = total_sound * emotion_value
        score = np.sum(score)
        ## getting the actual predicted emotion output (argmax avoids an IndexError
        ## when no single class probability clears a fixed threshold)
        loc_emotion = emotion_list[np.argmax(total_sound)]
        return score  ## can also return loc_emotion if desired
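    # Note: assuming the model's outputs are softmax probabilities summing to 1,
    # the weighted sum above lands on a roughly 0-10 scale (10 = happy, 0 = sad),
    # matching the emotion_value weights.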
    def close(self):
        """
        Cleanup any leftover objects like the microphone.
        """
        self.mic.close()

    def inference(self, recorded_sound, sampling_rate):
        """
        Run the actual inference engine.
        """
        sound_score = self.emotion_score_from_sound(recorded_sound, sampling_rate)
        return sound_score
# # C:\Users\noahv\OneDrive\NDSU Research\Coding Projects\ML 677 Project Testings\Speech Databases\RAVDESS\Audio_Speech_Actors_01-24
# abs_path = "C:/Users/noahv/OneDrive/NDSU Research/Coding Projects/ML 677 Project Testings/Speech Databases/RAVDESS/Audio_Speech_Actors_01-24/Actor_01/03-01-01-01-01-01-01.wav"
# test = AudioCNN()
# audio, sr = librosa.load(abs_path)
# score = test.inference(audio,sr)
# print(score)
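# Live-microphone usage sketch (not part of the original file-based test above):
# records a short clip from the default input device and scores it. Assumes the
# model files referenced in emotion_score_from_sound exist at the hard-coded
# path and that a microphone is available.
if __name__ == "__main__":
    cnn = AudioCNN()
    try:
        recorded_sound, sampling_rate = cnn.mic.get_sample(3)  # ~3 second clip
        print(cnn.inference(recorded_sound, sampling_rate))
    finally:
        cnn.close()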