# get_inputs.py
# Builds (input, target) word-index sequences from consecutive verse lines in
# fire_bars.csv using the vocabulary in rap_vocab.csv, and writes them to
# inputs.csv and targets.csv.
import numpy as np
import pandas as pd
import json
import _pickle as cPickle
# announce on stdout that the input-building script has started
print('make inputs!')
# make everything more 'hip hop'

# Characters that are deleted outright before any rewriting happens.
_RAP_STRIP = str.maketrans('', '', '()#:*+-,')

# Substring rewrites, applied one after another in exactly this order;
# each rule sees the output of the rules before it.
_RAP_REWRITES = (
    ('ing', "in'"),
    ('you all', "y'all"),
    ("do not", "do'nt"),
    ('that is', "that's"),
    ('’', ''),
    ("'ll", " will"),
    (' are', "'re"),
    ('somethin', "somethin'"),
    ('aaaah', 'aah'),
    ('aagh', 'aah'),
    ('skkrrrrrrttt', 'skrrrt'),
    ('skrrr', 'skrrrt'),
    ('skrrt', 'skrrrt'),
    ('skrt', 'skrrrt'),
)


def rap_izer(line):
    """Return *line* lower-cased, stripped of punctuation and 'rap-ified'.

    The characters ``( ) # : * + - ,`` are removed, then a fixed list of
    substring replacements is applied in order (e.g. ``ing`` -> ``in'``,
    ``you all`` -> ``y'all``).

    The rules are plain substring replacements, so they also fire inside
    longer words and on each other's output: "something" ends up as
    "somethin''" and "skrrrt" as "skrrrtt".

    NOTE(review): those results and the "do'nt" spelling look unintended,
    but they are reproduced as-is because the vocabulary file may have been
    built from the same output -- confirm before changing.
    """
    text = line.lower().translate(_RAP_STRIP)
    for old, new in _RAP_REWRITES:
        text = text.replace(old, new)
    return text
# read in csv of lines & convert to standard format:
df = pd.read_csv('fire_bars.csv')
# get rid of the empty lines in the dataset.
# read_csv parses empty fields as NaN rather than '', so missing values are
# dropped as well as empty strings; the rows are then renumbered 0..n-1 so
# that label-based and positional lookups on df agree.
df = df.dropna(subset=['verses'])
df = df.loc[df['verses'] != ''].reset_index(drop=True)
# open vocabulary:
vocab = pd.read_csv('rap_vocab.csv')
# remove the row if it has a frequency less than a threshold amount:
#thresh = 0
#vocab = vocab.loc[vocab['freqs'] >= thresh].reset_index(drop=True)
# turn vocab into a dictionary (lookup table):
# keys are the second column (as strings), values are the first column.
# .iloc is used because the .ix indexer was removed in pandas 1.0.
LUT = {
    str(word): index
    for index, word in zip(vocab.iloc[:, 0], vocab.iloc[:, 1])
}
# add the UNKnown words at the end of dictionary
# NOTE(review): the +2 offset is kept from the original; why it is 2 is not
# visible in this file -- confirm it cannot collide with a vocab index.
LUT['UNK'] = len(LUT) + 2
print('lookup table is the length of: ', len(LUT))
# save memory by deleting unneeded array
del vocab
def get_word_index(word, lut=None):
    '''
    Return the index number of a word, or the UNK index if it is unknown.

    word: the word to look up; it is converted with str() before the lookup.
    lut:  optional lookup table mapping word -> index, which must contain
          an 'UNK' entry. Defaults to the module-level LUT.

    Only a missing key falls back to UNK; any other error is raised
    instead of being silently swallowed.
    '''
    table = LUT if lut is None else lut
    try:
        # we have the word in vocab
        return table[str(word)]
    except KeyError:
        # it's not in vocabulary
        return table['UNK']
# for each "sse" tag, loop over each line and create (inpt,target) pairs, by index #
data = []   # [input, target] pairs; only used by the commented-out pickle dump
inpt = []   # input bars, each a list of word indices
outpt = []  # target bars (the bar following each input), as lists of word indices
song = -1   # counts songs; starts at -1 so the first 'sse' marker makes it 0

# a pair is kept only when both bars have between len_min and len_max
# words (inclusive)
len_min = 5
len_max = 20

# pull the column into a plain list so rows are addressed by position,
# whatever index labels df happens to carry
verses = df['verses'].tolist()

# walk over every row together with the row after it; the final row has no
# successor, so it can only ever appear as a target
for i, (current, following) in enumerate(zip(verses, verses[1:])):
    if current == 'sse':
        # 'sse' marks the start of a new song and is never used as an input
        song += 1
        continue
    if following == 'sse':
        # last bar of a song: there is no next bar to pair it with
        continue

    # clean up both bars, split them into words and convert each word into
    # its index number (get_word_index supplies the UNK index when needed)
    l1 = [get_word_index(word) for word in rap_izer(str(current)).split()]
    l2 = [get_word_index(word) for word in rap_izer(str(following)).split()]

    if len_min <= len(l1) <= len_max and len_min <= len(l2) <= len_max:
        # add these sequences to the dataset:
        data.append([l1, l2])
        inpt.append(l1)
        outpt.append(l2)

    if i % 100 == 0:
        # show the data every so often because, why not?
        # NOTE(review): the original nesting of this check was lost; here it
        # prints the current pair whether or not it passed the length filter.
        print('iteration %s\'s data looks like' % i)
        print('input bar\n', l1, '\n\noutput bar\n', l2)

# turn into input and output csvs
# to_csv returns None when given a path, so the frames are kept in
# inpts/targs and written in a separate step
inpts = pd.DataFrame(inpt)
inpts.to_csv('inputs.csv', index=False)
targs = pd.DataFrame(outpt)
targs.to_csv('targets.csv', index=False)
# save the 3d data set by pickling it:
#cPickle.dump( data, open( "filename.pkl", "wb" ) )