-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathpreprocess.py
More file actions
72 lines (57 loc) · 2.05 KB
/
preprocess.py
File metadata and controls
72 lines (57 loc) · 2.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import json
import re

import numpy as np
import pandas as pd
# Load the raw lyrics table into `df`; the vocabulary pass reads its
# 'verses' column.
_BARS_CSV = 'fire_bars.csv'
df = pd.read_csv(_BARS_CSV)
# make everything more 'hip hop'

# Characters deleted outright before any word-level rewriting.
_RAP_STRIP_TABLE = str.maketrans('', '', '()#:*+-')
# Every spelling of the ad-lib ("skrt", "skrrt", "skrrr", "skkrrrrrrttt", ...)
# in one pattern, so an already-normalized "skrrrt" is left unchanged.
_RAP_SKRT_RE = re.compile(r"sk+(?:r+t+|r{3,})")


def rap_izer(line):
    """Normalize one line of lyrics into the 'rap' spelling used for the vocab.

    Steps, applied in this order:
      1. lowercase and delete the characters ``( ) # : * + -``;
      2. drop the final g of words ending in "ing" ("running" -> "runnin'");
      3. contract "you all", "do not" and "that is";
      4. delete curly apostrophes, expand "'ll" to " will", contract " are";
      5. give a bare "somethin" its trailing apostrophe;
      6. collapse the "aaaah"/"aagh" and "skrt" ad-lib spellings.

    Phrase rules are anchored at word boundaries so they do not rewrite
    the inside of unrelated words ("single", "area", "do nothing").

    Args:
        line: the text to normalize (a ``str``).

    Returns:
        The normalized text as a new ``str``.
    """
    line = line.lower().translate(_RAP_STRIP_TABLE)

    # Word-final "ing" only; a plain substring replace would turn
    # "single" into "sin'le".
    line = re.sub(r"ing\b", "in'", line)
    line = re.sub(r"\byou all\b", "y'all", line)
    line = re.sub(r"\bdo not\b", "don't", line)
    line = re.sub(r"\bthat is\b", "that's", line)

    # Curly apostrophes are removed here (after the rules above), which keeps
    # the original ordering of this step.
    line = line.replace('’', '')
    line = line.replace("'ll", " will")
    line = re.sub(r" are\b", "'re", line)

    # "something" already became "somethin'" above; the lookahead stops a
    # second apostrophe from being appended to it.
    line = re.sub(r"\bsomethin\b(?!')", "somethin'", line)

    line = line.replace('aaaah', 'aah').replace('aagh', 'aah')
    line = _RAP_SKRT_RE.sub('skrrrt', line)
    return line
# build vocabulary
# Collect distinct tokens in a set: duplicates are dropped as they arrive, so
# the vocabulary never has to be re-sorted and de-duplicated on every row.
# NOTE(review): an empty 'verses' cell is presumably read by pandas as NaN,
# which str() turns into the token 'nan' -- confirm whether that is wanted.
seen_words = set()
for i, verse in enumerate(df['verses']):
    seen_words.update(rap_izer(str(verse)).split())
    # show vocab len at every 100 lines:
    if i % 100 == 0:
        print('iteration %s' % i)
        print('vocabulary is currently: ' , len(seen_words) )
    # show the vocab at every 1000 lines (sorted, so the printout is stable)
    if i % 1000 == 0:
        print(sorted(seen_words))
# `voca` is the final vocabulary: a sorted list of unique tokens.
voca = sorted(seen_words)
# save vocab:
# The frame is written with its default integer index, so the CSV has an
# unnamed index column followed by the word column (header "0").
print('saving csv...')
vocab = pd.DataFrame(voca)
vocab.to_csv('rap_vocab.csv')
print('csv saved')
# Reload from disk. keep_default_na=False stops pandas from turning real
# vocabulary words such as 'nan', 'null' or 'n/a' into missing values.
vocab = pd.read_csv('rap_vocab.csv', keep_default_na=False)
# # convert vocab to dictionary LUT:
# LUT = { str(vocab.ix[i,1]) : vocab.ix[i,0] for i in range(len(vocab))}
# print(LUT)
# # get word frequencies:
# # setup new col in dataframe:
# vocab['freqs'] = np.zeros(len(vocab))
# for i in range(len(df)):
# line = rap_izer(str(df['verses'][i])).split()
# for j,word in enumerate(line):
# # add one to the word frequency
# vocab['freqs'][LUT[str(word)]] = vocab['freqs'][LUT[str(word)]] + 1
# if i % 100 == 0:
# print('iteration %s' % i)
# # save CSV with frequencies now
# print('overwriting csv...')
# vocab.to_csv('rap_vocab_freq.csv')
# print('csv saved')
# # for memory delete what is not needed
# del vocab
# del voca