Update Word2Vec.py #74

Open · wants to merge 1 commit into master
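This commit ports Word2Vec.py from Python 2 / TensorFlow 1.x to Python 3 / TensorFlow 2.x: print statements become print() calls, raw_input becomes input, the dataset file is opened with an explicit encoding='utf-8', and the graph-era APIs (Session, placeholder, global_variables_initializer, GradientDescentOptimizer) move under tf.compat.v1, with tf.random_uniform and tf.truncated_normal renamed to their tf.random equivalents. Eager execution is disabled at import time so the existing session-based training loop keeps working; a minimal runnable sketch of that pattern follows the diff.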
31 changes: 16 additions & 15 deletions Word2Vec.py
@@ -1,4 +1,5 @@
 import tensorflow as tf
+tf.compat.v1.disable_eager_execution()
 import numpy as np
 import re
 from collections import Counter
@@ -25,7 +26,7 @@
 # into one huge string, and then uses a Counter to identify words
 # and the number of occurrences
 def processDataset(filename):
-    openedFile = open(filename, 'r')
+    openedFile = open(filename, 'r', encoding='utf-8')
     allLines = openedFile.readlines()
     myStr = ""
     for line in allLines:
@@ -41,7 +42,7 @@ def createTrainingMatrices(dictionary, corpus):
     yTrain=[]
     for i in range(numTotalWords):
         if i % 100000 == 0:
-            print 'Finished %d/%d total words' % (i, numTotalWords)
+            print ('Finished %d/%d total words' % (i, numTotalWords))
         wordsAfter = allWords[i + 1:i + windowSize + 1]
         wordsBefore = allWords[max(0, i - windowSize):i]
         wordsAdded = wordsAfter + wordsBefore
@@ -61,19 +62,19 @@ def getTrainingBatch():
 if (os.path.isfile('Word2VecXTrain.npy') and os.path.isfile('Word2VecYTrain.npy') and os.path.isfile('wordList.txt')):
     xTrain = np.load('Word2VecXTrain.npy')
     yTrain = np.load('Word2VecYTrain.npy')
-    print 'Finished loading training matrices'
+    print ('Finished loading training matrices')
     with open("wordList.txt", "rb") as fp:
         wordList = pickle.load(fp)
-    print 'Finished loading word list'
+    print ('Finished loading word list')

 else:
     fullCorpus, datasetDictionary = processDataset('conversationData.txt')
-    print 'Finished parsing and cleaning dataset'
+    print ('Finished parsing and cleaning dataset')
     wordList = list(datasetDictionary.keys())
-    createOwnVectors = raw_input('Do you want to create your own vectors through Word2Vec (y/n)?')
+    createOwnVectors = input('Do you want to create your own vectors through Word2Vec (y/n)?')
     if (createOwnVectors == 'y'):
         xTrain, yTrain = createTrainingMatrices(datasetDictionary, fullCorpus)
-        print 'Finished creating training matrices'
+        print ('Finished creating training matrices')
         np.save('Word2VecXTrain.npy', xTrain)
         np.save('Word2VecYTrain.npy', yTrain)
     else:
@@ -90,13 +91,13 @@ def getTrainingBatch():
 numTrainingExamples = len(xTrain)
 vocabSize = len(wordList)

-sess = tf.Session()
-embeddingMatrix = tf.Variable(tf.random_uniform([vocabSize, wordVecDimensions], -1.0, 1.0))
-nceWeights = tf.Variable(tf.truncated_normal([vocabSize, wordVecDimensions], stddev=1.0 / math.sqrt(wordVecDimensions)))
+sess = tf.compat.v1.Session()
+embeddingMatrix = tf.Variable(tf.random.uniform([vocabSize, wordVecDimensions], -1.0, 1.0))
+nceWeights = tf.Variable(tf.random.truncated_normal([vocabSize, wordVecDimensions], stddev=1.0 / math.sqrt(wordVecDimensions)))
 nceBiases = tf.Variable(tf.zeros([vocabSize]))

-inputs = tf.placeholder(tf.int32, shape=[batchSize])
-outputs = tf.placeholder(tf.int32, shape=[batchSize, 1])
+inputs = tf.compat.v1.placeholder(tf.int32, shape=[batchSize])
+outputs = tf.compat.v1.placeholder(tf.int32, shape=[batchSize, 1])

 embed = tf.nn.embedding_lookup(embeddingMatrix, inputs)

@@ -108,14 +109,14 @@ def getTrainingBatch():
                      num_sampled=numNegativeSample,
                      num_classes=vocabSize))

-optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)
+optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)

-sess.run(tf.global_variables_initializer())
+sess.run(tf.compat.v1.global_variables_initializer())
 for i in range(numIterations):
     trainInputs, trainLabels = getTrainingBatch()
     _, curLoss = sess.run([optimizer, loss], feed_dict={inputs: trainInputs, outputs: trainLabels})
     if (i % 10000 == 0):
         print ('Current loss is:', curLoss)
-print 'Saving the word embedding matrix'
+print ('Saving the word embedding matrix')
 embedMatrix = embeddingMatrix.eval(session=sess)
 np.save('embeddingMatrix.npy', embedMatrix)
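For reference, below is a minimal, self-contained sketch of the graph-mode pattern the patched file ends up using under TensorFlow 2.x. The vocabulary size, vector dimensions, and the toy batch are invented for illustration; only the tf.compat.v1 / tf.random / tf.nn calls mirror the diff.

    import math
    import numpy as np
    import tensorflow as tf

    # Keep TF 1.x graph/session semantics under TF 2.x, as the patch does.
    tf.compat.v1.disable_eager_execution()

    # Toy sizes, invented for illustration only.
    vocabSize = 100
    wordVecDimensions = 8
    batchSize = 4
    numNegativeSample = 2

    # Same variable setup as the patched file, at toy scale.
    embeddingMatrix = tf.Variable(tf.random.uniform([vocabSize, wordVecDimensions], -1.0, 1.0))
    nceWeights = tf.Variable(tf.random.truncated_normal(
        [vocabSize, wordVecDimensions], stddev=1.0 / math.sqrt(wordVecDimensions)))
    nceBiases = tf.Variable(tf.zeros([vocabSize]))

    inputs = tf.compat.v1.placeholder(tf.int32, shape=[batchSize])
    outputs = tf.compat.v1.placeholder(tf.int32, shape=[batchSize, 1])

    # Skip-gram with noise-contrastive estimation: look up the center word's
    # vector, then score it against the true context word plus sampled negatives.
    embed = tf.nn.embedding_lookup(embeddingMatrix, inputs)
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nceWeights,
                                         biases=nceBiases,
                                         labels=outputs,
                                         inputs=embed,
                                         num_sampled=numNegativeSample,
                                         num_classes=vocabSize))
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)

    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        batchInputs = np.array([1, 2, 3, 4], dtype=np.int32)          # toy center words
        batchLabels = np.array([[2], [3], [4], [5]], dtype=np.int32)  # toy context words
        _, curLoss = sess.run([optimizer, loss],
                              feed_dict={inputs: batchInputs, outputs: batchLabels})
        print('Current loss is:', curLoss)

Running a TF 1.x script this way is a stopgap rather than a full port: a deeper rewrite would replace the Session/placeholder plumbing with eager tensors or tf.function, but the compat shim keeps the original training loop intact with minimal edits, which is exactly the trade-off this one-commit change makes.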