
Commit 17f8a39

Author: Kyu-Young
Commit message: init
0 parents  commit 17f8a39

File tree

4 files changed: +460 -0 lines changed


input_fn.py

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
"""Input pipeline using the Dataset API."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from absl import flags

import tensorflow as tf
import tensorflow_datasets as tfds

FLAGS = flags.FLAGS

flags.DEFINE_integer('shuffle_buffer_size', 5000, 'Size of the shuffle buffer.')


class InputDataset(object):
  """Input pipeline for the IMDB dataset.

  Attributes:
    tokenizer: Tokenizer used to encode and decode text.
  """

  def __init__(self, encoding, max_length=None):
    """Creates an InputDataset instance.

    Args:
      encoding: Type of encoding to use. Should be one of 'plain_text',
        'bytes', 'subwords8k', or 'subwords32k'.
      max_length: Optional maximum sequence length; longer examples are
        filtered out.
    """
    if encoding not in ('plain_text', 'bytes', 'subwords8k', 'subwords32k'):
      raise ValueError('Unsupported encoding type %s' % encoding)

    loaded_imdb = tfds.load(
        'imdb_reviews/{}'.format(encoding), with_info=True, as_supervised=True)
    self._dataset, self._info = loaded_imdb
    self.tokenizer = self._info.features['text'].encoder
    self.max_length = max_length

  def input_fn(self, mode, batch_size, bucket_boundaries=None, bow=False):
    """Returns an instance of tf.data.Dataset.

    Args:
      mode: One of 'train' or 'test'.
      batch_size: Size of a batch.
      bucket_boundaries: List of boundaries for bucketing.
      bow: True to process the input as a bag-of-words.
    """
    if mode not in ('train', 'test'):
      raise ValueError('Unsupported mode type %s' % mode)
    dataset = self._dataset[mode]

    # Transform into a bag-of-words input if applicable.
    def bag_of_words(tokens, label):
      # Scatter a count of 1 for each token id into a vocab-sized vector.
      indices = tf.expand_dims(tokens, axis=-1)
      updates = tf.ones([tf.shape(indices)[0]])
      shape = tf.constant([self.tokenizer.vocab_size], dtype=indices.dtype)
      scatter = tf.scatter_nd(indices, updates, shape)
      return scatter, label
    if bow:
      dataset = dataset.map(bag_of_words, num_parallel_calls=12)

    # Filter out overlong examples and shuffle the data.
    if self.max_length:
      dataset = dataset.filter(lambda f, l: tf.shape(f)[0] < self.max_length)
    dataset = dataset.shuffle(
        buffer_size=FLAGS.shuffle_buffer_size, reshuffle_each_iteration=True)

    # Create batches of examples and pad.
    if mode == 'train' and bucket_boundaries:
      bucket_batch_sizes = [batch_size] * (len(bucket_boundaries) + 1)
      dataset = dataset.apply(
          tf.data.experimental.bucket_by_sequence_length(
              lambda feature, label: tf.shape(feature)[0],
              bucket_boundaries=bucket_boundaries,
              bucket_batch_sizes=bucket_batch_sizes,
              padded_shapes=dataset.output_shapes))
    else:
      output_shapes = dataset.output_shapes
      if self.max_length:
        output_shapes = (tf.TensorShape([tf.Dimension(self.max_length)]),
                         tf.TensorShape([]))
      dataset = dataset.padded_batch(batch_size, output_shapes)

    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return dataset
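Note (not part of the commit): a minimal sketch of how this pipeline could be exercised, assuming TensorFlow 1.x with eager execution, tensorflow_datasets installed, and that the IMDB data can be downloaded on first use. The max_length and bucket boundary values are illustrative, and FLAGS is parsed manually so the shuffle_buffer_size flag default is readable outside absl's app.run.

import tensorflow as tf
from absl import flags

import input_fn

tf.enable_eager_execution()
flags.FLAGS(['example'])  # Parse flag defaults so FLAGS.shuffle_buffer_size is accessible.

dataset = input_fn.InputDataset('subwords8k', max_length=500)
train = dataset.input_fn('train', batch_size=16, bucket_boundaries=[100, 200, 400])
for features, labels in train.take(1):
  # features: [batch, padded_length] int token ids; labels: [batch] 0/1 sentiment.
  print(features.shape, labels.shape)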

knn.py

Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
"""Sentiment analysis using KNN."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import heapq
import random
import time

from absl import app
from absl import flags

FLAGS = flags.FLAGS

flags.DEFINE_enum('mode', 'knn', ['knn', 'analyze'], 'Execution mode.')

flags.DEFINE_string('train_data', None, 'Train file in LIBSVM format.')

flags.DEFINE_string('test_data', None, 'Test file in LIBSVM format.')

flags.DEFINE_integer('k_value', 0, 'Value of k (number of neighbors).')


def parse_libsvm_file(filename):
  """Parses a file in LIBSVM format."""
  # Each data point holds its sparse features, squared norm, and label.
  data_points = []
  with open(filename) as f:
    for line in f:
      line = line.split()
      assert len(line) > 1
      d = {'features': {}, 'norm': 0.0, 'label': int(line[0])}
      for bow in line[1:]:
        word_id, num_occur = bow.split(':')
        num_occur = float(num_occur)
        d['features'][word_id] = num_occur
        d['norm'] += num_occur ** 2
      data_points.append(d)
  return data_points


def l2_dist(d1, d2):
  """Squared L2 distance between two sparse vectors represented as dicts."""
  # Iterate over the smaller feature dict; only shared keys contribute to the
  # dot product.
  if len(d1['features']) < len(d2['features']):
    return l2_dist(d2, d1)
  d1_norm, d2_norm = d1['norm'], d2['norm']
  return (d1_norm + d2_norm - 2 * sum(
      d1['features'].get(key, 0.0) * d2['features'].get(key, 0.0)
      for key in d2['features'].keys()))


def find_knn(data_points, d, k):
  """Finds the k nearest data points."""
  # Max-heap on distance (stored negated). The index breaks ties so that the
  # data-point dicts themselves are never compared.
  neighbors = []
  for i, data_point in enumerate(data_points):
    l2d = l2_dist(data_point, d)
    if len(neighbors) < k:
      heapq.heappush(neighbors, (-l2d, i, data_point))
    else:
      heapq.heappushpop(neighbors, (-l2d, i, data_point))
  return [item[2] for item in neighbors]


def run_knn(train_data_points, test_data_points, k):
  """Runs KNN and reports the overall error rates."""
  count = 0
  num_pos, num_neg = 0.0, 0.0
  num_pos_correct, num_neg_correct = 0.0, 0.0
  for test_data_point in test_data_points:
    count += 1
    if count % 1000 == 0:
      print('Processed {} examples.'.format(count))
    neighbors = find_knn(train_data_points, test_data_point, k)
    score = sum(neighbor['label'] for neighbor in neighbors)
    score /= float(len(neighbors))
    true_score = test_data_point['label']
    if true_score >= 7:
      num_pos += 1
      if score >= 7:
        num_pos_correct += 1
    if true_score <= 4:
      num_neg += 1
      if score <= 4:
        num_neg_correct += 1

  pos_error_rate = 1.0 - num_pos_correct / (num_pos + 1e-8)
  neg_error_rate = 1.0 - num_neg_correct / (num_neg + 1e-8)
  tot_error_rate = (
      1.0 - (num_pos_correct + num_neg_correct) / (num_pos + num_neg + 1e-8))
  print('Pos error rate: {}'.format(round(pos_error_rate, 5)))
  print('Neg error rate: {}'.format(round(neg_error_rate, 5)))
  print('Tot error rate: {}'.format(round(tot_error_rate, 5)))


def run_analysis(data_points):
  """Analyzes input data."""
  num_unique_words_dict = collections.defaultdict(int)
  num_total_words_dict = collections.defaultdict(int)
  for d in data_points:
    num_unique_words_dict[len(d['features']) // 100] += 1
    num_words = sum(d['features'].values())
    num_total_words_dict[num_words // 100] += 1
  num_total = float(len(data_points))
  # Averages are over the bucketed counts, i.e. in units of hundreds of words.
  avg_unique_words = (
      sum(k * v for k, v in num_unique_words_dict.items()) / num_total)
  avg_total_words = (
      sum(k * v for k, v in num_total_words_dict.items()) / num_total)
  print('Dist of unique words count: {}'.format(num_unique_words_dict))
  print('Dist of total words count: {}'.format(num_total_words_dict))
  print('Avg unique words (hundreds): {}'.format(round(avg_unique_words, 3)))
  print('Avg total words (hundreds): {}'.format(round(avg_total_words, 3)))


def main(unused_argv):
  print('Start parsing input data...')
  train_data_points = parse_libsvm_file(FLAGS.train_data)
  test_data_points = parse_libsvm_file(FLAGS.test_data)
  if FLAGS.mode == 'knn':
    random.shuffle(train_data_points)
    random.shuffle(test_data_points)
    print('Start running knn...')
    start = time.time()
    run_knn(train_data_points, test_data_points, FLAGS.k_value)
    end = time.time()
    print('Run time: {} secs'.format(round(end - start, 2)))
  elif FLAGS.mode == 'analyze':
    print('Analyze train data:')
    run_analysis(train_data_points)
    print('Analyze test data:')
    run_analysis(test_data_points)


if __name__ == '__main__':
  app.run(main)
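Note (not part of the commit): parse_libsvm_file expects standard LIBSVM-style lines, i.e. a label followed by sparse index:value pairs. A small self-contained sketch on hypothetical toy data, showing the parser and the squared-distance helper:

import os
import tempfile

import knn

# Two toy reviews: '10 3:2 7:1' means rating 10, word 3 occurs twice, word 7 once.
with tempfile.NamedTemporaryFile('w', suffix='.libsvm', delete=False) as f:
  f.write('10 3:2 7:1\n2 3:1 9:4\n')
  path = f.name

points = knn.parse_libsvm_file(path)
# Squared L2 distance between the two points: (2-1)^2 + (1-0)^2 + (0-4)^2 = 18.
print(knn.l2_dist(points[0], points[1]))
# The nearest neighbor of the first point is itself (distance 0).
print(knn.find_knn(points, points[0], k=1)[0]['label'])  # -> 10
os.remove(path)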

main.py

Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
"""Main to run TensorFlow models."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

from absl import flags

import tensorflow as tf

from . import input_fn
from . import model
from . import util

FLAGS = flags.FLAGS

flags.DEFINE_enum('mode', None, ['train', 'eval'], 'Execution mode.')

flags.DEFINE_string('logdir', '/tmp/sentiment-analysis', 'Model directory.')

flags.DEFINE_enum('model', 'rnn', ['mlp', 'rnn'], 'Type of model to use.')

flags.DEFINE_enum('optimizer', 'adam',
                  ['sgd', 'rmsprop', 'adam'],
                  'Type of optimizer to use for training.')

flags.DEFINE_enum('encoding', 'subwords8k',
                  ['plain_text', 'bytes', 'subwords8k', 'subwords32k'],
                  'Type of text encoding to use.')

flags.DEFINE_integer('num_epochs', 10, 'Number of epochs to run for training.')

flags.DEFINE_integer('num_layers', 1, 'Number of hidden layers.')

flags.DEFINE_list('num_units', [64], 'Number of hidden units per layer.')

flags.DEFINE_enum('cell_type', 'lstm',
                  ['gru', 'lstm', 'bidi-gru', 'bidi-lstm'],
                  'Type of RNN cell to use.')

flags.DEFINE_integer('embedding_size', 32, 'Size of the input embedding.')

flags.DEFINE_integer('batch_size', 16, 'Size of the batch.')

flags.DEFINE_bool('verbose', True, 'Verbosity.')

flags.DEFINE_integer('max_length', None, 'Maximum length input to train on.')

flags.DEFINE_bool('early_stop', False, 'True to stop training early.')


def create_model(vocab_size):
  """Creates a Keras model."""
  num_units = [int(num_unit) for num_unit in FLAGS.num_units]
  if FLAGS.model == 'rnn':
    new_model = model.rnn_model(FLAGS.num_layers, FLAGS.cell_type, num_units,
                                vocab_size, FLAGS.embedding_size)
  else:
    new_model = model.mlp_model(FLAGS.num_layers, num_units, vocab_size)
  new_model.compile(optimizer=FLAGS.optimizer, loss='binary_crossentropy',
                    metrics=['accuracy'])
  new_model.summary()
  return new_model


def run_train():
  """Trains a model."""
  # Set up the input pipeline.
  input_dataset = input_fn.InputDataset(FLAGS.encoding)
  tokenizer = input_dataset.tokenizer

  use_bow = (FLAGS.model == 'mlp')
  train_dataset = input_dataset.input_fn('train', FLAGS.batch_size, bow=use_bow)
  test_dataset = input_dataset.input_fn('test', 10, bow=use_bow)

  new_model = create_model(tokenizer.vocab_size)
  latest_checkpoint = tf.train.latest_checkpoint(FLAGS.logdir)
  if latest_checkpoint:
    print('Reloading from {}'.format(latest_checkpoint))
    new_model.load_weights(latest_checkpoint)

  # Define callbacks to run during training.
  callbacks = []

  checkpoint = util.CNSModelCheckpoint(os.path.join(FLAGS.logdir, FLAGS.model))
  callbacks.append(checkpoint)

  tensorboard = tf.keras.callbacks.TensorBoard(
      log_dir=FLAGS.logdir, update_freq='batch')
  callbacks.append(tensorboard)

  if FLAGS.early_stop:
    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy', min_delta=0.0001, patience=10)
    callbacks.append(early_stop)

  # Start training.
  history = new_model.fit(train_dataset, epochs=FLAGS.num_epochs,
                          callbacks=callbacks,
                          validation_data=test_dataset,
                          validation_steps=25,
                          verbose=int(FLAGS.verbose))

  # Write out the training history under the model directory.
  if not tf.gfile.Exists(FLAGS.logdir):
    tf.gfile.MakeDirs(FLAGS.logdir)
  with tf.gfile.GFile(os.path.join(FLAGS.logdir, 'history.txt'), 'w') as f:
    f.write(str(history.history))


def run_eval():
  """Evaluates a model."""
  # Set up the input pipeline.
  input_dataset = input_fn.InputDataset(FLAGS.encoding)
  tokenizer = input_dataset.tokenizer

  use_bow = (FLAGS.model == 'mlp')
  dataset = input_dataset.input_fn('test', FLAGS.batch_size, bow=use_bow)

  new_model = create_model(tokenizer.vocab_size)
  latest_checkpoint = tf.train.latest_checkpoint(FLAGS.logdir)
  if latest_checkpoint:
    print('Reloading from {}'.format(latest_checkpoint))
    new_model.load_weights(latest_checkpoint)

  ret = new_model.evaluate(dataset)
  print('Eval results: {}'.format(ret))


def main(unused_argv):
  if FLAGS.mode == 'train':
    run_train()
  elif FLAGS.mode == 'eval':
    run_eval()


if __name__ == '__main__':
  tf.app.run(main)
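Note (not part of the commit): main.py uses relative imports (from . import input_fn), so it is intended to be run as a module from the directory containing the package. The package name below (sentiment) is an assumption, as is the presence of the model.py and util.py modules, which are referenced above but not shown in this excerpt. Example invocations under those assumptions:

# Train a bidirectional LSTM on the subword-8k encoding.
python -m sentiment.main --mode=train --model=rnn --cell_type=bidi-lstm \
    --num_layers=2 --num_units=64,32 --encoding=subwords8k \
    --batch_size=32 --logdir=/tmp/sentiment-analysis

# Evaluate the latest checkpoint in the log directory.
python -m sentiment.main --mode=eval --model=rnn --cell_type=bidi-lstm \
    --num_layers=2 --num_units=64,32 --encoding=subwords8k \
    --logdir=/tmp/sentiment-analysis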
