diff --git a/.gitignore b/.gitignore index 8db6aabd..d99d837f 100644 --- a/.gitignore +++ b/.gitignore @@ -115,3 +115,5 @@ models/*.h5 # csv files *.csv + +.DS_Store diff --git a/audio/.gitignore b/audio/.gitignore new file mode 100644 index 00000000..2d3a8e93 --- /dev/null +++ b/audio/.gitignore @@ -0,0 +1,116 @@ +*tfevents* +*weights*/ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +config.ini + +# C extensions +*.so + +#PyCharm +.idea/ + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +.static_storage/ +.media/ +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +input/ +weights/*.h5 +models/*.h5 + +.DS_Store diff --git a/audio/VGGish_Audioset_&_Audio_embedding_Tutorial.ipynb b/audio/VGGish_Audioset_&_Audio_embedding_Tutorial.ipynb new file mode 100644 index 00000000..0590dd70 --- /dev/null +++ b/audio/VGGish_Audioset_&_Audio_embedding_Tutorial.ipynb @@ -0,0 +1,950 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# VGGish Audio Embedding Collab" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This colab demonstrates how to extract the AudioSet embeddings, using a VGGish deep neural network (DNN)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Importing and Testing the VGGish System" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Based on the directions at: https://github.com/tensorflow/models/tree/master/research/audioset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python --version" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --upgrade pip\n", + "!pip install numpy scipy\n", + "!pip install resampy tensorflow-gpu six " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip list | grep tensorflow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo git clone https://github.com/google/youtube-8m.git" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check to see where are in the kernel's file system.\n", + "!pwd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Grab the VGGish model\n", + "!sudo curl -O https://storage.googleapis.com/audioset/vggish_model.ckpt\n", + "!sudo curl -O https://storage.googleapis.com/audioset/vggish_pca_params.npz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Make sure we got the model data.\n", + "!ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copy the source files to the current directory.\n", + "!sudo curl -O http://storage.googleapis.com/us_audioset/youtube_corpus/v1/features/features.tar.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo tar -xzf features.tar.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!git clone https://github.com/tensorflow/models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Make sure the source files got copied correctly.\n", + "!ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Verify the location of the AudioSet source files\n", + "%cd models/research/audioset\n", + "!ls" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Enabling GPU Device" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Install Docker\n", + "!sudo -S apt-get update" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo -S apt-get -y install \\\n", + " apt-transport-https \\\n", + " ca-certificates \\\n", + " curl \\\n", + " software-properties-common" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo add-apt-repository \\\n", + " 'deb [arch=amd64] https://download.docker.com/linux/ubuntu \\\n", + " $(lsb_release -cs) \\\n", + " 
stable'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo apt-get update" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo apt-get install docker \n", + "!sudo apt-get install -y docker.io\n", + "!pip install docker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo apt-get -f -y install" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo docker --version" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Install NVIDIA Drivers\n", + "!sudo apt-get install -y wget" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-repo-ubuntu1604_8.0.44-1_amd64.deb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo dpkg -i --force-confdef cuda-repo-ubuntu1604_8.0.44-1_amd64.deb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo apt-get update" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo apt-get -y install cuda" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo apt-get -f -y install" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo apt-get install -y nvidia-docker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo apt-get update" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!sudo nvidia-docker run --rm nvidia/cuda nvidia-smi\n", + "!/usr/bin/nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Audioset Embedding Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo curl -O https://storage.googleapis.com/us_audioset/youtube_corpus/v1/features/features.tar.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Unpack the Audioset Features\n", + "!sudo tar -xzf features.tar.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cd /home/yvradsmi/notebooks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cd youtube-8m" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo chmod -R 777 /home/yvradsmi/notebooks/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo rm -f 
readers.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile readers.py\n", + "# Copyright 2016 Google Inc. All Rights Reserved.\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS-IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "\n", + "\"\"\"Provides readers configured for different datasets.\"\"\"\n", + "\n", + "import tensorflow as tf\n", + "import utils\n", + "\n", + "from tensorflow import logging\n", + "def resize_axis(tensor, axis, new_size, fill_value=0):\n", + " \"\"\"Truncates or pads a tensor to new_size on on a given axis.\n", + " Truncate or extend tensor such that tensor.shape[axis] == new_size. If the\n", + " size increases, the padding will be performed at the end, using fill_value.\n", + " Args:\n", + " tensor: The tensor to be resized.\n", + " axis: An integer representing the dimension to be sliced.\n", + " new_size: An integer or 0d tensor representing the new value for\n", + " tensor.shape[axis].\n", + " fill_value: Value to use to fill any new entries in the tensor. Will be\n", + " cast to the type of tensor.\n", + " Returns:\n", + " The resized tensor.\n", + " \"\"\"\n", + " tensor = tf.convert_to_tensor(tensor)\n", + " shape = tf.unstack(tf.shape(tensor))\n", + "\n", + " pad_shape = shape[:]\n", + " pad_shape[axis] = tf.maximum(0, new_size - shape[axis])\n", + "\n", + " shape[axis] = tf.minimum(shape[axis], new_size)\n", + " shape = tf.stack(shape)\n", + "\n", + " resized = tf.concat([\n", + " tf.slice(tensor, tf.zeros_like(shape), shape),\n", + " tf.fill(tf.stack(pad_shape), tf.cast(fill_value, tensor.dtype))\n", + " ], axis)\n", + "\n", + " # Update shape.\n", + " new_shape = tensor.get_shape().as_list() # A copy is being made.\n", + " new_shape[axis] = new_size\n", + " resized.set_shape(new_shape)\n", + " return resized\n", + "\n", + "class BaseReader(object):\n", + " \"\"\"Inherit from this class when implementing new readers.\"\"\"\n", + "\n", + " def prepare_reader(self, unused_filename_queue):\n", + " \"\"\"Create a thread for generating prediction and label tensors.\"\"\"\n", + " raise NotImplementedError()\n", + "\n", + "\n", + "class YT8MAggregatedFeatureReader(BaseReader):\n", + " \"\"\"Reads TFRecords of pre-aggregated Examples.\n", + " The TFRecords must contain Examples with a sparse int64 'labels' feature and\n", + " a fixed length float32 feature, obtained from the features in 'feature_name'.\n", + " The float features are assumed to be an average of dequantized values.\n", + " \"\"\"\n", + "\n", + " def __init__(self,\n", + " num_classes=527,\n", + " feature_sizes=[1024, 128],\n", + " feature_names=[\"mean_rgb\", \"mean_audio\"]):\n", + " \"\"\"Construct a YT8MAggregatedFeatureReader.\n", + " Args:\n", + " num_classes: a positive integer for the number of classes.\n", + " feature_sizes: positive integer(s) for the feature dimensions as a list.\n", + " feature_names: the feature name(s) in the tensorflow record as a list.\n", + " \"\"\"\n", + 
"\n", + " assert len(feature_names) == len(feature_sizes), \\\n", + " \"length of feature_names (={}) != length of feature_sizes (={})\".format( \\\n", + " len(feature_names), len(feature_sizes))\n", + "\n", + " self.num_classes = num_classes\n", + " self.feature_sizes = feature_sizes\n", + " self.feature_names = feature_names\n", + "\n", + " def prepare_reader(self, filename_queue, batch_size=1024):\n", + " \"\"\"Creates a single reader thread for pre-aggregated YouTube 8M Examples.\n", + " Args:\n", + " filename_queue: A tensorflow queue of filename locations.\n", + " Returns:\n", + " A tuple of video indexes, features, labels, and padding data.\n", + " \"\"\"\n", + " reader = tf.TFRecordReader()\n", + " _, serialized_examples = reader.read_up_to(filename_queue, batch_size)\n", + "\n", + " tf.add_to_collection(\"serialized_examples\", serialized_examples)\n", + " return self.prepare_serialized_examples(serialized_examples)\n", + "\n", + " def prepare_serialized_examples(self, serialized_examples):\n", + " # set the mapping from the fields to data types in the proto\n", + " num_features = len(self.feature_names)\n", + " assert num_features > 0, \"self.feature_names is empty!\"\n", + " assert len(self.feature_names) == len(self.feature_sizes), \\\n", + " \"length of feature_names (={}) != length of feature_sizes (={})\".format( \\\n", + " len(self.feature_names), len(self.feature_sizes))\n", + "\n", + " feature_map = {\"video_id\": tf.FixedLenFeature([], tf.string),\n", + " \"labels\": tf.VarLenFeature(tf.int64)}\n", + " for feature_index in range(num_features):\n", + " feature_map[self.feature_names[feature_index]] = tf.FixedLenFeature(\n", + " [self.feature_sizes[feature_index]], tf.float32)\n", + "\n", + " features = tf.parse_example(serialized_examples, features=feature_map)\n", + " labels = tf.sparse_to_indicator(features[\"labels\"], self.num_classes)\n", + " labels.set_shape([None, self.num_classes])\n", + " concatenated_features = tf.concat([\n", + " features[feature_name] for feature_name in self.feature_names], 1)\n", + "\n", + " return features[\"video_id\"], concatenated_features, labels, tf.ones([tf.shape(serialized_examples)[0]])\n", + "\n", + "class YT8MFrameFeatureReader(BaseReader):\n", + " \"\"\"Reads TFRecords of SequenceExamples.\n", + " The TFRecords must contain SequenceExamples with the sparse in64 'labels'\n", + " context feature and a fixed length byte-quantized feature vector, obtained\n", + " from the features in 'feature_names'. 
The quantized features will be mapped\n", + " back into a range between min_quantized_value and max_quantized_value.\n", + " \"\"\"\n", + "\n", + " def __init__(self,\n", + " num_classes=527,\n", + " feature_sizes=[1024, 128],\n", + " feature_names=[\"rgb\", \"audio\"],\n", + " max_frames=300):\n", + " \"\"\"Construct a YT8MFrameFeatureReader.\n", + " Args:\n", + " num_classes: a positive integer for the number of classes.\n", + " feature_sizes: positive integer(s) for the feature dimensions as a list.\n", + " feature_names: the feature name(s) in the tensorflow record as a list.\n", + " max_frames: the maximum number of frames to process.\n", + " \"\"\"\n", + "\n", + " assert len(feature_names) == len(feature_sizes), \\\n", + " \"length of feature_names (={}) != length of feature_sizes (={})\".format( \\\n", + " len(feature_names), len(feature_sizes))\n", + "\n", + " self.num_classes = num_classes\n", + " self.feature_sizes = feature_sizes\n", + " self.feature_names = feature_names\n", + " self.max_frames = max_frames\n", + "\n", + " def get_video_matrix(self,\n", + " features,\n", + " feature_size,\n", + " max_frames,\n", + " max_quantized_value,\n", + " min_quantized_value):\n", + " \"\"\"Decodes features from an input string and quantizes it.\n", + " Args:\n", + " features: raw feature values\n", + " feature_size: length of each frame feature vector\n", + " max_frames: number of frames (rows) in the output feature_matrix\n", + " max_quantized_value: the maximum of the quantized value.\n", + " min_quantized_value: the minimum of the quantized value.\n", + " Returns:\n", + " feature_matrix: matrix of all frame-features\n", + " num_frames: number of frames in the sequence\n", + " \"\"\"\n", + " decoded_features = tf.reshape(\n", + " tf.cast(tf.decode_raw(features, tf.uint8), tf.float32),\n", + " [-1, feature_size])\n", + "\n", + " num_frames = tf.minimum(tf.shape(decoded_features)[0], max_frames)\n", + " feature_matrix = utils.Dequantize(decoded_features,\n", + " max_quantized_value,\n", + " min_quantized_value)\n", + " feature_matrix = resize_axis(feature_matrix, 0, max_frames)\n", + " return feature_matrix, num_frames\n", + "\n", + " def prepare_reader(self,\n", + " filename_queue,\n", + " max_quantized_value=2,\n", + " min_quantized_value=-2):\n", + " \"\"\"Creates a single reader thread for YouTube8M SequenceExamples.\n", + " Args:\n", + " filename_queue: A tensorflow queue of filename locations.\n", + " max_quantized_value: the maximum of the quantized value.\n", + " min_quantized_value: the minimum of the quantized value.\n", + " Returns:\n", + " A tuple of video indexes, video features, labels, and padding data.\n", + " \"\"\"\n", + " reader = tf.TFRecordReader()\n", + " _, serialized_example = reader.read(filename_queue)\n", + "\n", + " return self.prepare_serialized_examples(serialized_example,\n", + " max_quantized_value, min_quantized_value)\n", + "\n", + " def prepare_serialized_examples(self, serialized_example,\n", + " max_quantized_value=2, min_quantized_value=-2):\n", + "\n", + " contexts, features = tf.parse_single_sequence_example(\n", + " serialized_example,\n", + " context_features={\"video_id\": tf.FixedLenFeature(\n", + " [], tf.string),\n", + " \"labels\": tf.VarLenFeature(tf.int64)},\n", + " sequence_features={\n", + " feature_name : tf.FixedLenSequenceFeature([], dtype=tf.string)\n", + " for feature_name in self.feature_names\n", + " })\n", + "\n", + " # read ground truth labels\n", + " labels = (tf.cast(\n", + " tf.sparse_to_dense(contexts[\"labels\"].values, 
(self.num_classes,), 1,\n", + " validate_indices=False),\n", + " tf.bool))\n", + "\n", + " # loads (potentially) different types of features and concatenates them\n", + " num_features = len(self.feature_names)\n", + " assert num_features > 0, \"No feature selected: feature_names is empty!\"\n", + "\n", + " assert len(self.feature_names) == len(self.feature_sizes), \\\n", + " \"length of feature_names (={}) != length of feature_sizes (={})\".format( \\\n", + " len(self.feature_names), len(self.feature_sizes))\n", + "\n", + " num_frames = -1 # the number of frames in the video\n", + " feature_matrices = [None] * num_features # an array of different features\n", + " for feature_index in range(num_features):\n", + " feature_matrix, num_frames_in_this_feature = self.get_video_matrix(\n", + " features[self.feature_names[feature_index]],\n", + " self.feature_sizes[feature_index],\n", + " self.max_frames,\n", + " max_quantized_value,\n", + " min_quantized_value)\n", + " if num_frames == -1:\n", + " num_frames = num_frames_in_this_feature\n", + " else:\n", + " tf.assert_equal(num_frames, num_frames_in_this_feature)\n", + "\n", + " feature_matrices[feature_index] = feature_matrix\n", + "\n", + " # cap the number of frames at self.max_frames\n", + " num_frames = tf.minimum(num_frames, self.max_frames)\n", + "\n", + " # concatenate different features\n", + " video_matrix = tf.concat(feature_matrices, 1)\n", + "\n", + " # convert to batch format.\n", + " # TODO: Do proper batch reads to remove the IO bottleneck.\n", + " batch_video_ids = tf.expand_dims(contexts[\"video_id\"], 0)\n", + " batch_video_matrix = tf.expand_dims(video_matrix, 0)\n", + " batch_labels = tf.expand_dims(labels, 0)\n", + " batch_frames = tf.expand_dims(num_frames, 0)\n", + "\n", + " return batch_video_ids, batch_video_matrix, batch_labels, batch_frames" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cd .." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!git clone https://github.com/tensorflow/models.git" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cd models/research/audioset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!sudo chmod -R 777 audioset_v1_embeddings/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rm vggish_inference_demo.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile vggish_inference_demo.py\n", + "\n", + "# Copyright 2017 The TensorFlow Authors All Rights Reserved.\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "# ==============================================================================\n", + "\n", + "r\"\"\"A simple demonstration of running VGGish in inference mode.\n", + "\n", + "This is intended as a toy example that demonstrates how the various building\n", + "blocks (feature extraction, model definition and loading, postprocessing) work\n", + "together in an inference context.\n", + "\n", + "A WAV file (assumed to contain signed 16-bit PCM samples) is read in, converted\n", + "into log mel spectrogram examples, fed into VGGish, the raw embedding output is\n", + "whitened and quantized, and the postprocessed embeddings are optionally written\n", + "in a SequenceExample to a TFRecord file (using the same format as the embedding\n", + "features released in AudioSet).\n", + "\n", + "Usage:\n", + " # Run a WAV file through the model and print the embeddings. The model\n", + " # checkpoint is loaded from vggish_model.ckpt and the PCA parameters are\n", + " # loaded from vggish_pca_params.npz in the current directory.\n", + " $ python vggish_inference_demo.py --wav_file /path/to/a/wav/file\n", + "\n", + " # Run a WAV file through the model and also write the embeddings to\n", + " # a TFRecord file. The model checkpoint and PCA parameters are explicitly\n", + " # passed in as well.\n", + " $ python vggish_inference_demo.py --wav_file /path/to/a/wav/file \\\n", + " --tfrecord_file /path/to/tfrecord/file \\\n", + " --checkpoint /path/to/model/checkpoint \\\n", + " --pca_params /path/to/pca/params\n", + "\n", + " # Run a built-in input (a sine wav) through the model and print the\n", + " # embeddings. 
Associated model files are read from the current directory.\n", + " $ python vggish_inference_demo.py\n", + "\"\"\"\n", + "\n", + "from __future__ import print_function\n", + "\n", + "import numpy as np\n", + "from scipy.io import wavfile\n", + "import six\n", + "import tensorflow as tf\n", + "\n", + "import vggish_input\n", + "import vggish_params\n", + "import vggish_postprocess\n", + "import vggish_slim\n", + "\n", + "flags = tf.app.flags\n", + "\n", + "flags.DEFINE_string(\n", + " 'wav_file', None,\n", + " 'Path to a wav file. Should contain signed 16-bit PCM samples. '\n", + " 'If none is provided, a synthetic sound is used.')\n", + "\n", + "flags.DEFINE_string(\n", + " 'checkpoint', 'vggish_model.ckpt',\n", + " 'Path to the VGGish checkpoint file.')\n", + "\n", + "flags.DEFINE_string(\n", + " 'pca_params', 'vggish_pca_params.npz',\n", + " 'Path to the VGGish PCA parameters file.')\n", + "\n", + "flags.DEFINE_string(\n", + " 'tfrecord_file', None,\n", + " 'Path to a TFRecord file where embeddings will be written.')\n", + "\n", + "FLAGS = flags.FLAGS\n", + "\n", + "\n", + "def main(_):\n", + " # In this simple example, we run the examples from a single audio file through\n", + " # the model. If none is provided, we generate a synthetic input.\n", + " if FLAGS.wav_file:\n", + " wav_file = FLAGS.wav_file\n", + " else:\n", + " # Write a WAV of a sine wav into an in-memory file object.\n", + " num_secs = 5\n", + " freq = 1000\n", + " sr = 44100\n", + " t = np.linspace(0, num_secs, int(num_secs * sr))\n", + " x = np.sin(2 * np.pi * freq * t)\n", + " # Convert to signed 16-bit samples.\n", + " samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)\n", + " wav_file = six.BytesIO()\n", + " wavfile.write(wav_file, sr, samples)\n", + " wav_file.seek(0)\n", + " examples_batch = vggish_input.wavfile_to_examples(wav_file)\n", + " print(examples_batch)\n", + "\n", + " # Prepare a postprocessor to munge the model embeddings.\n", + " pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)\n", + "\n", + " # If needed, prepare a record writer to store the postprocessed embeddings.\n", + " writer = tf.python_io.TFRecordWriter(\n", + " FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None\n", + "\n", + " with tf.Graph().as_default(), tf.Session() as sess:\n", + " # Define the model in inference mode, load the checkpoint, and\n", + " # locate input and output tensors.\n", + " vggish_slim.define_vggish_slim(training=False)\n", + " vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)\n", + " features_tensor = sess.graph.get_tensor_by_name(\n", + " vggish_params.INPUT_TENSOR_NAME)\n", + " embedding_tensor = sess.graph.get_tensor_by_name(\n", + " vggish_params.OUTPUT_TENSOR_NAME)\n", + "\n", + " # Run inference and postprocessing.\n", + " [embedding_batch] = sess.run([embedding_tensor],\n", + " feed_dict={features_tensor: examples_batch})\n", + " print(embedding_batch)\n", + " postprocessed_batch = pproc.postprocess(embedding_batch)\n", + " print(postprocessed_batch)\n", + "\n", + " # Write the postprocessed embeddings as a SequenceExample, in a similar\n", + " # format as the features released in AudioSet. 
Each row of the batch of\n", + " # embeddings corresponds to roughly a second of audio (96 10ms frames), and\n", + " # the rows are written as a sequence of bytes-valued features, where each\n", + " # feature value contains the 128 bytes of the whitened quantized embedding.\n", + " seq_example = tf.train.SequenceExample(\n", + " context=tf.train.Features(feature={\n", + " 'video_id': tf.train.Feature(bytes_list=tf.train.BytesList(value=[wav_file.encode()]))\n", + " }),\n", + " feature_lists=tf.train.FeatureLists(\n", + " feature_list={\n", + " vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:\n", + " tf.train.FeatureList(\n", + " feature=[\n", + " tf.train.Feature(\n", + " bytes_list=tf.train.BytesList(\n", + " value=[embedding.tobytes()]))\n", + " for embedding in postprocessed_batch\n", + " ]\n", + " )\n", + " }\n", + " )\n", + " )\n", + " print(seq_example)\n", + " if writer:\n", + " writer.write(seq_example.SerializeToString())\n", + "\n", + " if writer:\n", + " writer.close()\n", + "\n", + "\n", + "if __name__ == '__main__':\n", + " tf.app.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cd /home/yvradsmi/notebooks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python youtube-8m/train.py --frame_features --model=LstmModel --feature_names=audio_embedding --feature_sizes=128 --train_data_pattern=audioset_v1_embeddings/bal_train/*.tfrecord --train_dir model_new/dir --start_new_model --base_learning_rate=0.001 --num_epochs=5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!ls /home/yvradsmi/notebooks/audioset_v1_embeddings/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python youtube-8m/eval.py --eval_data_pattern=audioset_v1_embeddings/eval/*.tfrecord --train_dir model_new/dir --run_once" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python youtube-8m/inference.py --output_file Bal_SamplePredictions.csv --input_data_pattern=audioset_v1_embeddings/bal_train/a*.tfrecord --train_dir model_new/dir --top_k=3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!cat notebooks/Bal_SamplePredictions.csv" + ] + } + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/audio/audiodetection.md b/audio/audiodetection.md new file mode 100644 index 00000000..ed4bb921 --- /dev/null +++ b/audio/audiodetection.md @@ -0,0 +1,251 @@ +# Audio Detection using Audioset & Audio Analysis + +## Active Learning + Audio Detection + +The detection of audio and classification of sounds are currently green field research areas with limited resources and best practices. The labeling of audio, augmentation and model generation are essential for building Computer Audio Detection projects. Recent strides in the space such as the development of large scale datasets with labeled sound clips (Audioset, Freesound) and the classification models around those datasets are providing the ML space best practice parity with common Computer Vision and image labeling techniques. 
In this section of Active Learning we will explore beneficial practices, and ML pipeline techniques around audio augmentation, audio labeling, & sound classification to automate the detection of sounds for Human annotators. To begin, we leverage the Audioset Dataset to build base line tensorflow models to detect sounds in our custom dataset. + +## Environment Setup + +Use a local machine or a Data Science VM with GPUs. Our team leveraged the [Azure Data Science Virtual Machine](https://azure.microsoft.com/en-us/services/virtual-machines/data-science-virtual-machines/) from the Azure Marketplace. Any reasonably recent version of these packages should work. TensorFlow should be at least version 1.0. We have tested with Python 2.7.6 and 3.4.3 on an Ubuntu-like system with NumPy v1.13.1, SciPy v0.19.1, resampy v0.1.5, TensorFlow v1.2.1, and Six v1.10.0. + + +**Azure Data Science VM (Linux)** + +* Python - 3.5.5 N/A +* Numpy - 1.14.5 +* Scipy - 1.1.0 +* Resampy - 0.2.1 +* Tensorflow-GPU - 1.10.0 +* Six - 1.11.0 + + +### Enabling GPU Device + +If you have a different GPU / OS please go to [official website](https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1710/x86_64/) and find the appropriate driver. + +This command can help with find information on your GPU: + +``` +sudo lshw -C display +``` + +First, install the latest version of Docker: + +``` +$ sudo apt-get update +$ sudo apt-get install \ + apt-transport-https \ + ca-certificates \ + curl \ + software-properties-common +$ curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - +$ sudo add-apt-repository \ + "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ + $(lsb_release -cs) \ + stable" +$ sudo apt-get update +$ sudo apt-get install docker-ce +``` + +To install CUDA drivers for Ubuntu 16.04 for NVIDIA Tesla k80: + +``` +$ wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-repo-ubuntu1604_8.0.44-1_amd64.deb +$ sudo dpkg -i cuda-repo-ubuntu1604_8.0.44-1_amd64.deb +$ sudo apt-get update +$ sudo apt-get -f install +$ sudo apt-get install cuda +``` + +nvidia-smi is NVIDIA's System Management Interface. It provides a command line utility that allows monitoring and management capabilities for NVIDIA devices. + +To install nvidia-docker and test nvidia-smi: + +``` +$ sudo apt-get install nvidia-docker2 +$ sudo nvidia-docker run --rm nvidia/cuda nvidia-smi +``` +> **NOTE**: A VM restart may be required for packages to be fully installed. + +Example output: + +``` +Wed Sep 26 21:17:23 2018 ++-----------------------------------------------------------------------------+ +| NVIDIA-SMI 410.48 Driver Version: 410.48 | +|-------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. 
| +|===============================+======================+======================| +| 0 Tesla K80 Off | 00006DE9:00:00.0 Off | 0 | +| N/A 40C P0 83W / 149W | 0MiB / 11441MiB | 1% Default | ++-------------------------------+----------------------+----------------------+ + ++-----------------------------------------------------------------------------+ +| Processes: GPU Memory | +| GPU PID Type Process name Usage | +|=============================================================================| +| No running processes found | ++-----------------------------------------------------------------------------+ +``` + + +### Environment Testing Set Up + +``` sh +# You can optionally install and test VGGish within a Python virtualenv, which +# is useful for isolating changes from the rest of your system. For example, you +# may have an existing version of some packages that you do not want to upgrade, +# or you want to try Python 3 instead of Python 2. If you decide to use a +# virtualenv, you can create one by running +# $ virtualenv vggish # For Python 2 +# or +# $ python3 -m venv vggish # For Python 3 +# and then enter the virtual environment by running +# $ source vggish/bin/activate # Assuming you use bash +# Leave the virtual environment at the end of the session by running +# $ deactivate +# Within the virtual environment, do not use 'sudo'. +# Upgrade pip first. +$ sudo python -m pip install --upgrade pip +# Install dependences. Resampy needs to be installed after NumPy and SciPy +# are already installed. +$ sudo pip install numpy scipy +$ sudo pip install resampy tensorflow six +# Clone TensorFlow models repo into a 'models' directory. +$ git clone https://github.com/tensorflow/models.git +$ cd models/research/audioset +# Download data files into same directory as code. +# These files are used for the VGG inferencing to convert audio wav files to a spectogram supported by the Audioset model we generate. +$ curl -O https://storage.googleapis.com/audioset/vggish_model.ckpt +$ curl -O https://storage.googleapis.com/audioset/vggish_pca_params.npz +# Installation ready, let's test it. +$ python vggish_smoke_test.py +# If we see "Looks Good To Me", then we're all set. +``` +> From + +## Custom Audio Conversion + +Here we will take a custom wav file and extract the audio feature embeddings through a VGGish inference. VGGish is the 128-dimension embeddings specifically found in Audioset which was trained on millions of youtube videos. A tensorflow record with the embeddings are outputted to a directory and you can use this to pass into your own model. For more on VGGish check out this [paper](https://arxiv.org/pdf/1609.09430.pdf). Here's what you need: + +- A WAV file (assumed to contain signed 16-bit PCM samples) + + * This wav file is converted into log mel spectrogram examples, fed into VGGish, the raw embedding output is whitened and quantized, and the postprocessed embeddings are optionally written in a SequenceExample to a TFRecord file (using the same format as the embedding features released in AudioSet). + * Size of the file - The VGG inference script Converts audio waveform into an array of examples for VGGish. So whichever size your training data set is based on. For audioset this is 10 seconds. + + - Input: data: np.array of either one dimension (mono) or two dimensions(multi-channel, with the outer dimension representing channels). Each sample is generally expected to lie in the range [-1.0, +1.0], although this is not required. 
+ + - Output: 3-D np.array of shape [num_examples, num_frames, num_bands] which represents a sequence of examples, each of which contains a patch of log mel spectrogram, covering num_frames frames of audio and num_bands mel frequency bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS. + +- Converting to .WAV 16-bit PCM Sample + - Audio Samples + - I am using 2 audio samples for gunshots provided by freesound.org + - Sample 1 - https://freesound.org/people/watupgroupie/sounds/36815/ + - Sample 2 - https://freesound.org/people/fastson/sounds/399065/ + - Converting WAV 16bit signed PCM + - [Online-Convert](https://www.online-convert.com) + - No changes in the sampling rate + - No Changes to the audio channel + - In advanced options select the following as your PCM format PCM 16bit signed Small Endian + - Sample 1 - https://rtwrt.blob.core.windows.net/post5-audioset/samples/sample1_16bit_PCM_signed_smallendian.wav + - Sample 2 - https://rtwrt.blob.core.windows.net/post5-audioset/samples/sample2_16bit_PCM_signed_smallendian.wav + + +| Clip | Converter | Channel | Sample Rate | Endian | PCM-16bit| Signed | +| -------- |:---------------:| ---------:|-----------: |---------| ---------| -------| +| Clip | Online-Converter| No-Change | No-Change | Small | Yes | Yes | +| Clip2 | Online-Converter| No-Change | No-Change | Small | Yes | Yes | + +> Upload to a cloud storage like Azure Blob & curl your files: +> `curl -O https://rtwrt.blob.core.windows.net/post5-audioset/samples/sample1_16bit_PCM_signed_smallendian.wav` + +## VGG Conversion Tester + +Use vggish_inference_demo.py to create a VGG analysis of your custom wave file in a tensor flow record. Specify the path to the wav file and the output path of where you want the tfrecord to be generated. + +`python vggish_inference_demo.py --wav_file clip2-02_16bit_PCM_signed_smallendian.wav \ + --tfrecord_file tfrecrods/new2 \ + --checkpoint /path/to/model/checkpoint \ + --pca_params /path/to/pca/params` + +> Due to an outdated version of audioset leveraging the video_id parameter, replace your file with the [vgg_inference_demo.py](vgg_inference_demo.py) we provided. We added a context property that appends a video_id property based on the name of the wav file inputted. + + +## Selection of model: + +* **Video-Level Models** + - `LogisticModel`: Linear projection of the output features into the label space, followed by a sigmoid function to convert logit values to probabilities. + - `MoeModel`: A per-class softmax distribution over a configurable number of logistic classifiers. One of the classifiers in the mixture is not trained, and always predicts 0. +* **Frame-Level Models** + - `LstmModel`: Processes the features for each frame using a multi-layered LSTM neural net. The final internal state of the LSTM is input to a video-level model for classification. Note that you will need to change the learning rate to 0.001 when using this model. + - `DbofModel`: Projects the features for each frame into a higher dimensional 'clustering' space, pools across frames in that space, and then uses a video-level model to classify the now aggregated features. + - `FrameLevelLogisticModel`: Equivalent to 'LogisticModel', but performs average-pooling on the fly over frame-level features rather than using pre-aggregated features. 
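+
+> Before moving on to the ontology and training, it can help to sanity-check the embedding TFRecords themselves - either the Audioset release or the record produced by `vggish_inference_demo.py` above. The snippet below is a minimal sketch, assuming TensorFlow 1.x and the `video_id` / `audio_embedding` layout described in this document; the record path is a hypothetical example.
+
+``` python
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+
+# Hypothetical path: the file written via --tfrecord_file in the VGG conversion step.
+record_path = "tfrecords/new2"
+
+for raw in tf.python_io.tf_record_iterator(record_path):
+    seq = tf.train.SequenceExample.FromString(raw)
+    video_id = seq.context.feature["video_id"].bytes_list.value[0]
+    frames = seq.feature_lists.feature_list["audio_embedding"].feature
+    # Each frame feature holds 128 bytes of whitened, quantized embedding (roughly 1 second of audio).
+    first = np.frombuffer(frames[0].bytes_list.value[0], dtype=np.uint8)
+    print(video_id, "frames:", len(frames), "embedding size:", first.shape[0])
+```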
+
+> From
+
+## Audioset Ontology
+
+```
+Firearm
+{
+    "id": "/m/032s66",
+    "name": "Gunshot, gunfire",
+    "description": "The sound of the discharge of a firearm, or multiple such discharges.",
+    "citation_uri": "http://en.wikipedia.org/wiki/Gunshot",
+    "positive_examples": ["youtu.be/--PG66A3lo4?start=80&end=90", "youtu.be/PKEhOxE-Ovs?start=130&end=140", "youtu.be/c9030y4sJo0?start=140&end=150", "youtu.be/6slrju_ar9U?start=290&end=300", "youtu.be/K1cnDXbkPu0?start=170&end=180", "youtu.be/AjIQf3HK_Vc?start=130&end=140", "youtu.be/klCJfirqUF8?start=30&end=40", "youtu.be/Ma65O2T_hN0?start=10&end=20", "youtu.be/-Ho5tDtuah0?start=50&end=60"],
+    "child_ids": ["/m/04zjc", "/m/02z32qm", "/m/0_1c", "/m/073cg4"],
+    "restrictions": []
+  },
+```
+> From
+
+## Dataset Ingestion & Sample Models
+
+1. Download the Audioset feature embeddings dataset as a tar file and unpack it.
+   - Download using `curl -O storage.googleapis.com/us_audioset/youtube_corpus/v1/features/features.tar.gz`
+   - Unpack using `tar -xzf features.tar.gz`
+2. Clone the youtube-8m repository - `git clone https://github.com/google/youtube-8m.git`
+> **Learnings**: Audioset uses an outdated version of the YouTube-8M model templates and requires a change in the readers.py file. Our team changed all instances of `id` to `video_id` and also edited the hard-coded number of classes from 3826 to 527 (the current number of Audioset classes). Refer to [readers.py](readers.py).
+
+## Building a Model
+
+### Train
+
+Now that you have the YouTube-8M model samples and the Audioset features to train on, let's train a frame-level model (`LstmModel`) on our audio embedding features. Run the command below, where:
+- `--train_data_pattern` is the path to the balanced_train TensorFlow records for the audioset embeddings
+- `--train_dir` is an arbitrary path to a directory where the model will be created
+- `--base_learning_rate` is set to 0.001 since we are using an LSTM Model
+- `--num_epochs` is an arbitrary number for how many times the model will be trained on the dataset. Ideally we would like a loss of about 0.01, so train the model long enough that the loss value gets reasonably close to this. *Try not to overfit the model.*
+
+`python youtube-8m/train.py --frame_features --model=LstmModel --feature_names=audio_embedding --feature_sizes=128 --train_data_pattern=audioset_v1_embeddings/bal_train/*.tfrecord --train_dir model_new/dir --start_new_model --base_learning_rate=0.001 --num_epochs=1500`
+#### Tracking Training
+* You can use **nohup** to write the console output to a text file and monitor the loss
+  * Use `nohup python youtube-8m/train.py --frame_features --model=LstmModel --feature_names=audio_embedding --feature_sizes=128 --train_data_pattern=audioset_v1_embeddings/bal_train/*.tfrecord --train_dir model_new/dir --start_new_model --base_learning_rate=0.001 --num_epochs=1500 &`
+  * Then open another terminal and monitor the loss using `tail -f nohup.out`
+* You can also use **TensorBoard** to monitor the model loss and other metrics using a UI.
+  * In the portal, add port `6006` as an inbound rule for the network security group that your VM is configured to use.
+  * Use `tensorboard --logdir=model_new --host=0.0.0.0` from a new terminal in the VM and navigate to TensorBoard by entering `:6006`
+
+### Evaluate
+
+You can now use the binaries for evaluating TensorFlow models on the YouTube-8M dataset with audioset embeddings.
Run this command once or for an arbitrary time: + +* `--train_data_pattern` is the path to the eval_train tensorflor wecords for the audtioset embeddings +* `--train_dir` is the path to the previousl created directory for your model + +`python youtube-8m/eval.py --eval_data_pattern=features/audioset_v1_embeddings/eval/*.tfrecord --train_dir model_new/dir` + +### Inference + +Now that you have a working model you can run 3 commands to evaluate how our model scores audio based on the audio embedding features. + +First we'll run an inference on our bal_train dataset. These are audio records that our model was built using. Run the command where: +- `--top_k` is the top 3 labels tagged by the model +- `--input_data_pattern` is a partition of some of the tfrecords for us to evaluate the model scores. In this example we use all tfrecords with a* +- `--output_file` is a user defined path to the output csv file the model generates with the scores. + +`python youtube-8m/inference.py --output_file Bal_SamplePredictions.csv --input_data_pattern=features/audioset_v1_embeddings/bal_train/a*.tfrecord --train_dir model_new/dir --top_k=3` + +With the output csv file use [Bal_Train_Segments.csv](http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv) to validate that the video files were correctly labeled. Repeat this inference step replacing the `input_data_pattern` with the Unbalance_Data and then with the output tfrecord that is created with a custom wav file. +> The Balance_Train dataset scores should be very accurate since we used this data to train the LstmModel. The Unbalnced_Train dataset will be a bit less accurate and your custom audio tfrecord for a user wav file will vary based on a variety of variables that will need to be further explored. diff --git a/audio/readers.py b/audio/readers.py new file mode 100644 index 00000000..ad9e2e1a --- /dev/null +++ b/audio/readers.py @@ -0,0 +1,269 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Provides readers configured for different datasets.""" + +import tensorflow as tf +import utils + +from tensorflow import logging +def resize_axis(tensor, axis, new_size, fill_value=0): + """Truncates or pads a tensor to new_size on on a given axis. + + Truncate or extend tensor such that tensor.shape[axis] == new_size. If the + size increases, the padding will be performed at the end, using fill_value. + + Args: + tensor: The tensor to be resized. + axis: An integer representing the dimension to be sliced. + new_size: An integer or 0d tensor representing the new value for + tensor.shape[axis]. + fill_value: Value to use to fill any new entries in the tensor. Will be + cast to the type of tensor. + + Returns: + The resized tensor. 
+ """ + tensor = tf.convert_to_tensor(tensor) + shape = tf.unstack(tf.shape(tensor)) + + pad_shape = shape[:] + pad_shape[axis] = tf.maximum(0, new_size - shape[axis]) + + shape[axis] = tf.minimum(shape[axis], new_size) + shape = tf.stack(shape) + + resized = tf.concat([ + tf.slice(tensor, tf.zeros_like(shape), shape), + tf.fill(tf.stack(pad_shape), tf.cast(fill_value, tensor.dtype)) + ], axis) + + # Update shape. + new_shape = tensor.get_shape().as_list() # A copy is being made. + new_shape[axis] = new_size + resized.set_shape(new_shape) + return resized + +class BaseReader(object): + """Inherit from this class when implementing new readers.""" + + def prepare_reader(self, unused_filename_queue): + """Create a thread for generating prediction and label tensors.""" + raise NotImplementedError() + + +class YT8MAggregatedFeatureReader(BaseReader): + """Reads TFRecords of pre-aggregated Examples. + + The TFRecords must contain Examples with a sparse int64 'labels' feature and + a fixed length float32 feature, obtained from the features in 'feature_name'. + The float features are assumed to be an average of dequantized values. + """ + + def __init__(self, + num_classes=527, + feature_sizes=[1024, 128], + feature_names=["mean_rgb", "mean_audio"]): + """Construct a YT8MAggregatedFeatureReader. + + Args: + num_classes: a positive integer for the number of classes. + feature_sizes: positive integer(s) for the feature dimensions as a list. + feature_names: the feature name(s) in the tensorflow record as a list. + """ + + assert len(feature_names) == len(feature_sizes), \ + "length of feature_names (={}) != length of feature_sizes (={})".format( \ + len(feature_names), len(feature_sizes)) + + self.num_classes = num_classes + self.feature_sizes = feature_sizes + self.feature_names = feature_names + + def prepare_reader(self, filename_queue, batch_size=1024): + """Creates a single reader thread for pre-aggregated YouTube 8M Examples. + + Args: + filename_queue: A tensorflow queue of filename locations. + + Returns: + A tuple of video indexes, features, labels, and padding data. + """ + reader = tf.TFRecordReader() + _, serialized_examples = reader.read_up_to(filename_queue, batch_size) + + tf.add_to_collection("serialized_examples", serialized_examples) + return self.prepare_serialized_examples(serialized_examples) + + def prepare_serialized_examples(self, serialized_examples): + # set the mapping from the fields to data types in the proto + num_features = len(self.feature_names) + assert num_features > 0, "self.feature_names is empty!" + assert len(self.feature_names) == len(self.feature_sizes), \ + "length of feature_names (={}) != length of feature_sizes (={})".format( \ + len(self.feature_names), len(self.feature_sizes)) + + feature_map = {"video_id": tf.FixedLenFeature([], tf.string), + "labels": tf.VarLenFeature(tf.int64)} + for feature_index in range(num_features): + feature_map[self.feature_names[feature_index]] = tf.FixedLenFeature( + [self.feature_sizes[feature_index]], tf.float32) + + features = tf.parse_example(serialized_examples, features=feature_map) + labels = tf.sparse_to_indicator(features["labels"], self.num_classes) + labels.set_shape([None, self.num_classes]) + concatenated_features = tf.concat([ + features[feature_name] for feature_name in self.feature_names], 1) + + return features["video_id"], concatenated_features, labels, tf.ones([tf.shape(serialized_examples)[0]]) + +class YT8MFrameFeatureReader(BaseReader): + """Reads TFRecords of SequenceExamples. 
+ + The TFRecords must contain SequenceExamples with the sparse in64 'labels' + context feature and a fixed length byte-quantized feature vector, obtained + from the features in 'feature_names'. The quantized features will be mapped + back into a range between min_quantized_value and max_quantized_value. + """ + + def __init__(self, + num_classes=527, + feature_sizes=[1024, 128], + feature_names=["rgb", "audio"], + max_frames=300): + """Construct a YT8MFrameFeatureReader. + + Args: + num_classes: a positive integer for the number of classes. + feature_sizes: positive integer(s) for the feature dimensions as a list. + feature_names: the feature name(s) in the tensorflow record as a list. + max_frames: the maximum number of frames to process. + """ + + assert len(feature_names) == len(feature_sizes), \ + "length of feature_names (={}) != length of feature_sizes (={})".format( \ + len(feature_names), len(feature_sizes)) + + self.num_classes = num_classes + self.feature_sizes = feature_sizes + self.feature_names = feature_names + self.max_frames = max_frames + + def get_video_matrix(self, + features, + feature_size, + max_frames, + max_quantized_value, + min_quantized_value): + """Decodes features from an input string and quantizes it. + + Args: + features: raw feature values + feature_size: length of each frame feature vector + max_frames: number of frames (rows) in the output feature_matrix + max_quantized_value: the maximum of the quantized value. + min_quantized_value: the minimum of the quantized value. + + Returns: + feature_matrix: matrix of all frame-features + num_frames: number of frames in the sequence + """ + decoded_features = tf.reshape( + tf.cast(tf.decode_raw(features, tf.uint8), tf.float32), + [-1, feature_size]) + + num_frames = tf.minimum(tf.shape(decoded_features)[0], max_frames) + feature_matrix = utils.Dequantize(decoded_features, + max_quantized_value, + min_quantized_value) + feature_matrix = resize_axis(feature_matrix, 0, max_frames) + return feature_matrix, num_frames + + def prepare_reader(self, + filename_queue, + max_quantized_value=2, + min_quantized_value=-2): + """Creates a single reader thread for YouTube8M SequenceExamples. + + Args: + filename_queue: A tensorflow queue of filename locations. + max_quantized_value: the maximum of the quantized value. + min_quantized_value: the minimum of the quantized value. + + Returns: + A tuple of video indexes, video features, labels, and padding data. + """ + reader = tf.TFRecordReader() + _, serialized_example = reader.read(filename_queue) + + return self.prepare_serialized_examples(serialized_example, + max_quantized_value, min_quantized_value) + + def prepare_serialized_examples(self, serialized_example, + max_quantized_value=2, min_quantized_value=-2): + + contexts, features = tf.parse_single_sequence_example( + serialized_example, + context_features={"video_id": tf.FixedLenFeature( + [], tf.string), + "labels": tf.VarLenFeature(tf.int64)}, + sequence_features={ + feature_name : tf.FixedLenSequenceFeature([], dtype=tf.string) + for feature_name in self.feature_names + }) + + # read ground truth labels + labels = (tf.cast( + tf.sparse_to_dense(contexts["labels"].values, (self.num_classes,), 1, + validate_indices=False), + tf.bool)) + + # loads (potentially) different types of features and concatenates them + num_features = len(self.feature_names) + assert num_features > 0, "No feature selected: feature_names is empty!" 
+ + assert len(self.feature_names) == len(self.feature_sizes), \ + "length of feature_names (={}) != length of feature_sizes (={})".format( \ + len(self.feature_names), len(self.feature_sizes)) + + num_frames = -1 # the number of frames in the video + feature_matrices = [None] * num_features # an array of different features + for feature_index in range(num_features): + feature_matrix, num_frames_in_this_feature = self.get_video_matrix( + features[self.feature_names[feature_index]], + self.feature_sizes[feature_index], + self.max_frames, + max_quantized_value, + min_quantized_value) + if num_frames == -1: + num_frames = num_frames_in_this_feature + else: + tf.assert_equal(num_frames, num_frames_in_this_feature) + + feature_matrices[feature_index] = feature_matrix + + # cap the number of frames at self.max_frames + num_frames = tf.minimum(num_frames, self.max_frames) + + # concatenate different features + video_matrix = tf.concat(feature_matrices, 1) + + # convert to batch format. + # TODO: Do proper batch reads to remove the IO bottleneck. + batch_video_ids = tf.expand_dims(contexts["video_id"], 0) + batch_video_matrix = tf.expand_dims(video_matrix, 0) + batch_labels = tf.expand_dims(labels, 0) + batch_frames = tf.expand_dims(num_frames, 0) + + return batch_video_ids, batch_video_matrix, batch_labels, batch_frames \ No newline at end of file diff --git a/audio/vgg_inference_demo.py b/audio/vgg_inference_demo.py new file mode 100644 index 00000000..d4ef64b3 --- /dev/null +++ b/audio/vgg_inference_demo.py @@ -0,0 +1,157 @@ +# Copyright 2017 The TensorFlow Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +r"""A simple demonstration of running VGGish in inference mode. + +This is intended as a toy example that demonstrates how the various building +blocks (feature extraction, model definition and loading, postprocessing) work +together in an inference context. + +A WAV file (assumed to contain signed 16-bit PCM samples) is read in, converted +into log mel spectrogram examples, fed into VGGish, the raw embedding output is +whitened and quantized, and the postprocessed embeddings are optionally written +in a SequenceExample to a TFRecord file (using the same format as the embedding +features released in AudioSet). + +Usage: + # Run a WAV file through the model and print the embeddings. The model + # checkpoint is loaded from vggish_model.ckpt and the PCA parameters are + # loaded from vggish_pca_params.npz in the current directory. + $ python vggish_inference_demo.py --wav_file /path/to/a/wav/file + + # Run a WAV file through the model and also write the embeddings to + # a TFRecord file. The model checkpoint and PCA parameters are explicitly + # passed in as well. 
+ $ python vggish_inference_demo.py --wav_file /path/to/a/wav/file \ + --tfrecord_file /path/to/tfrecord/file \ + --checkpoint /path/to/model/checkpoint \ + --pca_params /path/to/pca/params + + # Run a built-in input (a sine wav) through the model and print the + # embeddings. Associated model files are read from the current directory. + $ python vggish_inference_demo.py +""" + +from __future__ import print_function + +import numpy as np +from scipy.io import wavfile +import six +import tensorflow as tf + +import vggish_input +import vggish_params +import vggish_postprocess +import vggish_slim + +flags = tf.app.flags + +flags.DEFINE_string( + 'wav_file', None, + 'Path to a wav file. Should contain signed 16-bit PCM samples. ' + 'If none is provided, a synthetic sound is used.') + +flags.DEFINE_string( + 'checkpoint', 'vggish_model.ckpt', + 'Path to the VGGish checkpoint file.') + +flags.DEFINE_string( + 'pca_params', 'vggish_pca_params.npz', + 'Path to the VGGish PCA parameters file.') + +flags.DEFINE_string( + 'tfrecord_file', None, + 'Path to a TFRecord file where embeddings will be written.') + +FLAGS = flags.FLAGS + + +def main(_): + # In this simple example, we run the examples from a single audio file through + # the model. If none is provided, we generate a synthetic input. + if FLAGS.wav_file: + wav_file = FLAGS.wav_file + else: + # Write a WAV of a sine wav into an in-memory file object. + num_secs = 5 + freq = 1000 + sr = 44100 + t = np.linspace(0, num_secs, int(num_secs * sr)) + x = np.sin(2 * np.pi * freq * t) + # Convert to signed 16-bit samples. + samples = np.clip(x * 32768, -32768, 32767).astype(np.int16) + wav_file = six.BytesIO() + wavfile.write(wav_file, sr, samples) + wav_file.seek(0) + examples_batch = vggish_input.wavfile_to_examples(wav_file) + print(examples_batch) + + # Prepare a postprocessor to munge the model embeddings. + pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params) + + # If needed, prepare a record writer to store the postprocessed embeddings. + writer = tf.python_io.TFRecordWriter( + FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None + + with tf.Graph().as_default(), tf.Session() as sess: + # Define the model in inference mode, load the checkpoint, and + # locate input and output tensors. + vggish_slim.define_vggish_slim(training=False) + vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint) + features_tensor = sess.graph.get_tensor_by_name( + vggish_params.INPUT_TENSOR_NAME) + embedding_tensor = sess.graph.get_tensor_by_name( + vggish_params.OUTPUT_TENSOR_NAME) + + # Run inference and postprocessing. + [embedding_batch] = sess.run([embedding_tensor], + feed_dict={features_tensor: examples_batch}) + print(embedding_batch) + postprocessed_batch = pproc.postprocess(embedding_batch) + print(postprocessed_batch) + + # Write the postprocessed embeddings as a SequenceExample, in a similar + # format as the features released in AudioSet. Each row of the batch of + # embeddings corresponds to roughly a second of audio (96 10ms frames), and + # the rows are written as a sequence of bytes-valued features, where each + # feature value contains the 128 bytes of the whitened quantized embedding. 
+ seq_example = tf.train.SequenceExample( + context=tf.train.Features(feature={ + 'video_id': tf.train.Feature(bytes_list=tf.train.BytesList(value=[wav_file.encode()])) + }), + feature_lists=tf.train.FeatureLists( + feature_list={ + vggish_params.AUDIO_EMBEDDING_FEATURE_NAME: + tf.train.FeatureList( + feature=[ + tf.train.Feature( + bytes_list=tf.train.BytesList( + value=[embedding.tobytes()])) + for embedding in postprocessed_batch + ] + ) + } + ) + ) + print(seq_example) + if writer: + writer.write(seq_example.SerializeToString()) + + if writer: + writer.close() + + +if __name__ == '__main__': + tf.app.run()