Skip to content

Commit a012718

Browse files
authored
[speech-commands] Add option to collect raw audio waveform during collectExample() (tensorflow#195)
... in addition to spectrograms. - The option argument to `collectExample()` now has a new field called "includeRawAudio". If set to true, the collected example will include the raw audio waveform. Use caution when enabling this feature, as it will increase the memory consumption of the examples several-fold. - The demo page now has a new checkbox called "Include audio waveform" that demonstrates this new feature, along with playback of the audio waveforms through WebAudio. - Add `speechCommands.utils.playRawAudio()`.
1 parent cee11b3 commit a012718

12 files changed

+211
-37
lines changed

speech-commands/demo/dataset-vis.js

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,10 +113,11 @@ export class DatasetViz {
113113
* @param {SpectrogramData} spectrogram Optional spectrogram data.
114114
* If provided, will use it as is. If not provided, will use WebAudio
115115
* to collect an example.
116+
* @param {RawAudio} rawAudio Raw audio waveform. Optional
116117
* @param {string} uid UID of the example being drawn. Must match the UID
117118
* of the example from `this.transferRecognizer`.
118119
*/
119-
async drawExample(wordDiv, word, spectrogram, uid) {
120+
async drawExample(wordDiv, word, spectrogram, rawAudio, uid) {
120121
if (uid == null) {
121122
throw new Error('Error: UID is not provided for pre-existing example.');
122123
}
@@ -192,6 +193,17 @@ export class DatasetViz {
192193
keyFrameIndex: spectrogram.keyFrameIndex
193194
});
194195

196+
if (rawAudio != null) {
197+
const playButton = document.createElement('button');
198+
playButton.textContent = '▶️';
199+
playButton.addEventListener('click', () => {
200+
playButton.disabled = true;
201+
speechCommands.utils.playRawAudio(
202+
rawAudio, () => playButton.disabled = false);
203+
});
204+
wordDiv.appendChild(playButton);
205+
}
206+
195207
// Create Delete button.
196208
const deleteButton = document.createElement('button');
197209
deleteButton.textContent = 'X';
@@ -250,7 +262,8 @@ export class DatasetViz {
250262
}
251263

252264
const spectrogram = example.example.spectrogram;
253-
await this.drawExample(wordDiv, word, spectrogram, example.uid);
265+
await this.drawExample(
266+
wordDiv, word, spectrogram, example.example.rawAudio, example.uid);
254267
} else {
255268
removeNonFixedChildrenFromWordDiv(wordDiv);
256269
}

speech-commands/demo/index.html

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@
3131
<option value="1">Duration x1</option>
3232
<option value="2" selected="true">Duration x2</option>
3333
</select>
34+
35+
<input type="checkbox" id="include-audio-waveform">
36+
<span id="include-audio-waveform-label">Include audio waveform</span>
37+
3438
<button id="enter-learn-words" disabled="true">Enter transfer words</button>
3539

3640
<div id="transfer-learn-history"></div>

speech-commands/demo/index.js

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ const transferModelNameInput = document.getElementById('transfer-model-name');
5656
const learnWordsInput = document.getElementById('learn-words');
5757
const durationMultiplierSelect = document.getElementById('duration-multiplier');
5858
const enterLearnWordsButton = document.getElementById('enter-learn-words');
59+
const includeTimeDomainWaveformCheckbox =
60+
document.getElementById('include-audio-waveform');
5961
const collectButtonsDiv = document.getElementById('collect-words');
6062
const startTransferLearnButton =
6163
document.getElementById('start-transfer-learn');
@@ -278,18 +280,22 @@ function createWordDivs(transferWords) {
278280
}
279281
}
280282

283+
collectExampleOptions.includeRawAudio =
284+
includeTimeDomainWaveformCheckbox.checked;
281285
const spectrogram = await transferRecognizer.collectExample(
282286
word, collectExampleOptions);
283287

288+
284289
if (intervalJob != null) {
285290
clearInterval(intervalJob);
286291
}
287292
if (progressBar != null) {
288293
wordDiv.removeChild(progressBar);
289294
}
290295
const examples = transferRecognizer.getExamples(word)
291-
const exampleUID = examples[examples.length - 1].uid;
292-
await datasetViz.drawExample(wordDiv, word, spectrogram, exampleUID);
296+
const example = examples[examples.length - 1];
297+
await datasetViz.drawExample(
298+
wordDiv, word, spectrogram, example.example.rawAudio, example.uid);
293299
enableAllCollectWordButtons();
294300
});
295301
}

speech-commands/demo/style.css

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,3 +173,11 @@ textarea {
173173
input[type=checkbox] {
174174
transform: scale(2);
175175
}
176+
177+
#include-audio-waveform {
178+
margin-left: 20px;
179+
}
180+
181+
#include-audio-waveform-label {
182+
font-size: 17px;
183+
}

speech-commands/src/browser_fft_extractor.ts

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,12 @@
2020
*/
2121

2222
import * as tf from '@tensorflow/tfjs';
23+
2324
import {getAudioContextConstructor, getAudioMediaStream} from './browser_fft_utils';
2425
import {FeatureExtractor, RecognizerParams} from './types';
2526

26-
export type SpectrogramCallback = (x: tf.Tensor) => Promise<boolean>;
27+
export type SpectrogramCallback = (freqData: tf.Tensor, timeData?: tf.Tensor) =>
28+
Promise<boolean>;
2729

2830
/**
2931
* Configurations for constructing BrowserFftFeatureExtractor.
@@ -68,6 +70,14 @@ export interface BrowserFftFeatureExtractorConfig extends RecognizerParams {
6870
* will be taken every 600 ms.
6971
*/
7072
overlapFactor: number;
73+
74+
/**
75+
* Whether to collect the raw time-domain audio waveform in addition to the
76+
* spectrogram.
77+
*
78+
* Default: `false`.
79+
*/
80+
includeRawAudio?: boolean;
7181
}
7282

7383
/**
@@ -91,6 +101,7 @@ export class BrowserFftFeatureExtractor implements FeatureExtractor {
91101
// Overlapping factor: the ratio between the temporal spacing between
92102
// consecutive spectrograms and the length of each individual spectrogram.
93103
readonly overlapFactor: number;
104+
readonly includeRawAudio: boolean;
94105

95106
private readonly spectrogramCallback: SpectrogramCallback;
96107

@@ -101,7 +112,9 @@ export class BrowserFftFeatureExtractor implements FeatureExtractor {
101112
private analyser: AnalyserNode;
102113
private tracker: Tracker;
103114
private freqData: Float32Array;
115+
private timeData: Float32Array;
104116
private freqDataQueue: Float32Array[];
117+
private timeDataQueue: Float32Array[];
105118
// tslint:disable-next-line:no-any
106119
private frameIntervalTask: any;
107120
private frameDurationMillis: number;
@@ -144,6 +157,7 @@ export class BrowserFftFeatureExtractor implements FeatureExtractor {
144157
this.frameDurationMillis = this.fftSize / this.sampleRateHz * 1e3;
145158
this.columnTruncateLength = config.columnTruncateLength || this.fftSize;
146159
this.overlapFactor = config.overlapFactor;
160+
this.includeRawAudio = config.includeRawAudio;
147161

148162
tf.util.assert(
149163
this.overlapFactor >= 0 && this.overlapFactor < 1,
@@ -183,6 +197,10 @@ export class BrowserFftFeatureExtractor implements FeatureExtractor {
183197
// Reset the queue.
184198
this.freqDataQueue = [];
185199
this.freqData = new Float32Array(this.fftSize);
200+
if (this.includeRawAudio) {
201+
this.timeDataQueue = [];
202+
this.timeData = new Float32Array(this.fftSize);
203+
}
186204
const period =
187205
Math.max(1, Math.round(this.numFrames * (1 - this.overlapFactor)));
188206
this.tracker = new Tracker(
@@ -199,20 +217,31 @@ export class BrowserFftFeatureExtractor implements FeatureExtractor {
199217
}
200218

201219
this.freqDataQueue.push(this.freqData.slice(0, this.columnTruncateLength));
220+
if (this.includeRawAudio) {
221+
this.analyser.getFloatTimeDomainData(this.timeData);
222+
this.timeDataQueue.push(this.timeData.slice());
223+
}
202224
if (this.freqDataQueue.length > this.numFrames) {
203225
// Drop the oldest frame (least recent).
204226
this.freqDataQueue.shift();
205227
}
206228
const shouldFire = this.tracker.tick();
207229
if (shouldFire) {
208230
const freqData = flattenQueue(this.freqDataQueue);
209-
const inputTensor = getInputTensorFromFrequencyData(
231+
const freqDataTensor = getInputTensorFromFrequencyData(
210232
freqData, [1, this.numFrames, this.columnTruncateLength, 1]);
211-
const shouldRest = await this.spectrogramCallback(inputTensor);
233+
let timeDataTensor: tf.Tensor;
234+
if (this.includeRawAudio) {
235+
const timeData = flattenQueue(this.timeDataQueue);
236+
timeDataTensor = getInputTensorFromFrequencyData(
237+
timeData, [1, this.numFrames * this.fftSize]);
238+
}
239+
const shouldRest =
240+
await this.spectrogramCallback(freqDataTensor, timeDataTensor);
212241
if (shouldRest) {
213242
this.tracker.suppress();
214243
}
215-
inputTensor.dispose();
244+
tf.dispose([freqDataTensor, timeDataTensor]);
216245
}
217246
}
218247

speech-commands/src/browser_fft_recognizer.ts

Lines changed: 36 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,13 @@
1616
*/
1717

1818
import * as tf from '@tensorflow/tfjs';
19+
1920
import {BrowserFftFeatureExtractor, SpectrogramCallback} from './browser_fft_extractor';
2021
import {loadMetadataJson, normalize, normalizeFloat32Array} from './browser_fft_utils';
2122
import {BACKGROUND_NOISE_TAG, Dataset} from './dataset';
2223
import {concatenateFloat32Arrays} from './generic_utils';
2324
import {balancedTrainValSplit} from './training_utils';
24-
import {EvaluateConfig, EvaluateResult, Example, ExampleCollectionOptions, RecognizeConfig, RecognizerCallback, RecognizerParams, ROCCurve, SpectrogramData, SpeechCommandRecognizer, SpeechCommandRecognizerMetadata, SpeechCommandRecognizerResult, StreamingRecognitionConfig, TransferLearnConfig, TransferSpeechCommandRecognizer, AudioDataAugmentationOptions} from './types';
25+
import {AudioDataAugmentationOptions, EvaluateConfig, EvaluateResult, Example, ExampleCollectionOptions, RecognizeConfig, RecognizerCallback, RecognizerParams, ROCCurve, SpectrogramData, SpeechCommandRecognizer, SpeechCommandRecognizerMetadata, SpeechCommandRecognizerResult, StreamingRecognitionConfig, TransferLearnConfig, TransferSpeechCommandRecognizer} from './types';
2526
import {version} from './version';
2627

2728
export const UNKNOWN_TAG = '_unknown_';
@@ -206,7 +207,8 @@ export class BrowserFftSpeechCommandRecognizer implements
206207
() => `Expected overlapFactor to be >= 0 and < 1, but got ${
207208
overlapFactor}`);
208209

209-
const spectrogramCallback: SpectrogramCallback = async (x: tf.Tensor) => {
210+
const spectrogramCallback: SpectrogramCallback =
211+
async (x: tf.Tensor, timeData?: tf.Tensor) => {
210212
const normalizedX = normalize(x);
211213
let y: tf.Tensor;
212214
let embedding: tf.Tensor;
@@ -714,27 +716,33 @@ class TransferBrowserFftSpeechCommandRecognizer extends
714716
let lastIndex = -1;
715717
const spectrogramSnippets: Float32Array[] = [];
716718

717-
const spectrogramCallback: SpectrogramCallback = async (x: tf.Tensor) => {
719+
const spectrogramCallback: SpectrogramCallback =
720+
async (freqData: tf.Tensor, timeData?: tf.Tensor) => {
718721
// TODO(cais): can we consolidate the logic in the two branches?
719722
if (options.onSnippet == null) {
720-
const normalizedX = normalize(x);
723+
const normalizedX = normalize(freqData);
721724
this.dataset.addExample({
722725
label: word,
723726
spectrogram: {
724727
data: await normalizedX.data() as Float32Array,
725728
frameSize: this.nonBatchInputShape[1],
726-
}
729+
},
730+
rawAudio: options.includeRawAudio ? {
731+
data: await timeData.data() as Float32Array,
732+
sampleRateHz: this.audioDataExtractor.sampleRateHz
733+
} :
734+
undefined
727735
});
728736
normalizedX.dispose();
729737
await this.audioDataExtractor.stop();
730738
this.streaming = false;
731739
this.collateTransferWords();
732740
resolve({
733-
data: await x.data() as Float32Array,
741+
data: await freqData.data() as Float32Array,
734742
frameSize: this.nonBatchInputShape[1],
735743
});
736744
} else {
737-
const data = await x.data() as Float32Array;
745+
const data = await freqData.data() as Float32Array;
738746
if (lastIndex === -1) {
739747
lastIndex = data.length;
740748
}
@@ -763,8 +771,15 @@ class TransferBrowserFftSpeechCommandRecognizer extends
763771
data: normalized,
764772
frameSize: this.nonBatchInputShape[1]
765773
};
766-
this.dataset.addExample(
767-
{label: word, spectrogram: finalSpectrogram});
774+
this.dataset.addExample({
775+
label: word,
776+
spectrogram: finalSpectrogram,
777+
rawAudio: options.includeRawAudio ? {
778+
data: await timeData.data() as Float32Array,
779+
sampleRateHz: this.audioDataExtractor.sampleRateHz
780+
} :
781+
undefined
782+
});
768783
// TODO(cais): Fix 1-tensor memory leak.
769784
resolve(finalSpectrogram);
770785
}
@@ -777,7 +792,8 @@ class TransferBrowserFftSpeechCommandRecognizer extends
777792
columnTruncateLength: this.nonBatchInputShape[1],
778793
suppressionTimeMillis: 0,
779794
spectrogramCallback,
780-
overlapFactor
795+
overlapFactor,
796+
includeRawAudio: options.includeRawAudio
781797
});
782798
this.audioDataExtractor.start(options.audioTrackConstraints);
783799
});
@@ -910,11 +926,9 @@ class TransferBrowserFftSpeechCommandRecognizer extends
910926
const numFrames = this.nonBatchInputShape[0];
911927
windowHopRatio = windowHopRatio || DEFAULT_WINDOW_HOP_RATIO;
912928
const hopFrames = Math.round(windowHopRatio * numFrames);
913-
const out = this.dataset.getData(null, {
914-
numFrames,
915-
hopFrames,
916-
...augmentationOptions
917-
}) as {xs: tf.Tensor4D, ys?: tf.Tensor2D};
929+
const out = this.dataset.getData(
930+
null, {numFrames, hopFrames, ...augmentationOptions}) as
931+
{xs: tf.Tensor4D, ys?: tf.Tensor2D};
918932
return {xs: out.xs, ys: out.ys as tf.Tensor};
919933
}
920934

@@ -936,8 +950,8 @@ class TransferBrowserFftSpeechCommandRecognizer extends
936950
* `this.model.fitDataset`.
937951
*/
938952
private collectTransferDataAsTfDataset(
939-
windowHopRatio?: number, validationSplit = 0.15,
940-
batchSize = 32, augmentationOptions?: AudioDataAugmentationOptions):
953+
windowHopRatio?: number, validationSplit = 0.15, batchSize = 32,
954+
augmentationOptions?: AudioDataAugmentationOptions):
941955
[tf.data.Dataset<{}>, tf.data.Dataset<{}>] {
942956
const numFrames = this.nonBatchInputShape[0];
943957
windowHopRatio = windowHopRatio || DEFAULT_WINDOW_HOP_RATIO;
@@ -1037,9 +1051,8 @@ class TransferBrowserFftSpeechCommandRecognizer extends
10371051
const batchSize = config.batchSize == null ? 32 : config.batchSize;
10381052
const windowHopRatio = config.windowHopRatio || DEFAULT_WINDOW_HOP_RATIO;
10391053
const [trainDataset, valDataset] = this.collectTransferDataAsTfDataset(
1040-
windowHopRatio, config.validationSplit, batchSize, {
1041-
augmentByMixingNoiseRatio: config.augmentByMixingNoiseRatio
1042-
});
1054+
windowHopRatio, config.validationSplit, batchSize,
1055+
{augmentByMixingNoiseRatio: config.augmentByMixingNoiseRatio});
10431056
const t0 = tf.util.now();
10441057
const history = await this.model.fitDataset(trainDataset, {
10451058
epochs: config.epochs,
@@ -1067,9 +1080,9 @@ class TransferBrowserFftSpeechCommandRecognizer extends
10671080
Promise<tf.History|[tf.History, tf.History]> {
10681081
// Prepare the data.
10691082
const windowHopRatio = config.windowHopRatio || DEFAULT_WINDOW_HOP_RATIO;
1070-
const {xs, ys} = this.collectTransferDataAsTensors(windowHopRatio, {
1071-
augmentByMixingNoiseRatio: config.augmentByMixingNoiseRatio
1072-
});
1083+
const {xs, ys} = this.collectTransferDataAsTensors(
1084+
windowHopRatio,
1085+
{augmentByMixingNoiseRatio: config.augmentByMixingNoiseRatio});
10731086
console.log(
10741087
`Training data: xs.shape = ${xs.shape}, ys.shape = ${ys.shape}`);
10751088

0 commit comments

Comments
 (0)