[Genomics] Add op to calculate Phred Quality Scores (#620)

suyashkumar · yongtang · commit 3c64cf9d0f9a · 2019-11-10T20:54:35.000-08:00
* Add ops to convert phred quality scores

* Ensure dim of quality is set

* minor cleanup

* update API, lint cleanup

* rm newline

* lint

* cast to tf.int64

* Add eager mode tests for genome

* fix lint
diff --git a/tensorflow_io/core/ops/genome_ops.cc b/tensorflow_io/core/ops/genome_ops.cc
@@ -25,6 +25,7 @@ REGISTER_OP("IO>ReadFastq")
     .Output("raw_quality: string")
     .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
       c->set_output(0, c->MakeShape({c->UnknownDim()}));
+      c->set_output(1, c->MakeShape({c->UnknownDim()}));
       return Status::OK();
     });
 
diff --git a/tensorflow_io/core/python/api/v0/genome.py b/tensorflow_io/core/python/api/v0/genome.py
@@ -16,3 +16,4 @@
 
 from tensorflow_io.core.python.ops.genome_ops import read_fastq # pylint: disable=unused-import
 from tensorflow_io.core.python.ops.genome_ops import sequences_to_onehot # pylint: disable=unused-import
+from tensorflow_io.core.python.ops.genome_ops import phred_sequences_to_probability # pylint: disable=unused-import
diff --git a/tensorflow_io/core/python/ops/genome_ops.py b/tensorflow_io/core/python/ops/genome_ops.py
@@ -84,5 +84,56 @@ def sequences_to_onehot(sequences):
         sequence_splits.size(), global_nucleotide_idx)
   return tf.RaggedTensor.from_row_splits(
       values=all_onehot_nucleotides.stack(),
-      row_splits=sequence_splits.stack()
+      row_splits=tf.cast(sequence_splits.stack(), tf.int64)
   )
+
+
+@tf.function
+def _decode_byte_str(b_str):
+  """Decodes a string tensor as ASCII and casts the code points to float32."""
+  return tf.dtypes.cast(
+      tf.strings.unicode_decode(b_str, "ASCII"), dtype=tf.float32)
+
+
+@tf.function
+def _phred_byte_to_probability(phred_byte_str):
+  """Computes 10 ^ (-(X - 33) / 10) for the decoded ASCII value(s) X.
+
+  X is obtained by passing `phred_byte_str` through `_decode_byte_str`.
+  """
+  return tf.math.pow(
+      10.,
+      -(_decode_byte_str(phred_byte_str) - 33) / 10
+  )
+
+
+@tf.function
+def _phred_sequence_to_probability(seq_quality):
+  """Maps `_phred_byte_to_probability` over `seq_quality` via tf.map_fn.
+
+  The mapped output dtype is declared as tf.float32.
+  """
+  return tf.map_fn(_phred_byte_to_probability,
+                   seq_quality,
+                   dtype=tf.float32)
+
+
+@tf.function
+def phred_sequences_to_probability(phred_qualities):
+  """Converts raw phred quality scores into base-calling error probabilities.
+
+  For each ASCII encoded phred quality score (X), the probability that there
+  was an error calling that base is computed by:
+
+  P = 10 ^ (-(X - 33) / 10)
+
+  This is assuming an "ASCII base" of 33.
+
+  The input is a tf.string tensor of ASCII encoded phred qualities,
+  one string per DNA sequence, with each character representing the quality
+  of a nucleotide.
+
+  For example:
+  phred_qualities = [["BB<"], ["BBBB"]]
+
+  Args:
+    phred_qualities: A tf.string tensor where each string represents the phred
+                     quality of a DNA sequence. Each character in the string
+                     is the ASCII representation of the phred quality number.
+
+  Returns:
+    tf.RaggedTensor: The base-calling error probability for each base in each
+                     sequence provided.
+  """
+  # Split every string into its individual bytes, then convert only the flat
+  # (innermost) values so the per-sequence ragged structure is kept as-is.
+  return tf.ragged.map_flat_values(_phred_sequence_to_probability,
+                                   tf.strings.bytes_split(phred_qualities))
diff --git a/tests/test_genome.py b/tests/test_genome.py
@@ -102,5 +102,35 @@ def test_genome_sequences_to_onehot():
 
   assert np.all(out.to_list() == expected)
 
+
+def test_genome_phred_sequences_to_probability():
+  """Test conversion of phred qualities to probabilities"""
+  example_quality_list = [b'BB<', b'ABFF']
+  # One expected value per character, in order across both strings, following
+  # P = 10 ^ (-(X - 33) / 10); e.g. 'B' is ASCII 66, which gives 10 ^ -3.3.
+  expected_probabilities = [0.0005011872854083776, 0.0005011872854083776,
+                            0.0019952619913965464, 0.0006309572490863502,
+                            0.0005011872854083776, 0.00019952621369156986,
+                            0.00019952621369156986]
+
+  with tf.compat.v1.Session() as sess:
+    example_quality = tf.constant(example_quality_list)
+    converted_phred = tfio.genome.phred_sequences_to_probability(
+        example_quality)
+    out = sess.run(converted_phred)
+
+  # Compare flat values
+  assert np.allclose(out.flat_values.flatten(), expected_probabilities)
+  # Ensure nested array lengths are correct
+  assert np.all(
+      [len(a) == len(b) for a, b in zip(out.to_list(), example_quality_list)])
+
+def test_genome_phred_sequences_to_probability_with_other_genome_ops():
+  """Test quality op in graph with read_fastq op, ensure no errors"""
+  # NOTE(review): `fastq_path` is not defined in the visible hunk; presumably
+  # a module-level path elsewhere in this test file -- confirm.
+  with tf.compat.v1.Session() as sess:
+    raw_data = tfio.genome.read_fastq(filename=fastq_path)
+    data = tfio.genome.phred_sequences_to_probability(
+        phred_qualities=raw_data.raw_quality)
+    # Only checks that the graph runs; the output values are not asserted.
+    sess.run(data)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tests/test_genome_eager.py b/tests/test_genome_eager.py
@@ -0,0 +1,119 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""Tests for Genome."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import numpy as np
+
+import tensorflow as tf
+import tensorflow_io as tfio # pylint: disable=wrong-import-position
+
+# Path to the FASTQ fixture: <directory of this file>/test_genome/test.fastq.
+fastq_path = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)),
+    "test_genome", "test.fastq")
+
+def test_genome_fastq_reader():
+  """Test that read_fastq returns the expected sequences and raw qualities"""
+
+  data = tfio.genome.read_fastq(filename=fastq_path)
+
+  # Expected reads from the fixture, one byte string per record.
+  data_expected = [
+      b'GATTACA',
+      b'CGTTAGCGCAGGGGGCATCTTCACACTGGTGACAGGTAACCGCCGTAGTAAAGGTTCCGCCTTTCACT',
+      b'CGGCTGGTCAGGCTGACATCGCCGCCGGCCTGCAGCGAGCCGCTGC',
+      b'CGG']
+
+  # Expected raw quality strings; each has the same length as its sequence.
+  quality_expected = [
+      b'BB>B@FA',
+      b'AAAAABF@BBBDGGGG?FFGFGHBFBFBFABBBHGGGFHHCEFGGGGG?FGFFHEDG3EFGGGHEGHG',
+      b'FAFAF;F/9;.:/;999B/9A.DFFF;-->.AAB/FC;9-@-=;=.',
+      b'FAD']
+
+  assert np.all(data.sequences == data_expected)
+  assert np.all(data.raw_quality == quality_expected)
+
+
+def test_genome_sequences_to_onehot():
+  """test sequence one hot encoder"""
+  # One inner list per sequence, one 4-element one-hot vector per nucleotide.
+  # NOTE(review): columns look like A, C, G, T order (the first row would then
+  # read GATTACA) -- confirm against the op.
+  expected = [
+      [[0, 0, 1, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 0, 1], [1, 0, 0, 0],
+       [0, 1, 0, 0], [1, 0, 0, 0]],
+      [[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 0, 1], [1, 0, 0, 0],
+       [0, 0, 1, 0], [0, 1, 0, 0],
+       [0, 0, 1, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0],
+       [0, 0, 1, 0], [0, 0, 1, 0],
+       [0, 0, 1, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 1, 0, 0],
+       [0, 0, 0, 1], [0, 0, 0, 1],
+       [0, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0],
+       [0, 0, 0, 1], [0, 0, 1, 0],
+       [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 1, 0], [1, 0, 0, 0], [0, 1, 0, 0],
+       [1, 0, 0, 0], [0, 0, 1, 0],
+       [0, 0, 1, 0], [0, 0, 0, 1], [1, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0],
+       [0, 1, 0, 0], [0, 0, 1, 0],
+       [0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [1, 0, 0, 0],
+       [0, 0, 1, 0], [0, 0, 0, 1],
+       [1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0],
+       [0, 0, 0, 1], [0, 0, 0, 1],
+       [0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 1, 0, 0],
+       [0, 0, 0, 1], [0, 0, 0, 1],
+       [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1]],
+      [[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1],
+       [0, 0, 1, 0], [0, 0, 1, 0],
+       [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0],
+       [0, 1, 0, 0], [0, 0, 0, 1],
+       [0, 0, 1, 0], [1, 0, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1],
+       [0, 1, 0, 0], [0, 0, 1, 0],
+       [0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 1, 0, 0],
+       [0, 0, 1, 0], [0, 0, 1, 0],
+       [0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0],
+       [1, 0, 0, 0], [0, 0, 1, 0],
+       [0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0],
+       [0, 1, 0, 0], [0, 0, 1, 0],
+       [0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0]],
+      [[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0]]]
+
+  raw_data = tfio.genome.read_fastq(filename=fastq_path)
+  data = tfio.genome.sequences_to_onehot(
+      sequences=raw_data.sequences)
+
+  assert np.all(data.to_list() == expected)
+
+
+def test_genome_phred_sequences_to_probability():
+  """Test conversion of phred qualities to probabilities"""
+  example_quality_list = [b'BB<', b'ABFF']
+  # One expected value per character, in order across both strings, following
+  # P = 10 ^ (-(X - 33) / 10); e.g. 'B' is ASCII 66, which gives 10 ^ -3.3.
+  expected_probabilities = [0.0005011872854083776, 0.0005011872854083776,
+                            0.0019952619913965464, 0.0006309572490863502,
+                            0.0005011872854083776, 0.00019952621369156986,
+                            0.00019952621369156986]
+
+  example_quality = tf.constant(example_quality_list)
+  converted_phred = tfio.genome.phred_sequences_to_probability(
+      example_quality)
+
+  # Compare flat values
+  assert np.allclose(
+      converted_phred.flat_values.numpy().flatten(), expected_probabilities)
+  # Ensure nested array lengths are correct
+  assert np.all(
+      [len(a) == len(b)
+       for a, b in zip(converted_phred.to_list(), example_quality_list)])
+
+if __name__ == "__main__":
+  # This module imports `tensorflow as tf` but never binds a bare `test` name,
+  # so the test runner is reached through `tf.test`.
+  tf.test.main()

Original file line number	Diff line number	Diff line change
`@@ -16,3 +16,4 @@`
`16`	`16`
`17`	`17`	`from tensorflow_io.core.python.ops.genome_ops import read_fastq # pylint: disable=unused-import`
`18`	`18`	`from tensorflow_io.core.python.ops.genome_ops import sequences_to_onehot # pylint: disable=unused-import`
	`19`	`+from tensorflow_io.core.python.ops.genome_ops import phred_sequences_to_probability # pylint: disable=unused-import`