Skip to content

Commit 3c64cf9

Browse files
suyashkumaryongtang
authored andcommitted
[Genomics] Add op to calculate Phred Quality Scores (#620)
* Add ops to convert phred quality scores * Ensure dim of quality is set * minor cleanup * update API, lint cleanup * rm newline * lint * cast to tf.int64 * Add eager mode tests for genome * fix lint
1 parent fb4640b commit 3c64cf9

File tree

5 files changed

+203
-1
lines changed

5 files changed

+203
-1
lines changed

tensorflow_io/core/ops/genome_ops.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ REGISTER_OP("IO>ReadFastq")
2525
.Output("raw_quality: string")
2626
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
2727
c->set_output(0, c->MakeShape({c->UnknownDim()}));
28+
c->set_output(1, c->MakeShape({c->UnknownDim()}));
2829
return Status::OK();
2930
});
3031

tensorflow_io/core/python/api/v0/genome.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,4 @@
1616

1717
from tensorflow_io.core.python.ops.genome_ops import read_fastq # pylint: disable=unused-import
1818
from tensorflow_io.core.python.ops.genome_ops import sequences_to_onehot # pylint: disable=unused-import
19+
from tensorflow_io.core.python.ops.genome_ops import phred_sequences_to_probability # pylint: disable=unused-import

tensorflow_io/core/python/ops/genome_ops.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,5 +84,56 @@ def sequences_to_onehot(sequences):
8484
sequence_splits.size(), global_nucleotide_idx)
8585
return tf.RaggedTensor.from_row_splits(
8686
values=all_onehot_nucleotides.stack(),
87-
row_splits=sequence_splits.stack()
87+
row_splits=tf.cast(sequence_splits.stack(), tf.int64)
8888
)
89+
90+
91+
@tf.function
92+
def _decode_byte_str(b_str):
93+
return tf.dtypes.cast(
94+
tf.strings.unicode_decode(b_str, "ASCII"), dtype=tf.float32)
95+
96+
97+
@tf.function
98+
def _phred_byte_to_probability(phred_byte_str):
99+
return tf.math.pow(
100+
10.,
101+
-(_decode_byte_str(phred_byte_str) - 33) / 10
102+
)
103+
104+
105+
@tf.function
106+
def _phred_sequence_to_probability(seq_quality):
107+
return tf.map_fn(_phred_byte_to_probability,
108+
seq_quality,
109+
dtype=tf.float32)
110+
111+
112+
@tf.function
113+
def phred_sequences_to_probability(phred_qualities):
114+
"""Converts raw phred quality scores into base-calling error probabilities.
115+
116+
For each ASCII encoded phred quality score (X), the probability that there
117+
was an error calling that base is computed by:
118+
119+
P = 10 ^ (-(X - 33) / 10)
120+
121+
This is assuming an "ASCII base" of 33.
122+
123+
The input is a tf.string tensor of ASCII encoded phred qualities,
124+
one string per DNA sequence, with each character representing the quality
125+
of a nucelotide.
126+
127+
For example:
128+
phred_qualities = [["BB<"], ["BBBB"]]
129+
130+
Args:
131+
phred_qualities: A tf.string tensor where each string represents the phred
132+
quality of a DNA sequence. Each character in the string
133+
is the ASCII representation of the phred quality number.
134+
135+
Returns:
136+
tf.RaggedTensor: The quality scores for each base in each sequence provided.
137+
"""
138+
return tf.ragged.map_flat_values(_phred_sequence_to_probability,
139+
tf.strings.bytes_split(phred_qualities))

tests/test_genome.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,5 +102,35 @@ def test_genome_sequences_to_onehot():
102102

103103
assert np.all(out.to_list() == expected)
104104

105+
106+
def test_genome_phred_sequences_to_probability():
107+
"""Test conversion of phred qualities to probabilities"""
108+
example_quality_list = [b'BB<', b'ABFF']
109+
expected_probabilities = [0.0005011872854083776, 0.0005011872854083776,
110+
0.0019952619913965464, 0.0006309572490863502,
111+
0.0005011872854083776, 0.00019952621369156986,
112+
0.00019952621369156986]
113+
114+
with tf.compat.v1.Session() as sess:
115+
example_quality = tf.constant(example_quality_list)
116+
converted_phred = tfio.genome.phred_sequences_to_probability(
117+
example_quality)
118+
out = sess.run(converted_phred)
119+
120+
# Compare flat values
121+
assert np.allclose(out.flat_values.flatten(), expected_probabilities)
122+
# Ensure nested array lengths are correct
123+
assert np.all(
124+
[len(a) == len(b) for a, b in zip(out.to_list(), example_quality_list)])
125+
126+
def test_genome_phred_sequences_to_probability_with_other_genome_ops():
127+
"""Test quality op in graph with read_fastq op, ensure no errors"""
128+
with tf.compat.v1.Session() as sess:
129+
raw_data = tfio.genome.read_fastq(filename=fastq_path)
130+
data = tfio.genome.phred_sequences_to_probability(
131+
phred_qualities=raw_data.raw_quality)
132+
sess.run(data)
133+
134+
105135
if __name__ == "__main__":
106136
test.main()

tests/test_genome_eager.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
4+
# use this file except in compliance with the License. You may obtain a copy of
5+
# the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12+
# License for the specific language governing permissions and limitations under
13+
# the License.
14+
# ==============================================================================
15+
"""Tests for Genome."""
16+
17+
from __future__ import absolute_import
18+
from __future__ import division
19+
from __future__ import print_function
20+
21+
import os
22+
import numpy as np
23+
24+
import tensorflow as tf
25+
import tensorflow_io as tfio # pylint: disable=wrong-import-position
26+
27+
fastq_path = os.path.join(
28+
os.path.dirname(os.path.abspath(__file__)),
29+
"test_genome", "test.fastq")
30+
31+
def test_genome_fastq_reader():
32+
"""test_genome_fastq_reader"""
33+
34+
data = tfio.genome.read_fastq(filename=fastq_path)
35+
36+
data_expected = [
37+
b'GATTACA',
38+
b'CGTTAGCGCAGGGGGCATCTTCACACTGGTGACAGGTAACCGCCGTAGTAAAGGTTCCGCCTTTCACT',
39+
b'CGGCTGGTCAGGCTGACATCGCCGCCGGCCTGCAGCGAGCCGCTGC',
40+
b'CGG']
41+
42+
quality_expected = [
43+
b'BB>B@FA',
44+
b'AAAAABF@BBBDGGGG?FFGFGHBFBFBFABBBHGGGFHHCEFGGGGG?FGFFHEDG3EFGGGHEGHG',
45+
b'FAFAF;F/9;.:/;999B/9A.DFFF;-->.AAB/FC;9-@-=;=.',
46+
b'FAD']
47+
48+
assert np.all(data.sequences == data_expected)
49+
assert np.all(data.raw_quality == quality_expected)
50+
51+
52+
def test_genome_sequences_to_onehot():
53+
"""test sequence one hot encoder"""
54+
expected = [
55+
[[0, 0, 1, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 0, 1], [1, 0, 0, 0],
56+
[0, 1, 0, 0], [1, 0, 0, 0]],
57+
[[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 0, 1], [1, 0, 0, 0],
58+
[0, 0, 1, 0], [0, 1, 0, 0],
59+
[0, 0, 1, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0],
60+
[0, 0, 1, 0], [0, 0, 1, 0],
61+
[0, 0, 1, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 1, 0, 0],
62+
[0, 0, 0, 1], [0, 0, 0, 1],
63+
[0, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0],
64+
[0, 0, 0, 1], [0, 0, 1, 0],
65+
[0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 1, 0], [1, 0, 0, 0], [0, 1, 0, 0],
66+
[1, 0, 0, 0], [0, 0, 1, 0],
67+
[0, 0, 1, 0], [0, 0, 0, 1], [1, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0],
68+
[0, 1, 0, 0], [0, 0, 1, 0],
69+
[0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [1, 0, 0, 0],
70+
[0, 0, 1, 0], [0, 0, 0, 1],
71+
[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0],
72+
[0, 0, 0, 1], [0, 0, 0, 1],
73+
[0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 1, 0, 0],
74+
[0, 0, 0, 1], [0, 0, 0, 1],
75+
[0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1]],
76+
[[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1],
77+
[0, 0, 1, 0], [0, 0, 1, 0],
78+
[0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0],
79+
[0, 1, 0, 0], [0, 0, 0, 1],
80+
[0, 0, 1, 0], [1, 0, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1],
81+
[0, 1, 0, 0], [0, 0, 1, 0],
82+
[0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 1, 0, 0],
83+
[0, 0, 1, 0], [0, 0, 1, 0],
84+
[0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0],
85+
[1, 0, 0, 0], [0, 0, 1, 0],
86+
[0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0],
87+
[0, 1, 0, 0], [0, 0, 1, 0],
88+
[0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0]],
89+
[[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0]]]
90+
91+
raw_data = tfio.genome.read_fastq(filename=fastq_path)
92+
data = tfio.genome.sequences_to_onehot(
93+
sequences=raw_data.sequences)
94+
95+
assert np.all(data.to_list() == expected)
96+
97+
98+
def test_genome_phred_sequences_to_probability():
99+
"""Test conversion of phred qualities to probabilities"""
100+
example_quality_list = [b'BB<', b'ABFF']
101+
expected_probabilities = [0.0005011872854083776, 0.0005011872854083776,
102+
0.0019952619913965464, 0.0006309572490863502,
103+
0.0005011872854083776, 0.00019952621369156986,
104+
0.00019952621369156986]
105+
106+
example_quality = tf.constant(example_quality_list)
107+
converted_phred = tfio.genome.phred_sequences_to_probability(
108+
example_quality)
109+
110+
# Compare flat values
111+
assert np.allclose(
112+
converted_phred.flat_values.numpy().flatten(), expected_probabilities)
113+
# Ensure nested array lengths are correct
114+
assert np.all(
115+
[len(a) == len(b)
116+
for a, b in zip(converted_phred.to_list(), example_quality_list)])
117+
118+
if __name__ == "__main__":
119+
test.main()

0 commit comments

Comments
 (0)