Skip to content

Correct pylint for summarization datasets #1651

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions tensorflow_datasets/summarization/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@
# limitations under the License.

# Lint as: python3

"""Summarization datasets."""

# pylint: disable=c0103

from tensorflow_datasets.summarization.aeslc import Aeslc
from tensorflow_datasets.summarization.big_patent import BigPatent
from tensorflow_datasets.summarization.billsum import Billsum
Expand Down
21 changes: 6 additions & 15 deletions tensorflow_datasets/summarization/cnn_dailymail.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,9 @@
# limitations under the License.

# Lint as: python3

"""CNN/DailyMail Summarization dataset, non-anonymized version."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import hashlib
import os
from absl import logging
Expand Down Expand Up @@ -78,14 +77,6 @@

_HIGHLIGHTS = 'highlights'
_ARTICLE = 'article'
_SUPPORTED_VERSIONS = [
# Same data as 0.0.2
tfds.core.Version('1.0.0',
'New split API (https://tensorflow.org/datasets/splits)'),
# Having the model predict newline separators makes it easier to evaluate
# using summary-level ROUGE.
tfds.core.Version('2.0.0', 'Separate target sentences with newline.')
]

# Using cased version.
_DEFAULT_VERSION = tfds.core.Version('3.0.0', 'Using cased version.')
Expand Down Expand Up @@ -212,12 +203,11 @@ def fix_missing_period(line):
for line in lines:
if not line:
continue # empty line
elif line.startswith('@highlight'):
if line.startswith('@highlight'):
next_is_highlight = True
elif next_is_highlight:
if next_is_highlight:
highlights.append(line)
else:
article_lines.append(line)
article_lines.append(line)

# Make article into a single string
article = ' '.join(article_lines)
Expand Down Expand Up @@ -277,6 +267,7 @@ def _vocab_text_gen(self, paths):
yield ' '.join([ex[_ARTICLE], ex[_HIGHLIGHTS]])

def _split_generators(self, dl_manager):
"""Generate Splits."""
dl_paths = dl_manager.download_and_extract(_DL_URLS)
train_files = _subset_filenames(dl_paths, tfds.Split.TRAIN)
# Generate shared vocabulary
Expand Down
11 changes: 6 additions & 5 deletions tensorflow_datasets/summarization/cnn_dailymail_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,9 @@
# limitations under the License.

# Lint as: python3

"""Tests for tensorflow_datasets.summarization.cnn_dailymail."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


import tempfile

Expand All @@ -43,6 +42,7 @@


class CnnDailymailTest(testing.DatasetBuilderTestCase):
"""Tests for the CnnDailymail dataset builder."""
DATASET_CLASS = cnn_dailymail.CnnDailymail
SPLITS = {'train': 3, 'validation': 2, 'test': 2}
DL_EXTRACT_RESULT = {
Expand All @@ -54,10 +54,11 @@ class CnnDailymailTest(testing.DatasetBuilderTestCase):
}

def test_get_art_abs(self):
"""Checks the article and abstract returned by _get_art_abs for versions 1.0.0 and 2.0.0."""
with tempfile.NamedTemporaryFile(delete=True) as f:
f.write(_STORY_FILE)
f.flush()
article, abstract = cnn_dailymail._get_art_abs(f.name,
article, abstract = cnn_dailymail._get_art_abs(f.name, # pylint: disable=protected-access
tfds.core.Version('1.0.0'))
self.assertEqual('Some article. This is some article text.', article)
# This is a bit weird, but the original code at
Expand All @@ -66,7 +67,7 @@ def test_get_art_abs(self):
self.assertEqual('highlight text . Highlight two . highlight Three .',
abstract)

article, abstract = cnn_dailymail._get_art_abs(f.name,
article, abstract = cnn_dailymail._get_art_abs(f.name, # pylint: disable=protected-access
tfds.core.Version('2.0.0'))
self.assertEqual('highlight text .\nHighlight two .\nhighlight Three .',
abstract)
Expand Down
9 changes: 3 additions & 6 deletions tensorflow_datasets/summarization/wikihow.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,9 @@
# limitations under the License.

# Lint as: python3

"""WikiHow Datasets."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import csv
import os
Expand Down Expand Up @@ -170,7 +168,7 @@ def _generate_examples(self, path=None, title_set=None):
"""Yields examples."""
with tf.io.gfile.GFile(path) as f:
reader = csv.reader(f)
headers = next(reader)
headers = next(reader) # pylint: disable=stop-iteration-return
if self.builder_config.name == "all" and headers != [
"headline", "title", "text"
]:
Expand Down Expand Up @@ -210,5 +208,4 @@ def _filter_and_clean(abstract, article):
# remove extra commas in articles
article = re.sub(r"[.]+[\n]+[,]", ".\n", article)
return abstract, article
else:
return "", ""
return "", ""