16
16
"""Multi-News dataset."""
17
17
18
18
import os
19
-
20
- from tensorflow_datasets .core .utils .lazy_imports_utils import tensorflow as tf
19
+ from etils import epath
21
20
import tensorflow_datasets .public_api as tfds
22
21
23
22
_CITATION = """
42
41
- summary: news summary.
43
42
"""
44
43
45
- _URL = "https://drive.google.com/uc?export=download&id=1vRY2wM6rlOZrf9exGTm5pXj5ExlVwJ0C"
44
+ _URL_PATH = "https://huggingface.co/datasets/multi_news/resolve/main/data"
45
+
46
46
47
47
_DOCUMENT = "document"
48
48
_SUMMARY = "summary"
51
51
class MultiNews(tfds.core.GeneratorBasedBuilder):
  """Multi-News dataset."""

  # 2.0.0: data source moved from Google Drive to Hugging Face and the
  # "- " prefix is no longer stripped from target summaries.
  VERSION = tfds.core.Version("2.0.0")
55
55
56
56
def _info (self ):
57
57
return tfds .core .DatasetInfo (
@@ -67,35 +67,35 @@ def _info(self):
67
67
68
68
def _split_generators(self, dl_manager):
  """Returns a dict mapping split names to example generators.

  Downloads the six Multi-News data files (source documents and target
  summaries for train/val/test) and wires each (src, tgt) pair into
  `_generate_examples`.
  """
  # Remote URL for every (split, side) pair; "src" files hold the cleaned
  # source documents, "tgt" files the reference summaries.
  urls = {
      f"{prefix}_{side}": _URL_PATH + name
      for prefix in ("train", "val", "test")
      for side, name in (
          ("src", f"{prefix}.src.cleaned"),
          ("tgt", f"{prefix}.tgt"),
      )
  }
  files = dl_manager.download_and_extract(urls)

  splits = {}
  # TFDS split name -> file-name prefix ("validation" files are "val.*").
  for split_name, prefix in (
      ("train", "train"),
      ("validation", "val"),
      ("test", "test"),
  ):
    splits[split_name] = self._generate_examples(
        files[f"{prefix}_src"], files[f"{prefix}_tgt"]
    )
  return splits
87
88
88
def _generate_examples(self, src_file, tgt_file):
  """Yields (key, example) pairs from parallel source/target files.

  Args:
    src_file: Path to the source-documents file, one example per line.
    tgt_file: Path to the summaries file, aligned line-by-line with
      `src_file`.

  Yields:
    Tuples `(line_index, {_DOCUMENT: ..., _SUMMARY: ...})`.
  """
  with epath.Path(src_file).open() as src_f, epath.Path(
      tgt_file
  ).open() as tgt_f:
    for i, (src_line, tgt_line) in enumerate(zip(src_f, tgt_f)):
      yield i, {
          # In the original file, each line has one example and natural
          # newline tokens "\n" were replaced with "NEWLINE_CHAR". Restore
          # the natural newline token to avoid special vocab "NEWLINE_CHAR".
          _DOCUMENT: src_line.strip().replace("NEWLINE_CHAR", "\n"),
          # strip() already removed leading whitespace, so the previous
          # no-arg lstrip() chained after it was a no-op and is dropped.
          _SUMMARY: tgt_line.strip(),
      }
0 commit comments