import os
import json
import re
from functools import partial

import requests
import numpy as np
import ftfy
import tensorflow as tf

from data.encoders import fetch_encoder, encode

lambada_src_uri = 'https://storage.googleapis.com/gpt-2/data/lambada_test.jsonl'
normalization = 'NFKC'

# Note: this task is called "lambada" but it really refers to OpenAI's version
# of the task, which actually differs in some ways from the task described in
# the original paper. So, strictly speaking, accuracy values from this task
# should not be compared to accuracy values from the original lambada task.
# For more information, see
# https://github.com/openai/gpt-2/issues/131

def lambada_create_tokens_data(params, path):
    # Download OpenAI's LAMBADA test set, normalize the text with ftfy,
    # tokenize it with the configured encoder, and cache the token arrays at `path`.
    with open(path, 'w') as f:
        req = requests.get(lambada_src_uri)
        req.raise_for_status()
        jsons = [json.loads(l) for l in req.iter_lines()]
        texts = [ftfy.fix_text(j['text'], normalization=normalization) for j in jsons]
        enc = fetch_encoder(params)
        arrays = [encode(enc, t) for t in texts]
        json.dump(arrays, f)
        return arrays

def lambada_read_or_create_tokens_data(params, path):
    # if you tell me where the file should go, i will helpfully create it for you
    if not os.path.exists(path):
        return lambada_create_tokens_data(params, path)
    with open(path) as f:
        return json.load(f)

def bin_pack(params, tokens_data):
    # Greedily pack token sequences into rows ("bins") of length n_ctx.
    # Each sequence is followed by an eos token; a new bin is started whenever
    # the next sequence would not fit. Unused positions are filled with
    # dummy_token, and empty bins are appended until the number of rows is a
    # multiple of eval_batch_size.
    eos_token = params['eos_id']
    n_ctx = params['n_ctx']
    dummy_token = 1
    pad_batch_size = params['eval_batch_size']
    bins = []
    for a in tokens_data:
        if len(bins) == 0 or len(bins[-1]) + len(a) + 1 > n_ctx:
            bins.append([])
        bins[-1] += a
        bins[-1].append(eos_token)
    while len(bins) % pad_batch_size != 0:
        bins.append([])
    bins_array = np.full((len(bins), n_ctx), dummy_token, dtype=np.uint16)
    for i, b in enumerate(bins):
        bins_array[i, 0:len(b)] = b
    return bins_array

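# A toy illustration of the packing above (the numbers are made up, assuming a
# hypothetical params dict with n_ctx=8, eos_id=0 and eval_batch_size=2):
#   bin_pack(params, [[5, 6], [7], [8, 9, 10]])
# fills the first bin with [5, 6, 0, 7, 0] (each sequence followed by eos),
# starts a second bin with [8, 9, 10, 0] because the third sequence no longer
# fits, and returns a (2, 8) uint16 array right-padded with dummy_token=1:
#   [[5, 6,  0, 7, 0, 1, 1, 1],
#    [8, 9, 10, 0, 1, 1, 1, 1]]
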
def lambada_init(params):
    # Resolve the tokens path from the first configured dataset (defaulting to
    # ./lambada.json), build the packed bins once, and record the number of
    # evaluation steps in params.
    ds_configs = params['dataset_configs']
    l = [
        ds_configs[ds_id].get('lambada_tokens_path', "./lambada.json")
        for ds_id, _, _, _ in params['datasets']
    ]
    assert len(l) > 0, 'lambada_tokens_path not found in the dataset config'
    lt_path = l[0]
    assert lt_path.endswith('.json'), 'lambada_tokens_path must have extension json'
    tokens_data = lambada_read_or_create_tokens_data(params, lt_path)
    bins_array = bin_pack(params, tokens_data)
    params['lambada_tokens_path'] = lt_path
    params['lambada_n_steps'] = len(bins_array) // params['eval_batch_size']

def lambada_get_task_info(params):
    return {
        'n_steps': params['lambada_n_steps'],
    }

def wikitext_detokenizer(string):
    # contractions
    string = string.replace("s '", "s'")
    string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
    # number separators
    string = string.replace(" @-@ ", "-")
    string = string.replace(" @,@ ", ",")
    string = string.replace(" @.@ ", ".")
    # punctuation
    string = string.replace(" : ", ": ")
    string = string.replace(" ; ", "; ")
    string = string.replace(" . ", ". ")
    string = string.replace(" ! ", "! ")
    string = string.replace(" ? ", "? ")
    string = string.replace(" , ", ", ")
    # double brackets
    string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
    string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
    string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
    string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
    string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
    # miscellaneous
    string = string.replace("= = = =", "====")
    string = string.replace("= = =", "===")
    string = string.replace("= =", "==")
    string = string.replace(" " + chr(176) + " ", chr(176))
    string = string.replace(" \n", "\n")
    string = string.replace("\n ", "\n")
    string = string.replace(" N ", " 1 ")
    string = string.replace(" 's", "'s")
    return string

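# A quick illustration of the detokenizer on a made-up snippet (not taken from
# the real corpus):
#   wikitext_detokenizer("a 1 @,@ 000 @-@ year @-@ old site . it has a = = History = = section")
# returns "a 1,000-year-old site. it has a == History == section": the spaced
# punctuation and the @-escaped number separators used by the raw WikiText
# tokenization are collapsed back into ordinary text.
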
# The LAMBADA evaluation code looks at the logits of each position just before an eos_token
def lambada_input(params):
    eos_token = 50256 if params['n_vocab'] >= 50257 else 0
    n_ctx = params['n_ctx']
    lt_path = params['lambada_tokens_path']
    tokens_data = lambada_read_or_create_tokens_data(params, lt_path)
    bins_array = bin_pack(params, tokens_data)
    dataset = tf.data.Dataset.from_tensor_slices(bins_array)

    def _get_output(bin):
        # Build the label row: at every position the label is eos_token, except
        # where the token two places ahead is eos; there the label is the next
        # token, i.e. the final token of that packed example.
        bin = tf.cast(bin, dtype=tf.int32)
        indexes = tf.range(n_ctx)
        results = tf.gather(bin, (indexes + 1) % n_ctx)
        eos_next_positions = tf.math.equal(tf.gather(bin, (indexes + 2) % n_ctx), eos_token)
        output = tf.where(eos_next_positions, results, tf.constant(eos_token, shape=[n_ctx]))
        bin = tf.reshape(bin, [n_ctx])
        bin = tf.cast(bin, dtype=tf.int32)
        output = tf.reshape(output, [n_ctx])
        output = tf.cast(output, dtype=tf.int32)
        return bin, output

    dataset = dataset.map(_get_output)
    dataset = dataset.batch(params['eval_batch_size'], drop_remainder=True)
    dataset = dataset.repeat()
    return dataset

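# Toy example of the (features, labels) pair produced by _get_output (token ids
# are made up; assume eos_token == 0 and n_ctx == 8 for this sketch):
#   bin    = [11, 12, 13, 0, 21, 22, 0, 1]
#   labels = [ 0, 13,  0, 0, 22,  0, 0, 0]
# Only the position just before the last token of each packed example carries a
# real target; every other position is labelled with eos_token, which the
# evaluation metric is expected to ignore.
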
def wikitext_create_tokens_data(params, path, version):
    # Download and unzip the requested raw WikiText release, detokenize the test
    # split, encode it, slice it into chunks of n_ctx - 1 tokens, and cache the
    # result at `path`.
    assert version.lower() in ["wikitext2", "wikitext103"]
    wikitext2_src = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip"
    wikitext103_src = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip"
    version_src = wikitext103_src if version.lower() == "wikitext103" else wikitext2_src
    with open(path, 'w') as f:
        wikitext_path = f"./{version}-raw-v1.zip"
        os.system(f"wget {version_src} -O {wikitext_path}")
        os.makedirs(f"{version}", exist_ok=True)
        os.system(f"unzip {wikitext_path} -d {version}")
        n = 103 if version.lower() == "wikitext103" else 2
        with open(f"./{version}/wikitext-{n}-raw/wiki.test.raw", 'r') as wt:
            text = ftfy.fix_text(wikitext_detokenizer(wt.read()))
        enc = fetch_encoder(params)
        encoded_text = encode(enc, text)
        arrays = []
        for i in range(0, len(encoded_text), params["n_ctx"] - 1):
            arrays.append(encoded_text[i:i + params["n_ctx"] - 1])
        json.dump(arrays, f)
        return arrays

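# The slicing above simply chops the encoded test split into consecutive
# (n_ctx - 1)-token chunks, the last of which may be shorter; bin_pack later
# appends an eos token to each chunk and pads it out to n_ctx. For instance,
# with n_ctx = 5, a 9-token text [t0, ..., t8] becomes
# [[t0, t1, t2, t3], [t4, t5, t6, t7], [t8]].
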
def wikitext_read_or_create_tokens_data(params, path, version):
    # if you tell me where the file should go, i will helpfully create it for you
    if not os.path.exists(path):
        return wikitext_create_tokens_data(params, path, version)
    with open(path) as f:
        return json.load(f)

def wikitext_init(params, version):
    wikitext_path = version + ".json"
    tokens_data = wikitext_read_or_create_tokens_data(params, wikitext_path, version)
    bins_array = bin_pack(params, tokens_data)
    params['wikitext_path'] = wikitext_path
    params['wikitext_n_steps'] = len(bins_array) // params['eval_batch_size']

def wikitext_get_task_info(params, version):
    return {
        'n_steps': params['wikitext_n_steps'],
    }

def wikitext_input(params, version):
    # Mirrors lambada_input: each packed row is paired with a label row that is
    # eos_token everywhere except just before each example's final token.
    eos_token = 50256 if params['n_vocab'] >= 50257 else 0
    n_ctx = params['n_ctx']
    wt_path = params['wikitext_path']
    tokens_data = wikitext_read_or_create_tokens_data(params, wt_path, version)
    bins_array = bin_pack(params, tokens_data)
    dataset = tf.data.Dataset.from_tensor_slices(bins_array)

    def _get_output(bin):
        bin = tf.cast(bin, dtype=tf.int32)
        indexes = tf.range(n_ctx)
        results = tf.gather(bin, (indexes + 1) % n_ctx)
        eos_next_positions = tf.math.equal(tf.gather(bin, (indexes + 2) % n_ctx), eos_token)
        output = tf.where(eos_next_positions, results, tf.constant(eos_token, shape=[n_ctx]))
        bin = tf.reshape(bin, [n_ctx])
        bin = tf.cast(bin, dtype=tf.int32)
        output = tf.reshape(output, [n_ctx])
        output = tf.cast(output, dtype=tf.int32)
        return bin, output

    dataset = dataset.map(_get_output)
    dataset = dataset.batch(params['eval_batch_size'], drop_remainder=True)
    dataset = dataset.repeat()
    return dataset

task_descriptors = {
    'lambada': {
        'init_fn': lambada_init,
        'get_task_info_fn': lambada_get_task_info,
        'input_fn': lambada_input,
    },
    'wikitext2': {
        'init_fn': partial(wikitext_init, version='wikitext2'),
        'get_task_info_fn': partial(wikitext_get_task_info, version='wikitext2'),
        'input_fn': partial(wikitext_input, version='wikitext2'),
    },
    'wikitext103': {
        'init_fn': partial(wikitext_init, version='wikitext103'),
        'get_task_info_fn': partial(wikitext_get_task_info, version='wikitext103'),
        'input_fn': partial(wikitext_input, version='wikitext103'),
    },
}
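
# Sketch of how a caller might drive one of these descriptors; the evaluation
# loop and the exact way it passes params are assumptions for illustration,
# not something defined in this file:
#
#   desc = task_descriptors['lambada']
#   desc['init_fn'](params)                                   # download/encode data, set lambada_n_steps
#   n_steps = desc['get_task_info_fn'](params)['n_steps']
#   estimator.evaluate(input_fn=desc['input_fn'], steps=n_steps, name='lambada')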