|
| 1 | +""" |
| 2 | +T5-Base Model for Summarization, Sentiment Classification, and Translation |
| 3 | +========================================================================== |
| 4 | +
|
| 5 | +**Author**: `Pendo Abbo <pabbo@fb.com>`__, `Joe Cummings <jrcummings@fb.com>`__ |
| 6 | +
|
| 7 | +""" |
| 8 | + |
| 9 | +###################################################################### |
| 10 | +# Overview |
| 11 | +# -------- |
| 12 | +# |
| 13 | +# This tutorial demonstrates how to use a pre-trained T5 Model for summarization, sentiment classification, and |
| 14 | +# translation tasks. We will demonstrate how to use the torchtext library to: |
| 15 | +# |
| 16 | +# 1. Build a text pre-processing pipeline for a T5 model |
| 17 | +# 2. Instantiate a pre-trained T5 model with base configuration |
| 18 | +# 3. Read in the CNNDM, IMDB, and Multi30k datasets and pre-process their texts in preparation for the model |
| 19 | +# 4. Perform text summarization, sentiment classification, and translation |
| 20 | +# |
| 21 | +# |
| 22 | + |
| 23 | +####################################################################### |
| 24 | +# Data Transformation |
| 25 | +# ------------------- |
| 26 | +# |
| 27 | +# The T5 model does not work with raw text. Instead, it requires the text to be transformed into numerical form |
| 28 | +# in order to perform training and inference. The following transformations are required for the T5 model: |
| 29 | +# |
| 30 | +# 1. Tokenize text |
| 31 | +# 2. Convert tokens into (integer) IDs |
| 32 | +# 3. Truncate the sequences to a specified maximum length |
| 33 | +# 4. Add end-of-sequence (EOS) and padding token IDs |
| 34 | +# |
| 35 | +# T5 uses a SentencePiece model for text tokenization. Below, we use a pre-trained SentencePiece model to build |
| 36 | +# the text pre-processing pipeline using torchtext's T5Transform. Note that the transform supports both |
| 37 | +# batched and non-batched text input (for example, one can either pass a single sentence or a list of sentences), however |
| 38 | +# the T5 model expects the input to be batched. |
| 39 | +# |
| 40 | + |
from torchtext.models import T5Transform

# SentencePiece model shipped with the pre-trained T5-base checkpoint.
t5_sp_model_path = "https://download.pytorch.org/models/text/t5_tokenizer_base.model"

# Special-token IDs and the maximum sequence length used by the transform.
padding_idx = 0
eos_idx = 1
max_seq_len = 512

# Pre-processing pipeline: tokenize -> token IDs -> truncate -> EOS/padding.
transform = T5Transform(
    sp_model_path=t5_sp_model_path,
    max_seq_len=max_seq_len,
    padding_idx=padding_idx,
    eos_idx=eos_idx,
)
| 54 | + |
| 55 | +####################################################################### |
| 56 | +# Alternatively, we can also use the transform shipped with the pre-trained models that does all of the above out-of-the-box |
| 57 | +# |
| 58 | +# .. code-block:: |
| 59 | +# |
| 60 | +# from torchtext.models import T5_BASE_GENERATION |
| 61 | +# transform = T5_BASE_GENERATION.transform() |
| 62 | +# |
| 63 | + |
| 64 | + |
| 65 | +###################################################################### |
| 66 | +# Model Preparation |
| 67 | +# ----------------- |
| 68 | +# |
| 69 | +# torchtext provides SOTA pre-trained models that can be used directly for NLP tasks or fine-tuned on downstream tasks. Below |
| 70 | +# we use the pre-trained T5 model with standard base configuration to perform text summarization, sentiment classification, and |
| 71 | +# translation. For additional details on available pre-trained models, see `the torchtext documentation <https://pytorch.org/text/main/models.html>`__ |
| 72 | +# |
| 73 | +# |
from torchtext.models import T5_BASE_GENERATION

# Bundle pairing the pre-trained T5-base checkpoint with its matching
# text transform; we use it for all three downstream tasks below.
t5_base = T5_BASE_GENERATION
model = t5_base.get_model()
transform = t5_base.transform()

# Inference only — switch off dropout and other train-time behavior.
model.eval()
| 81 | + |
| 82 | + |
| 83 | +####################################################################### |
| 84 | +# GenerationUtils |
| 85 | +# ------------------ |
| 86 | +# |
| 87 | +# We can use torchtext's ``GenerationUtils`` to produce an output sequence based on the input sequence provided. This calls on the |
| 88 | +# model's encoder and decoder, and iteratively expands the decoded sequences until the end-of-sequence token is generated |
| 89 | +# for all sequences in the batch. The ``generate`` method shown below uses greedy search to generate the sequences. Beam search and |
| 90 | +# other decoding strategies are also supported. |
| 91 | +# |
| 92 | +# |
from torchtext.prototype.generate import GenerationUtils

# Drives the model's encoder/decoder to iteratively extend the decoded
# sequence until every example in the batch emits the EOS token.
sequence_generator = GenerationUtils(model)
| 96 | + |
| 97 | + |
| 98 | +####################################################################### |
| 99 | +# Datasets |
| 100 | +# -------- |
| 101 | +# torchtext provides several standard NLP datasets. For a complete list, refer to the documentation |
| 102 | +# at https://pytorch.org/text/stable/datasets.html. These datasets are built using composable torchdata |
| 103 | +# datapipes and hence support standard flow-control and mapping/transformation using user defined |
| 104 | +# functions and transforms. |
| 105 | +# |
| 106 | +# Below we demonstrate how to pre-process the CNNDM dataset to include the prefix necessary for the |
# model to identify the task it is performing. The CNNDM dataset has a train, validation, and test
| 108 | +# split. Below we demo on the test split. |
| 109 | +# |
| 110 | +# The T5 model uses the prefix "summarize" for text summarization. For more information on task |
| 111 | +# prefixes, please visit Appendix D of the `T5 Paper <https://arxiv.org/pdf/1910.10683.pdf>`__ |
| 112 | +# |
| 113 | +# .. note:: |
| 114 | +# Using datapipes is still currently subject to a few caveats. If you wish |
| 115 | +# to extend this example to include shuffling, multi-processing, or |
| 116 | +# distributed learning, please see :ref:`this note <datapipes_warnings>` |
| 117 | +# for further instructions. |
| 118 | + |
from functools import partial

from torch.utils.data import DataLoader
from torchtext.datasets import CNNDM

# CNNDM test split; T5 expects the "summarize" prefix for summarization.
task = "summarize"
cnndm_batch_size = 5
cnndm_datapipe = CNNDM(split="test")
| 127 | + |
| 128 | + |
def apply_prefix(task, x):
    """Return the (text, target) pair *x* with the T5 task prefix on its text.

    The prefix (e.g. ``"summarize: ..."``) tells the model which task to
    perform on the input.
    """
    prefixed_text = f"{task}: {x[0]}"
    return prefixed_text, x[1]
| 131 | + |
| 132 | + |
# Prefix every article, then group rows into columnar batches of size 5.
cnndm_datapipe = (
    cnndm_datapipe
    .map(partial(apply_prefix, task))
    .batch(cnndm_batch_size)
    .rows2columnar(["article", "abstract"])
)
# batch_size=None: the datapipe already yields fully formed batches.
cnndm_dataloader = DataLoader(cnndm_datapipe, shuffle=True, batch_size=None)
| 137 | + |
| 138 | +####################################################################### |
# Alternatively, we can also use the batched API, for example, to apply the prefix on the whole batch:
| 140 | +# |
| 141 | +# .. code-block:: |
| 142 | +# |
| 143 | +# def batch_prefix(task, x): |
| 144 | +# return { |
| 145 | +# "article": [f'{task}: ' + y for y in x["article"]], |
| 146 | +# "abstract": x["abstract"] |
| 147 | +# } |
| 148 | +# |
| 149 | +# cnndm_batch_size = 5 |
| 150 | +# cnndm_datapipe = CNNDM(split="test") |
| 151 | +# task = 'summarize' |
| 152 | +# |
| 153 | +# cnndm_datapipe = cnndm_datapipe.batch(cnndm_batch_size).rows2columnar(["article", "abstract"]) |
| 154 | +# cnndm_datapipe = cnndm_datapipe.map(partial(batch_prefix, task)) |
| 155 | +# cnndm_dataloader = DataLoader(cnndm_datapipe, batch_size=None) |
| 156 | +# |
| 157 | + |
| 158 | +####################################################################### |
| 159 | +# We can also load the IMDB dataset, which will be used to demonstrate sentiment classification using the T5 model. |
| 160 | +# This dataset has a train and test split. Below we demo on the test split. |
| 161 | +# |
| 162 | +# The T5 model was trained on the SST2 dataset (also available in torchtext) for sentiment classification using the |
| 163 | +# prefix "sst2 sentence". Therefore, we will use this prefix to perform sentiment classification on the IMDB dataset. |
| 164 | +# |
| 165 | + |
from torchtext.datasets import IMDB

# IMDB test split. T5 learned sentiment classification on SST2 with the
# "sst2 sentence" prefix, so we reuse that prefix here.
task = "sst2 sentence"
imdb_batch_size = 3
imdb_datapipe = IMDB(split="test")

# Map the dataset's raw labels ("1"/"2") to readable sentiment names.
labels = {"1": "negative", "2": "positive"}
| 172 | + |
| 173 | + |
def process_labels(labels, x):
    """Turn a (raw_label, text) pair into (text, label_name).

    ``labels`` maps the dataset's raw label — stringified — to a
    human-readable name (e.g. "negative"/"positive").
    """
    raw_label, text = x[0], x[1]
    return text, labels[str(raw_label)]
| 176 | + |
| 177 | + |
# Rename labels, add the task prefix, then batch rows into columns.
imdb_datapipe = (
    imdb_datapipe
    .map(partial(process_labels, labels))
    .map(partial(apply_prefix, task))
    .batch(imdb_batch_size)
    .rows2columnar(["text", "label"])
)
imdb_dataloader = DataLoader(imdb_datapipe, batch_size=None)
| 183 | + |
| 184 | +####################################################################### |
| 185 | +# Finally, we can also load the Multi30k dataset to demonstrate English to German translation using the T5 model. |
| 186 | +# This dataset has a train, validation, and test split. Below we demo on the test split. |
| 187 | +# |
| 188 | +# The T5 model uses the prefix "translate English to German" for this task. |
| 189 | + |
from torchtext.datasets import Multi30k

# Multi30k test split; T5 uses "translate English to German" as the
# task prefix for this language pair.
task = "translate English to German"
language_pair = ("en", "de")
multi_batch_size = 5
multi_datapipe = Multi30k(split="test", language_pair=language_pair)

# Prefix each English sentence, then batch rows into columnar form.
multi_datapipe = (
    multi_datapipe
    .map(partial(apply_prefix, task))
    .batch(multi_batch_size)
    .rows2columnar(["english", "german"])
)
multi_dataloader = DataLoader(multi_datapipe, batch_size=None)
| 201 | + |
| 202 | +####################################################################### |
| 203 | +# Generate Summaries |
| 204 | +# ------------------ |
| 205 | +# |
| 206 | +# We can put all of the components together to generate summaries on the first batch of articles in the CNNDM test set |
| 207 | +# using a beam size of 1. |
| 208 | +# |
| 209 | + |
# Pull the first batch of articles and their reference abstracts.
batch = next(iter(cnndm_dataloader))
input_text = batch["article"]
target = batch["abstract"]
beam_size = 1  # beam size of 1 == greedy search

# Numericalize, generate token IDs greedily, then decode back to text.
model_input = transform(input_text)
model_output = sequence_generator.generate(model_input, eos_idx=eos_idx, num_beams=beam_size)
output_text = transform.decode(model_output.tolist())

for idx, (prediction, abstract) in enumerate(zip(output_text, target), start=1):
    print(f"Example {idx}:\n")
    print(f"prediction: {prediction}\n")
    print(f"target: {abstract}\n\n")
| 223 | + |
| 224 | + |
| 225 | +####################################################################### |
# Summarization Output (Might vary since we shuffle the dataloader)
# -----------------------------------------------------------------
| 228 | +# |
| 229 | +# .. code-block:: |
| 230 | +# |
| 231 | +# Example 1: |
| 232 | +# |
| 233 | +# prediction: the 24-year-old has been tattooed for over a decade . he has landed in australia |
| 234 | +# to start work on a new campaign . he says he is 'taking it in your stride' to be honest . |
| 235 | +# |
| 236 | +# target: London-based model Stephen James Hendry famed for his full body tattoo . The supermodel |
| 237 | +# is in Sydney for a new modelling campaign . Australian fans understood to have already located |
| 238 | +# him at his hotel . The 24-year-old heartthrob is recently single . |
| 239 | +# |
| 240 | +# |
| 241 | +# Example 2: |
| 242 | +# |
| 243 | +# prediction: a stray pooch has used up at least three of her own after being hit by a |
| 244 | +# car and buried in a field . the dog managed to stagger to a nearby farm, dirt-covered |
| 245 | +# and emaciated, where she was found . she suffered a dislocated jaw, leg injuries and a |
| 246 | +# caved-in sinus cavity -- and still requires surgery to help her breathe . |
| 247 | +# |
| 248 | +# target: Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer |
| 249 | +# and buried in a field . "She's a true miracle dog and she deserves a good life," says |
| 250 | +# Sara Mellado, who is looking for a home for Theia . |
| 251 | +# |
| 252 | +# |
| 253 | +# Example 3: |
| 254 | +# |
| 255 | +# prediction: mohammad Javad Zarif arrived in Iran on a sunny friday morning . he has gone |
| 256 | +# a long way to bring Iran in from the cold and allow it to rejoin the international |
| 257 | +# community . but there are some facts about him that are less well-known . |
| 258 | +# |
| 259 | +# target: Mohammad Javad Zarif has spent more time with John Kerry than any other |
| 260 | +# foreign minister . He once participated in a takeover of the Iranian Consulate in San |
| 261 | +# Francisco . The Iranian foreign minister tweets in English . |
| 262 | +# |
| 263 | +# |
| 264 | +# Example 4: |
| 265 | +# |
| 266 | +# prediction: five americans were monitored for three weeks after being exposed to Ebola in |
| 267 | +# west africa . one of the five had a heart-related issue and has been discharged but hasn't |
| 268 | +# left the area . they are clinicians for Partners in Health, a Boston-based aid group . |
| 269 | +# |
| 270 | +# target: 17 Americans were exposed to the Ebola virus while in Sierra Leone in March . |
| 271 | +# Another person was diagnosed with the disease and taken to hospital in Maryland . |
| 272 | +# National Institutes of Health says the patient is in fair condition after weeks of |
| 273 | +# treatment . |
| 274 | +# |
| 275 | +# |
| 276 | +# Example 5: |
| 277 | +# |
| 278 | +# prediction: the student was identified during an investigation by campus police and |
| 279 | +# the office of student affairs . he admitted to placing the noose on the tree early |
| 280 | +# Wednesday morning . the incident is one of several recent racist events to affect |
| 281 | +# college students . |
| 282 | +# |
| 283 | +# target: Student is no longer on Duke University campus and will face disciplinary |
| 284 | +# review . School officials identified student during investigation and the person |
| 285 | +# admitted to hanging the noose, Duke says . The noose, made of rope, was discovered on |
| 286 | +# campus about 2 a.m. |
| 287 | +# |
| 288 | + |
| 289 | + |
| 290 | +####################################################################### |
| 291 | +# Generate Sentiment Classifications |
| 292 | +# ---------------------------------- |
| 293 | +# |
| 294 | +# Similarly, we can use the model to generate sentiment classifications on the first batch of reviews from the IMDB test set |
| 295 | +# using a beam size of 1. |
| 296 | +# |
| 297 | + |
# Pull the first batch of prefixed reviews and their label names.
batch = next(iter(imdb_dataloader))
input_text = batch["text"]
target = batch["label"]
beam_size = 1  # beam size of 1 == greedy search

# Numericalize, generate token IDs greedily, then decode back to text.
model_input = transform(input_text)
model_output = sequence_generator.generate(model_input, eos_idx=eos_idx, num_beams=beam_size)
output_text = transform.decode(model_output.tolist())

examples = zip(input_text, output_text, target)
for idx, (review, prediction, label) in enumerate(examples, start=1):
    print(f"Example {idx}:\n")
    print(f"input_text: {review}\n")
    print(f"prediction: {prediction}\n")
    print(f"target: {label}\n\n")
| 312 | + |
| 313 | + |
| 314 | +####################################################################### |
| 315 | +# Sentiment Output |
| 316 | +# ---------------- |
| 317 | +# |
| 318 | +# :: |
| 319 | +# |
| 320 | +# Example 1: |
| 321 | +# |
| 322 | +# input_text: sst2 sentence: I love sci-fi and am willing to put up with a lot. Sci-fi |
| 323 | +# movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like |
| 324 | +# this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). |
| 325 | +# Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the |
| 326 | +# background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' |
| 327 | +# setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. |
| 328 | +# It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character |
| 329 | +# development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may |
| 330 | +# treat important issues, yet not as a serious philosophy. It's really difficult to care about |
| 331 | +# the characters here as they are not simply foolish, just missing a spark of life. Their |
| 332 | +# actions and reactions are wooden and predictable, often painful to watch. The makers of Earth |
| 333 | +# KNOW it's rubbish as they have to always say "Gene Roddenberry's Earth..." otherwise people |
| 334 | +# would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, |
| 335 | +# cheap, poorly edited (watching it without advert breaks really brings this home) trudging |
| 336 | +# Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring |
| 337 | +# him back as another actor. Jeeez. Dallas all over again. |
| 338 | +# |
| 339 | +# prediction: negative |
| 340 | +# |
| 341 | +# target: negative |
| 342 | +# |
| 343 | +# |
| 344 | +# Example 2: |
| 345 | +# |
| 346 | +# input_text: sst2 sentence: Worth the entertainment value of a rental, especially if you like |
| 347 | +# action movies. This one features the usual car chases, fights with the great Van Damme kick |
| 348 | +# style, shooting battles with the 40 shell load shotgun, and even terrorist style bombs. All |
| 349 | +# of this is entertaining and competently handled but there is nothing that really blows you |
| 350 | +# away if you've seen your share before.<br /><br />The plot is made interesting by the |
| 351 | +# inclusion of a rabbit, which is clever but hardly profound. Many of the characters are |
| 352 | +# heavily stereotyped -- the angry veterans, the terrified illegal aliens, the crooked cops, |
| 353 | +# the indifferent feds, the bitchy tough lady station head, the crooked politician, the fat |
| 354 | +# federale who looks like he was typecast as the Mexican in a Hollywood movie from the 1940s. |
| 355 | +# All passably acted but again nothing special.<br /><br />I thought the main villains were |
| 356 | +# pretty well done and fairly well acted. By the end of the movie you certainly knew who the |
| 357 | +# good guys were and weren't. There was an emotional lift as the really bad ones got their just |
| 358 | +# deserts. Very simplistic, but then you weren't expecting Hamlet, right? The only thing I found |
| 359 | +# really annoying was the constant cuts to VDs daughter during the last fight scene.<br /><br /> |
| 360 | +# Not bad. Not good. Passable 4. |
| 361 | +# |
| 362 | +# prediction: positive |
| 363 | +# |
| 364 | +# target: negative |
| 365 | +# |
| 366 | +# |
| 367 | +# Example 3: |
| 368 | +# |
| 369 | +# input_text: sst2 sentence: its a totally average film with a few semi-alright action sequences |
| 370 | +# that make the plot seem a little better and remind the viewer of the classic van dam films. |
| 371 | +# parts of the plot don't make sense and seem to be added in to use up time. the end plot is that |
| 372 | +# of a very basic type that doesn't leave the viewer guessing and any twists are obvious from the |
| 373 | +# beginning. the end scene with the flask backs don't make sense as they are added in and seem to |
| 374 | +# have little relevance to the history of van dam's character. not really worth watching again, |
| 375 | +# bit disappointed in the end production, even though it is apparent it was shot on a low budget |
| 376 | +# certain shots and sections in the film are of poor directed quality. |
| 377 | +# |
| 378 | +# prediction: negative |
| 379 | +# |
| 380 | +# target: negative |
| 381 | +# |
| 382 | + |
| 383 | + |
| 384 | +####################################################################### |
| 385 | +# Generate Translations |
| 386 | +# --------------------- |
| 387 | +# |
| 388 | +# Finally, we can also use the model to generate English to German translations on the first batch of examples from the Multi30k |
| 389 | +# test set. |
| 390 | +# |
| 391 | + |
# Pull the first batch of prefixed English sentences and German references.
batch = next(iter(multi_dataloader))
input_text = batch["english"]
target = batch["german"]
# Define the beam size explicitly: previously this section silently reused
# the `beam_size` set in the summarization section far above, which would
# raise NameError if this section were run on its own.
beam_size = 1  # beam size of 1 == greedy search

# Numericalize, generate token IDs greedily, then decode back to text.
model_input = transform(input_text)
model_output = sequence_generator.generate(model_input, eos_idx=eos_idx, num_beams=beam_size)
output_text = transform.decode(model_output.tolist())

for i in range(multi_batch_size):
    print(f"Example {i+1}:\n")
    print(f"input_text: {input_text[i]}\n")
    print(f"prediction: {output_text[i]}\n")
    print(f"target: {target[i]}\n\n")
| 405 | + |
| 406 | + |
| 407 | +####################################################################### |
| 408 | +# Translation Output |
| 409 | +# ------------------ |
| 410 | +# |
| 411 | +# :: |
| 412 | +# |
| 413 | +# Example 1: |
| 414 | +# |
| 415 | +# input_text: translate English to German: A man in an orange hat starring at something. |
| 416 | +# |
| 417 | +# prediction: Ein Mann in einem orangen Hut, der an etwas schaut. |
| 418 | +# |
| 419 | +# target: Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt. |
| 420 | +# |
| 421 | +# |
| 422 | +# Example 2: |
| 423 | +# |
| 424 | +# input_text: translate English to German: A Boston Terrier is running on lush green grass in front of a white fence. |
| 425 | +# |
| 426 | +# prediction: Ein Boston Terrier läuft auf üppigem grünem Gras vor einem weißen Zaun. |
| 427 | +# |
| 428 | +# target: Ein Boston Terrier läuft über saftig-grünes Gras vor einem weißen Zaun. |
| 429 | +# |
| 430 | +# |
| 431 | +# Example 3: |
| 432 | +# |
| 433 | +# input_text: translate English to German: A girl in karate uniform breaking a stick with a front kick. |
| 434 | +# |
| 435 | +# prediction: Ein Mädchen in Karate-Uniform bricht einen Stöck mit einem Frontkick. |
| 436 | +# |
| 437 | +# target: Ein Mädchen in einem Karateanzug bricht ein Brett mit einem Tritt. |
| 438 | +# |
| 439 | +# |
| 440 | +# Example 4: |
| 441 | +# |
| 442 | +# input_text: translate English to German: Five people wearing winter jackets and helmets stand in the snow, with snowmobiles in the background. |
| 443 | +# |
| 444 | +# prediction: Fünf Menschen mit Winterjacken und Helmen stehen im Schnee, mit Schneemobilen im Hintergrund. |
| 445 | +# |
| 446 | +# target: Fünf Leute in Winterjacken und mit Helmen stehen im Schnee mit Schneemobilen im Hintergrund. |
| 447 | +# |
| 448 | +# |
| 449 | +# Example 5: |
| 450 | +# |
| 451 | +# input_text: translate English to German: People are fixing the roof of a house. |
| 452 | +# |
| 453 | +# prediction: Die Leute fixieren das Dach eines Hauses. |
| 454 | +# |
| 455 | +# target: Leute Reparieren das Dach eines Hauses. |
| 456 | +# |
0 commit comments