From 307001d417e8e3e36243db2723eb5118a0dee027 Mon Sep 17 00:00:00 2001
From: songyicheng
Date: Mon, 26 Dec 2022 02:42:09 +0000
Subject: [PATCH 1/3] fix tensor view bug

---
 NLP/GPT2/model.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/NLP/GPT2/model.py b/NLP/GPT2/model.py
index 3c8346cba..0197e4524 100644
--- a/NLP/GPT2/model.py
+++ b/NLP/GPT2/model.py
@@ -124,7 +124,7 @@ def _merge_heads(self, tensor, num_heads, attn_head_size):
         bsz, seq_len = tensor.size()[:-2]
         # new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
         new_shape = (bsz, seq_len, num_heads * attn_head_size)
-        return tensor.view(*new_shape)
+        return tensor.reshape(*new_shape)
 
     def forward(self, hidden_states, layer_past=None, use_cache=False):
         hidden_states = self.c_attn(hidden_states)
@@ -356,8 +356,8 @@ def forward(
 
             # Flatten the tokens
             loss_fct = nn.CrossEntropyLoss()
-            shift_logits = shift_logits.view(-1, shift_logits.size(-1))
-            shift_labels = shift_labels.view(-1)
+            shift_logits = shift_logits.reshape(-1, shift_logits.size(-1))
+            shift_labels = shift_labels.reshape(-1)
             loss = loss_fct(shift_logits, shift_labels)
 
         output = (lm_logits,) + transformer_outputs[1:]

From 2c8acd8bcd5f2dddae3041de5666162a1e44291b Mon Sep 17 00:00:00 2001
From: songyicheng
Date: Mon, 16 Jan 2023 05:59:46 +0000
Subject: [PATCH 2/3] fix gpt2 trainer get_lr bug

---
 NLP/GPT2/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NLP/GPT2/trainer.py b/NLP/GPT2/trainer.py
index 24ec2f3be..1b47b2038 100644
--- a/NLP/GPT2/trainer.py
+++ b/NLP/GPT2/trainer.py
@@ -119,7 +119,7 @@ def train_single_epoch(self, data_loader, epoch):
                 "step": step,
                 "avg_loss": losses.avg,
                 "loss": losses.val,
-                "lr": self.lr_scheduler.get_lr()[0],
+                "lr": self.lr_scheduler.get_last_lr()[0],
             }
             data_iter.set_postfix(logging)
 

From 5806ee9d0788e3886003c0355c31e0d37a6d92ec Mon Sep 17 00:00:00 2001
From: songyicheng
Date: Mon, 16 Jan 2023 06:15:27 +0000
Subject: [PATCH 3/3] fix bert lr optim bug

---
 NLP/BERT/utils/lamb_optimizer.py |  2 +-
 NLP/BERT/utils/lr_scheduler.py   | 17 ++++++++---------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/NLP/BERT/utils/lamb_optimizer.py b/NLP/BERT/utils/lamb_optimizer.py
index f4b1bae4e..127b6f1df 100644
--- a/NLP/BERT/utils/lamb_optimizer.py
+++ b/NLP/BERT/utils/lamb_optimizer.py
@@ -17,7 +17,7 @@
 from typing import Callable, Dict, Iterator, List, Union, Tuple
 
 import oneflow as flow
-from oneflow.nn.optimizer.optimizer import Optimizer, ParamGroup
+from oneflow.optim import Optimizer
 from oneflow.nn.parameter import Parameter
 
 
diff --git a/NLP/BERT/utils/lr_scheduler.py b/NLP/BERT/utils/lr_scheduler.py
index 60b57bbdc..9ee91c9ae 100644
--- a/NLP/BERT/utils/lr_scheduler.py
+++ b/NLP/BERT/utils/lr_scheduler.py
@@ -1,6 +1,6 @@
 import math
 import oneflow as flow
-from oneflow.nn.optimizer.lr_scheduler import LrScheduler
+from oneflow.optim.lr_scheduler import _LRScheduler as LrScheduler
 
 
 class PolynomialLR(LrScheduler):
@@ -50,19 +50,18 @@ def __init__(
         self.cycle = cycle
         super().__init__(optimizer, last_step, verbose)
 
-    def get_lr(self):
+    def get_lr(self, base_lr, step):
         decay_batch = self.max_decay_steps
-        cur_batch = self.last_step
+        cur_batch = step
         if self.cycle:
+            if cur_batch == 0:
+                cur_batch = 1
             decay_batch = decay_batch * math.ceil(cur_batch / decay_batch)
         else:
             cur_batch = min(cur_batch, decay_batch)
-        return [
-            (base_lr - self.end_learning_rate)
-            * ((1 - cur_batch / decay_batch) ** (self.power))
-            + self.end_learning_rate
-            for base_lr in self.base_lrs
-        ]
+
+        factor = (1 - cur_batch / decay_batch) ** (self.power)
+        return (base_lr - self.end_learning_rate) * factor + self.end_learning_rate
 
     def _generate_conf_for_graph(self, opt_confs):
         # CosineDecayLR is the same as CosineDecayConf in nn.Graph
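
Note on patch 1/3 (not part of the patches themselves): _merge_heads() and the
shifted logits/labels operate on tensors that are generally non-contiguous (the
attention output has just been transposed, and the labels have been sliced), so
view() can fail while reshape() falls back to a copy when needed. A minimal
sketch of that failure mode, assuming OneFlow's PyTorch-aligned tensor API
(flow.randn, transpose, reshape); the shapes are illustrative only:

    import oneflow as flow

    bsz, num_heads, seq_len, head_size = 2, 12, 8, 64
    # attention output laid out as (bsz, num_heads, seq_len, head_size)
    attn_output = flow.randn(bsz, num_heads, seq_len, head_size)

    # _merge_heads-style reordering: bring num_heads next to head_size
    merged = attn_output.transpose(1, 2)  # (bsz, seq_len, num_heads, head_size), non-contiguous

    # merged.view(bsz, seq_len, num_heads * head_size)  # would typically fail: view() needs contiguous memory
    out = merged.reshape(bsz, seq_len, num_heads * head_size)  # copies if the layout requires it
    print(out.shape)  # (2, 8, 768)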
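Note on patches 2/3 and 3/3 (not part of the patches themselves): the BERT
utilities are moved onto the oneflow.optim interfaces, and PolynomialLR.get_lr()
is rewritten to the per-base-learning-rate signature get_lr(base_lr, step),
returning a single value instead of a list over self.base_lrs; the new
cur_batch == 0 guard keeps the cycling branch from collapsing decay_batch to
zero at step 0. Correspondingly, the GPT2 trainer reads the current rate through
get_last_lr(), which reports the most recently computed values instead of
calling get_lr() directly. The decay rule itself is unchanged; a standalone
sketch of it (polynomial_lr is a hypothetical helper written only for
illustration):

    import math

    def polynomial_lr(base_lr, step, max_decay_steps, end_lr=0.0, power=1.0, cycle=False):
        decay_batch = max_decay_steps
        cur_batch = step
        if cycle:
            # restart the decay every max_decay_steps; the guard avoids ceil(0)
            # turning decay_batch into zero on the very first step
            if cur_batch == 0:
                cur_batch = 1
            decay_batch = decay_batch * math.ceil(cur_batch / decay_batch)
        else:
            cur_batch = min(cur_batch, decay_batch)
        factor = (1 - cur_batch / decay_batch) ** power
        return (base_lr - end_lr) * factor + end_lr

    # e.g. base_lr=1e-4, 1000 decay steps, power=1: at step 250 the factor is 0.75,
    # so the learning rate is 7.5e-5; after step 1000 it stays at end_lr.
    print(polynomial_lr(1e-4, 250, 1000))  # 7.5e-05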