diff --git a/NLP/BERT/utils/lamb_optimizer.py b/NLP/BERT/utils/lamb_optimizer.py
index f4b1bae4e..127b6f1df 100644
--- a/NLP/BERT/utils/lamb_optimizer.py
+++ b/NLP/BERT/utils/lamb_optimizer.py
@@ -17,7 +17,7 @@
 from typing import Callable, Dict, Iterator, List, Union, Tuple
 
 import oneflow as flow
-from oneflow.nn.optimizer.optimizer import Optimizer, ParamGroup
+from oneflow.optim import Optimizer
 from oneflow.nn.parameter import Parameter
 
 
diff --git a/NLP/BERT/utils/lr_scheduler.py b/NLP/BERT/utils/lr_scheduler.py
index 60b57bbdc..9ee91c9ae 100644
--- a/NLP/BERT/utils/lr_scheduler.py
+++ b/NLP/BERT/utils/lr_scheduler.py
@@ -1,6 +1,6 @@
 import math
 import oneflow as flow
-from oneflow.nn.optimizer.lr_scheduler import LrScheduler
+from oneflow.optim.lr_scheduler import _LRScheduler as LrScheduler
 
 
 class PolynomialLR(LrScheduler):
@@ -50,19 +50,18 @@ def __init__(
         self.cycle = cycle
         super().__init__(optimizer, last_step, verbose)
 
-    def get_lr(self):
+    def get_lr(self, base_lr, step):
         decay_batch = self.max_decay_steps
-        cur_batch = self.last_step
+        cur_batch = step
         if self.cycle:
+            if cur_batch == 0:
+                cur_batch = 1
             decay_batch = decay_batch * math.ceil(cur_batch / decay_batch)
         else:
             cur_batch = min(cur_batch, decay_batch)
-        return [
-            (base_lr - self.end_learning_rate)
-            * ((1 - cur_batch / decay_batch) ** (self.power))
-            + self.end_learning_rate
-            for base_lr in self.base_lrs
-        ]
+
+        factor = (1 - cur_batch / decay_batch) ** (self.power)
+        return (base_lr - self.end_learning_rate) * factor + self.end_learning_rate
 
     def _generate_conf_for_graph(self, opt_confs):
         # CosineDecayLR is the same as CosineDecayConf in nn.Graph
diff --git a/NLP/GPT2/model.py b/NLP/GPT2/model.py
index 3c8346cba..0197e4524 100644
--- a/NLP/GPT2/model.py
+++ b/NLP/GPT2/model.py
@@ -124,7 +124,7 @@ def _merge_heads(self, tensor, num_heads, attn_head_size):
         bsz, seq_len = tensor.size()[:-2]
         # new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
         new_shape = (bsz, seq_len, num_heads * attn_head_size)
-        return tensor.view(*new_shape)
+        return tensor.reshape(*new_shape)
 
     def forward(self, hidden_states, layer_past=None, use_cache=False):
         hidden_states = self.c_attn(hidden_states)
@@ -356,8 +356,8 @@ def forward(
 
             # Flatten the tokens
             loss_fct = nn.CrossEntropyLoss()
-            shift_logits = shift_logits.view(-1, shift_logits.size(-1))
-            shift_labels = shift_labels.view(-1)
+            shift_logits = shift_logits.reshape(-1, shift_logits.size(-1))
+            shift_labels = shift_labels.reshape(-1)
             loss = loss_fct(shift_logits, shift_labels)
 
         output = (lm_logits,) + transformer_outputs[1:]
diff --git a/NLP/GPT2/trainer.py b/NLP/GPT2/trainer.py
index 24ec2f3be..1b47b2038 100644
--- a/NLP/GPT2/trainer.py
+++ b/NLP/GPT2/trainer.py
@@ -119,7 +119,7 @@ def train_single_epoch(self, data_loader, epoch):
                    "step": step,
                    "avg_loss": losses.avg,
                    "loss": losses.val,
-                   "lr": self.lr_scheduler.get_lr()[0],
+                   "lr": self.lr_scheduler.get_last_lr()[0],
                }
                data_iter.set_postfix(logging)
 
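
Note (not part of the patch): a minimal standalone sketch of the decay curve that the reworked PolynomialLR.get_lr(base_lr, step) computes, useful for sanity-checking the schedule outside of OneFlow. The helper name poly_lr and its default arguments are illustrative assumptions, not code from this repository.

import math

def poly_lr(base_lr, step, max_decay_steps, end_lr=0.0, power=1.0, cycle=False):
    # Mirrors the patched get_lr: polynomial decay from base_lr toward end_lr.
    decay_batch = max_decay_steps
    cur_batch = step
    if cycle:
        # Stretch the decay horizon to the next multiple of max_decay_steps,
        # so the rate ramps back up and decays again instead of flooring at end_lr.
        if cur_batch == 0:
            cur_batch = 1
        decay_batch = decay_batch * math.ceil(cur_batch / decay_batch)
    else:
        # Clamp so the learning rate stays at end_lr after max_decay_steps.
        cur_batch = min(cur_batch, decay_batch)
    factor = (1 - cur_batch / decay_batch) ** power
    return (base_lr - end_lr) * factor + end_lr

# Example with base_lr=1e-4, max_decay_steps=1000, power=1.0, cycle=False:
#   step 0    -> 1.0e-4
#   step 500  -> 5.0e-5
#   step 1000 -> 0.0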