diff --git a/NLP/BERT/utils/lamb_optimizer.py b/NLP/BERT/utils/lamb_optimizer.py
index f4b1bae4e..127b6f1df 100644
--- a/NLP/BERT/utils/lamb_optimizer.py
+++ b/NLP/BERT/utils/lamb_optimizer.py
@@ -17,7 +17,7 @@
 from typing import Callable, Dict, Iterator, List, Union, Tuple
 
 import oneflow as flow
-from oneflow.nn.optimizer.optimizer import Optimizer, ParamGroup
+from oneflow.optim import Optimizer
 from oneflow.nn.parameter import Parameter
 
 
diff --git a/NLP/BERT/utils/lr_scheduler.py b/NLP/BERT/utils/lr_scheduler.py
index 60b57bbdc..9ee91c9ae 100644
--- a/NLP/BERT/utils/lr_scheduler.py
+++ b/NLP/BERT/utils/lr_scheduler.py
@@ -1,6 +1,6 @@
 import math
 import oneflow as flow
-from oneflow.nn.optimizer.lr_scheduler import LrScheduler
+from oneflow.optim.lr_scheduler import _LRScheduler as LrScheduler
 
 
 class PolynomialLR(LrScheduler):
@@ -50,19 +50,18 @@ def __init__(
         self.cycle = cycle
         super().__init__(optimizer, last_step, verbose)
 
-    def get_lr(self):
+    def get_lr(self, base_lr, step):
         decay_batch = self.max_decay_steps
-        cur_batch = self.last_step
+        cur_batch = step
         if self.cycle:
+            if cur_batch == 0:
+                cur_batch = 1
             decay_batch = decay_batch * math.ceil(cur_batch / decay_batch)
         else:
             cur_batch = min(cur_batch, decay_batch)
-        return [
-            (base_lr - self.end_learning_rate)
-            * ((1 - cur_batch / decay_batch) ** (self.power))
-            + self.end_learning_rate
-            for base_lr in self.base_lrs
-        ]
+
+        factor = (1 - cur_batch / decay_batch) ** (self.power)
+        return (base_lr - self.end_learning_rate) * factor + self.end_learning_rate
 
     def _generate_conf_for_graph(self, opt_confs):
         # CosineDecayLR is the same as CosineDecayConf in nn.Graph
diff --git a/NLP/GPT2/model.py b/NLP/GPT2/model.py
index 3c8346cba..0197e4524 100644
--- a/NLP/GPT2/model.py
+++ b/NLP/GPT2/model.py
@@ -124,7 +124,7 @@ def _merge_heads(self, tensor, num_heads, attn_head_size):
         bsz, seq_len = tensor.size()[:-2]
         # new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
         new_shape = (bsz, seq_len, num_heads * attn_head_size)
-        return tensor.view(*new_shape)
+        return tensor.reshape(*new_shape)
 
     def forward(self, hidden_states, layer_past=None, use_cache=False):
         hidden_states = self.c_attn(hidden_states)
@@ -356,8 +356,8 @@ def forward(
 
             # Flatten the tokens
             loss_fct = nn.CrossEntropyLoss()
-            shift_logits = shift_logits.view(-1, shift_logits.size(-1))
-            shift_labels = shift_labels.view(-1)
+            shift_logits = shift_logits.reshape(-1, shift_logits.size(-1))
+            shift_labels = shift_labels.reshape(-1)
             loss = loss_fct(shift_logits, shift_labels)
 
         output = (lm_logits,) + transformer_outputs[1:]
diff --git a/NLP/GPT2/trainer.py b/NLP/GPT2/trainer.py
index 24ec2f3be..1b47b2038 100644
--- a/NLP/GPT2/trainer.py
+++ b/NLP/GPT2/trainer.py
@@ -119,7 +119,7 @@ def train_single_epoch(self, data_loader, epoch):
                    "step": step,
                    "avg_loss": losses.avg,
                    "loss": losses.val,
-                   "lr": self.lr_scheduler.get_lr()[0],
+                   "lr": self.lr_scheduler.get_last_lr()[0],
                }
                data_iter.set_postfix(logging)
 
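
Note (not part of the patch): a minimal standalone sketch of the decay curve that the reworked PolynomialLR.get_lr(base_lr, step) computes, useful for sanity-checking the schedule outside of OneFlow. The helper name poly_lr and its default arguments are illustrative assumptions, not code from this repository.

import math

def poly_lr(base_lr, step, max_decay_steps, end_lr=0.0, power=1.0, cycle=False):
    # Mirrors the patched get_lr: polynomial decay from base_lr toward end_lr.
    decay_batch = max_decay_steps
    cur_batch = step
    if cycle:
        # Stretch the decay horizon to the next multiple of max_decay_steps,
        # so the rate ramps back up and decays again instead of flooring at end_lr.
        if cur_batch == 0:
            cur_batch = 1
        decay_batch = decay_batch * math.ceil(cur_batch / decay_batch)
    else:
        # Clamp so the learning rate stays at end_lr after max_decay_steps.
        cur_batch = min(cur_batch, decay_batch)
    factor = (1 - cur_batch / decay_batch) ** power
    return (base_lr - end_lr) * factor + end_lr

# Example with base_lr=1e-4, max_decay_steps=1000, power=1.0, cycle=False:
#   step 0    -> 1.0e-4
#   step 500  -> 5.0e-5
#   step 1000 -> 0.0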