# https://github.com/huggingface/transformers

# https://huggingface.co/transformers/quickstart.html
# BERT example

# pip install transformers
# (replaces the older pytorch_transformers package)

import os

import torch
import torch.nn as nn
from transformers import BertConfig, BertModel, BertTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# get_bert_model
# BERT pretrained model files:
#   pytorch_model.bin
#   bert_config.json
#   vocab.txt
bert_path = './bert'
do_lower_case = True

bert_config_file = os.path.join(bert_path, 'bert_config.json')
vocab_file = os.path.join(bert_path, 'vocab.txt')
init_checkpoint = os.path.join(bert_path, 'pytorch_model.bin')

# Load the configuration
bert_config = BertConfig.from_json_file(bert_config_file)

# Load the vocabulary
tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

# Load the model
model_bert = BertModel.from_pretrained(bert_path, config=bert_config)
model_bert.to(device)
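# Optional alternative (a sketch, not part of the original flow): with the standard Hugging Face
# file layout, the tokenizer and model can also be loaded in one call from the directory, or from
# a hub model name such as 'bert-base-chinese' (illustrative name; downloading requires network access):
#   tokenizer = BertTokenizer.from_pretrained(bert_path)
#   model_bert = BertModel.from_pretrained('bert-base-chinese')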


# Tokenize input
text = "乌兹别克斯坦议会立法院主席获连任"
tokenized_text = tokenizer.tokenize(text)
tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']
# Convert tokens to vocabulary indices
# input_ids: a torch.LongTensor of shape [batch_size, sequence_length] holding the token indices in the vocabulary
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define the sentence A and B indices associated with the 1st and 2nd sentences (see the BERT paper)
# segment_ids: an optional torch.LongTensor of shape [batch_size, sequence_length] with token type
# indices in [0, 1]; type 0 corresponds to sentence A, type 1 to sentence B.
segment_ids = [0] * len(input_ids)
# input_mask: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices in [0, 1].
input_mask = [1] * len(input_ids)
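# Sentence-pair sketch (illustration only; these variables are not reused below, and the second
# sentence is just a placeholder): token type ids are 0 for sentence A, including its [SEP],
# and 1 for sentence B.
text_a = "乌兹别克斯坦议会立法院主席获连任"
text_b = "他将继续担任这一职务"
tokens_a = ['[CLS]'] + tokenizer.tokenize(text_a) + ['[SEP]']
tokens_b = tokenizer.tokenize(text_b) + ['[SEP]']
pair_input_ids = tokenizer.convert_tokens_to_ids(tokens_a + tokens_b)
pair_segment_ids = [0] * len(tokens_a) + [1] * len(tokens_b)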

# Convert inputs to PyTorch tensors
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
print("input_ids", input_ids.size())
input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)  # attention_mask; may be omitted
segments_tensors = torch.tensor([segment_ids], dtype=torch.long).to(device)
# Output
all_encoder_layer, pooled_output = model_bert(input_ids, attention_mask=input_mask, token_type_ids=segments_tensors)
# all_encoder_layer: a torch.FloatTensor of shape [batch_size, sequence_length, hidden_size],
# the hidden states of the last encoder layer. (In the older pytorch_pretrained_bert API this was
# a list with the full sequence of hidden states after every attention block: 12 tensors for
# BERT-base, 24 for BERT-large.)
# pooled_output: a torch.FloatTensor of shape [batch_size, hidden_size],
# the output of a classifier pretrained on top of the hidden state of the first token ([CLS]),
# trained on the next-sentence prediction task (see the BERT paper)
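# To get the hidden states of every layer (the old pytorch_pretrained_bert behaviour described
# above), transformers can return them as an additional output when the config sets
# output_hidden_states=True. A minimal sketch, assuming a transformers 2.x-style API:
#   config_all = BertConfig.from_json_file(bert_config_file)
#   config_all.output_hidden_states = True
#   model_all = BertModel.from_pretrained(bert_path, config=config_all)
#   sequence_output, pooled, hidden_states = model_all(input_ids, attention_mask=input_mask)
#   # hidden_states: a tuple of (num_layers + 1) tensors, each [batch_size, seq_len, hidden_size]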

# To get token embeddings, use only all_encoder_layer
print('all_encoder_layer', all_encoder_layer.shape)
print('pooled_output', pooled_output.size())
# For classification, use pooled_output
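# Minimal sketch of both uses (illustration only: the linear head is untrained and
# num_labels = 2 is an assumed value, not from the original script).
# Sentence embedding: mean-pool all_encoder_layer over the real (mask == 1) tokens.
mask = input_mask.unsqueeze(-1).float()                            # [batch, seq_len, 1]
sent_embedding = (all_encoder_layer * mask).sum(1) / mask.sum(1)   # [batch, hidden_size]
# Classification: a linear head on top of pooled_output.
num_labels = 2
classifier = nn.Linear(bert_config.hidden_size, num_labels).to(device)
logits = classifier(pooled_output)                                 # [batch, num_labels]
print('sent_embedding', sent_embedding.size(), 'logits', logits.size())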

# Padding
max_seq_length = 300

text = "乌兹别克斯坦议会立法院主席获连任"
tokenized_text = tokenizer.tokenize(text)
tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
input_mask = [1] * len(input_ids)

padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)
print("padding input_ids", input_ids.size())

model_bert.eval()
with torch.no_grad():
    all_encoder_layer, pooled_output = model_bert(input_ids, attention_mask=input_mask)
    print('padding all_encoder_layer', all_encoder_layer.shape)
    print('padding pooled_output', pooled_output.size())
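
# Batch sketch (illustration only; the sentence list is a placeholder): pad every sequence to the
# same length and stack the ids and masks so the model runs once over the whole batch.
texts = ["乌兹别克斯坦议会立法院主席获连任", "今天天气不错"]
batch_ids, batch_mask = [], []
for t in texts:
    tokens = ['[CLS]'] + tokenizer.tokenize(t) + ['[SEP]']
    ids = tokenizer.convert_tokens_to_ids(tokens)
    pad = [0] * (max_seq_length - len(ids))
    batch_ids.append(ids + pad)
    batch_mask.append([1] * len(ids) + pad)
batch_ids = torch.tensor(batch_ids, dtype=torch.long).to(device)
batch_mask = torch.tensor(batch_mask, dtype=torch.long).to(device)
with torch.no_grad():
    batch_output, batch_pooled = model_bert(batch_ids, attention_mask=batch_mask)
print('batch output', batch_output.shape, 'batch pooled', batch_pooled.size())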