Commit fa59b86

Author: chongjiu.jin
Committed: first commit
1 parent b75ea51 commit fa59b86

File tree

11 files changed: +200426 -0 lines changed

README.md

+22
@@ -0,0 +1,22 @@
### Stanford / Winter 2019

Download the pre-trained BERT files:

Google TensorFlow BERT (needs to be converted to the PyTorch format):

https://github.com/google-research/bert

https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip

Chinese BERT:

https://github.com/ymcui/Chinese-BERT-wwm/blob/master/README_EN.md

For questions about NLP job interviews, follow the WeChat official account:

![flypython WeChat official account](https://flypython.com/images/wechat.png)
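A minimal sketch (not part of the original README) of fetching the multilingual checkpoint listed above and unpacking it with only the Python standard library; the TensorFlow weights still need the conversion step in bert/run.sh before the scripts below can load them.

import urllib.request
import zipfile

url = "https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip"
zip_path = "multilingual_L-12_H-768_A-12.zip"

urllib.request.urlretrieve(url, zip_path)   # download the archive
with zipfile.ZipFile(zip_path) as zf:
    zf.extractall(".")                      # unpacks into its own folder
# The TensorFlow checkpoint inside still has to be converted to
# pytorch_model.bin (see bert/run.sh below); the converted files
# (pytorch_model.bin, bert_config.json, vocab.txt) go under ./bert.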

bert-example.py

+89
@@ -0,0 +1,89 @@
#https://github.com/huggingface/transformers
#https://huggingface.co/transformers/quickstart.html
#BERT example

#pip install transformers
#(the older package was called pytorch_transformers)

import torch
import torch.nn as nn
from transformers import BertConfig, BertModel
from transformers.tokenization_bert import BertTokenizer as tokenization
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#get_bert_model
# pre-trained BERT files:
# pytorch_model.bin
# config.json
# vocab.txt
bert_path = './bert'
do_lower_case = True

bert_config_file = os.path.join(bert_path, 'bert_config.json')
vocab_file = os.path.join(bert_path, 'vocab.txt')
init_checkpoint = os.path.join(bert_path, 'pytorch_model.bin')

# Load the configuration
bert_config = BertConfig.from_json_file(bert_config_file)

# Load the vocabulary / tokenizer
tokenizer = tokenization(vocab_file=vocab_file, do_lower_case=do_lower_case)

# Load the model
model_bert = BertModel.from_pretrained(bert_path, config=bert_config)
model_bert.to(device)


# Tokenize input
text = "乌兹别克斯坦议会立法院主席获连任"  # "The speaker of the Uzbek parliament's Legislative Chamber is re-elected"
tokenized_text = tokenizer.tokenize(text)
tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']
# Convert tokens to vocabulary indices
# input_ids: a torch.LongTensor of shape [batch_size, sequence_length] holding the token indices in the vocabulary
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
# segment_ids: an optional torch.LongTensor of shape [batch_size, sequence_length] with token type indices in [0, 1];
# type 0 corresponds to sentence A, type 1 to sentence B
segment_ids = [0] * len(input_ids)
# input_mask: an optional torch.LongTensor of shape [batch_size, sequence_length] with values in [0, 1]
input_mask = [1] * len(input_ids)

# Convert inputs to PyTorch tensors
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
print("input_ids", input_ids.size())
input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)  # attention_mask; optional
segments_tensors = torch.tensor([segment_ids], dtype=torch.long).to(device)
# Forward pass
all_encoder_layer, pooled_output = model_bert(input_ids, input_mask, token_type_ids=segments_tensors)
# all_encoder_layer: a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
# the full sequence of hidden states from the last encoder layer
# pooled_output: a torch.FloatTensor of size [batch_size, hidden_size],
# the output of a classifier pre-trained on top of the hidden state of the first ([CLS]) token,
# trained for the Next Sentence Prediction task (see the BERT paper)

# For token-level embedding representations, use all_encoder_layer
print('all_encoder_layer', all_encoder_layer.shape)
print('pooled_output', pooled_output.size())
# For classification, use pooled_output

# Padding example
max_seq_length = 300

text = "乌兹别克斯坦议会立法院主席获连任"
tokenized_text = tokenizer.tokenize(text)
tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
input_mask = [1] * len(input_ids)

padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)
print("padding input_ids", input_ids.size())

model_bert.eval()
with torch.no_grad():
    all_encoder_layer, pooled_output = model_bert(input_ids, attention_mask=input_mask)
print('padding all_encoder_layer', all_encoder_layer.shape)
print('padding pooled_output', pooled_output.size())
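Where the comments above say to use all_encoder_layer for embeddings, one common follow-up is masked mean pooling over the token dimension; this sketch is added here for illustration and reuses the all_encoder_layer and input_mask tensors from the padded example.

# Sketch: collapse the last-layer hidden states into a single sentence vector,
# averaging only over non-padded positions (assumes the tensors built above).
mask = input_mask.unsqueeze(-1).float()          # [1, max_seq_length, 1]
summed = (all_encoder_layer * mask).sum(dim=1)   # sum over real tokens only
counts = mask.sum(dim=1).clamp(min=1.0)          # number of real tokens
sentence_embedding = summed / counts             # [1, hidden_size]
print('sentence_embedding', sentence_embedding.size())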

bert.py

+48
@@ -0,0 +1,48 @@
# coding: UTF-8
import torch
import torch.nn as nn
# from pytorch_pretrained_bert import BertModel, BertTokenizer
from transformers import BertModel, BertTokenizer, BertConfig
import os

class Config(object):

    """Configuration parameters"""
    def __init__(self, dataset):
        self.model_name = 'bert'
        self.train_path = dataset + '/data/train.txt'    # training set
        self.dev_path = dataset + '/data/dev.txt'        # validation set
        self.test_path = dataset + '/data/test.txt'      # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt').readlines()]    # class names
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # saved model checkpoint
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # device

        self.require_improvement = 1000                  # stop training early if no improvement after 1000 batches
        self.num_classes = len(self.class_list)          # number of classes
        self.num_epochs = 3                              # number of epochs
        self.batch_size = 128                            # mini-batch size
        self.pad_size = 32                               # length every sentence is padded or truncated to
        self.learning_rate = 5e-5                        # learning rate
        self.bert_path = './bert'
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.hidden_size = 768


class Model(nn.Module):

    def __init__(self, config):
        super(Model, self).__init__()
        bert_config_file = os.path.join(config.bert_path, 'bert_config.json')
        bert_config = BertConfig.from_json_file(bert_config_file)
        self.bert = BertModel.from_pretrained(config.bert_path, config=bert_config)
        for param in self.bert.parameters():
            param.requires_grad = True
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        context = x[0]  # input sentence token ids
        mask = x[2]     # mask over the padding, same size as the sentence; padded positions are 0, e.g. [1, 1, 1, 1, 0, 0]
        _, pooled = self.bert(context, attention_mask=mask)
        out = self.fc(pooled)
        return out
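A hedged usage sketch (the dummy batch below is illustrative, not from the repo): Config is built from the dataset directory, and Model.forward expects a tuple whose element 0 holds the token ids and element 2 the attention mask, matching the indexing above.

# Illustrative only: drive Model with a fake batch of 2 sequences,
# assuming data/class.txt and a populated ./bert directory exist.
config = Config('.')
model = Model(config).to(config.device)

ids = torch.randint(1, 1000, (2, config.pad_size)).to(config.device)              # fake token ids
lengths = torch.full((2,), config.pad_size, dtype=torch.long).to(config.device)   # element 1, unused by forward
mask = torch.ones(2, config.pad_size, dtype=torch.long).to(config.device)         # all-ones mask: no padding here

logits = model((ids, lengths, mask))   # forward reads x[0] and x[2]
print(logits.size())                   # [2, num_classes]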

bert/run.sh

+3
@@ -0,0 +1,3 @@
export BERT_BASE_DIR=./

transformers bert $BERT_BASE_DIR/bert_model.ckpt $BERT_BASE_DIR/bert_config.json $BERT_BASE_DIR/pytorch_model.bin
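A small sanity check, written here as an assumption rather than a repo script, that the converted checkpoint produced by this command is a readable PyTorch state dict:

# Sketch: confirm pytorch_model.bin loads as a plain state dict of weight tensors.
import torch

state_dict = torch.load('./pytorch_model.bin', map_location='cpu')
print(len(state_dict), 'tensors')        # number of weight tensors in the checkpoint
print(list(state_dict.keys())[:5])       # first few parameter names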

data/class.txt

+10
@@ -0,0 +1,10 @@
finance
realty
stocks
education
science
society
politics
sports
game
entertainment
