# https://github.com/huggingface/transformers

# https://huggingface.co/transformers/quickstart.html
# BERT example

# pip install transformers
# (replaces the older pytorch_transformers package)

import os

import torch
import torch.nn as nn
from transformers import BertConfig, BertModel, BertTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# get_bert_model
# BERT pretrained model files:
#   pytorch_model.bin
#   bert_config.json
#   vocab.txt
bert_path = './bert'
do_lower_case = True

bert_config_file = os.path.join(bert_path, 'bert_config.json')
vocab_file = os.path.join(bert_path, 'vocab.txt')
init_checkpoint = os.path.join(bert_path, 'pytorch_model.bin')

# Load the configuration
bert_config = BertConfig.from_json_file(bert_config_file)

# Load the vocabulary
tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

# Load the model
model_bert = BertModel.from_pretrained(bert_path, config=bert_config)
model_bert.to(device)
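# Optional alternative (a sketch, not part of the original flow): with the standard Hugging Face
# file layout, the tokenizer and model can also be loaded in one call from the directory, or from
# a hub model name such as 'bert-base-chinese' (illustrative name; downloading requires network access):
#   tokenizer = BertTokenizer.from_pretrained(bert_path)
#   model_bert = BertModel.from_pretrained('bert-base-chinese')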


# Tokenize input
text = "乌兹别克斯坦议会立法院主席获连任"
tokenized_text = tokenizer.tokenize(text)
tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']
# Convert tokens to vocabulary indices
# input_ids: a torch.LongTensor of shape [batch_size, sequence_length] holding the token indices in the vocabulary
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define the sentence A and B indices associated with the 1st and 2nd sentences (see the BERT paper)
# segment_ids: an optional torch.LongTensor of shape [batch_size, sequence_length] with token type
# indices in [0, 1]; type 0 corresponds to sentence A, type 1 to sentence B.
segment_ids = [0] * len(input_ids)
# input_mask: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices in [0, 1].
input_mask = [1] * len(input_ids)
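# Sentence-pair sketch (illustration only; these variables are not reused below, and the second
# sentence is just a placeholder): token type ids are 0 for sentence A, including its [SEP],
# and 1 for sentence B.
text_a = "乌兹别克斯坦议会立法院主席获连任"
text_b = "他将继续担任这一职务"
tokens_a = ['[CLS]'] + tokenizer.tokenize(text_a) + ['[SEP]']
tokens_b = tokenizer.tokenize(text_b) + ['[SEP]']
pair_input_ids = tokenizer.convert_tokens_to_ids(tokens_a + tokens_b)
pair_segment_ids = [0] * len(tokens_a) + [1] * len(tokens_b)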

# Convert inputs to PyTorch tensors
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
print("input_ids", input_ids.size())
input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)  # attention_mask; may be omitted
segments_tensors = torch.tensor([segment_ids], dtype=torch.long).to(device)
# Output
all_encoder_layer, pooled_output = model_bert(input_ids, attention_mask=input_mask, token_type_ids=segments_tensors)
# all_encoder_layer: a torch.FloatTensor of shape [batch_size, sequence_length, hidden_size],
# the hidden states of the last encoder layer. (In the older pytorch_pretrained_bert API this was
# a list with the full sequence of hidden states after every attention block: 12 tensors for
# BERT-base, 24 for BERT-large.)
# pooled_output: a torch.FloatTensor of shape [batch_size, hidden_size],
# the output of a classifier pretrained on top of the hidden state of the first token ([CLS]),
# trained on the next-sentence prediction task (see the BERT paper)
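# To get the hidden states of every layer (the old pytorch_pretrained_bert behaviour described
# above), transformers can return them as an additional output when the config sets
# output_hidden_states=True. A minimal sketch, assuming a transformers 2.x-style API:
#   config_all = BertConfig.from_json_file(bert_config_file)
#   config_all.output_hidden_states = True
#   model_all = BertModel.from_pretrained(bert_path, config=config_all)
#   sequence_output, pooled, hidden_states = model_all(input_ids, attention_mask=input_mask)
#   # hidden_states: a tuple of (num_layers + 1) tensors, each [batch_size, seq_len, hidden_size]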

# To get token embeddings, use only all_encoder_layer
print('all_encoder_layer', all_encoder_layer.shape)
print('pooled_output', pooled_output.size())
# For classification, use pooled_output
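# Minimal sketch of both uses (illustration only: the linear head is untrained and
# num_labels = 2 is an assumed value, not from the original script).
# Sentence embedding: mean-pool all_encoder_layer over the real (mask == 1) tokens.
mask = input_mask.unsqueeze(-1).float()                            # [batch, seq_len, 1]
sent_embedding = (all_encoder_layer * mask).sum(1) / mask.sum(1)   # [batch, hidden_size]
# Classification: a linear head on top of pooled_output.
num_labels = 2
classifier = nn.Linear(bert_config.hidden_size, num_labels).to(device)
logits = classifier(pooled_output)                                 # [batch, num_labels]
print('sent_embedding', sent_embedding.size(), 'logits', logits.size())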

# Padding
max_seq_length = 300

text = "乌兹别克斯坦议会立法院主席获连任"
tokenized_text = tokenizer.tokenize(text)
tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
input_mask = [1] * len(input_ids)

padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)
print("padding input_ids", input_ids.size())

model_bert.eval()
with torch.no_grad():
    all_encoder_layer, pooled_output = model_bert(input_ids, attention_mask=input_mask)
    print('padding all_encoder_layer', all_encoder_layer.shape)
    print('padding pooled_output', pooled_output.size())
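
# Batch sketch (illustration only; the sentence list is a placeholder): pad every sequence to the
# same length and stack the ids and masks so the model runs once over the whole batch.
texts = ["乌兹别克斯坦议会立法院主席获连任", "今天天气不错"]
batch_ids, batch_mask = [], []
for t in texts:
    tokens = ['[CLS]'] + tokenizer.tokenize(t) + ['[SEP]']
    ids = tokenizer.convert_tokens_to_ids(tokens)
    pad = [0] * (max_seq_length - len(ids))
    batch_ids.append(ids + pad)
    batch_mask.append([1] * len(ids) + pad)
batch_ids = torch.tensor(batch_ids, dtype=torch.long).to(device)
batch_mask = torch.tensor(batch_mask, dtype=torch.long).to(device)
with torch.no_grad():
    batch_output, batch_pooled = model_bert(batch_ids, attention_mask=batch_mask)
print('batch output', batch_output.shape, 'batch pooled', batch_pooled.size())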