Skip to content

Commit b3bb4a3

Browse files
committed
update
1 parent f432c34 commit b3bb4a3

33 files changed

+25563
-0
lines changed

__init__.py

Whitespace-only changes.

data/lm_dict.pik

12.5 MB
Binary file not shown.

data/train_valid_test.h5

19.2 MB
Binary file not shown.

data/vocab_label.pik

980 KB
Binary file not shown.

data/存在劳动关系_20181022_test.txt

Lines changed: 634 additions & 0 deletions
Large diffs are not rendered by default.

data/存在劳动关系_20181022_test.txt_tokenized.txt

Lines changed: 634 additions & 0 deletions
Large diffs are not rendered by default.

data/存在劳动关系_20181022_train.txt

Lines changed: 10959 additions & 0 deletions
Large diffs are not rendered by default.

data/存在劳动关系_20181022_train.txt_tokenized.txt

Lines changed: 10959 additions & 0 deletions
Large diffs are not rendered by default.

data_util_hdf5.py

Lines changed: 490 additions & 0 deletions
Large diffs are not rendered by default.

evaluation_matrix.py

Lines changed: 256 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,256 @@
1+
# -*- coding: utf-8 -*-
2+
import numpy as np
3+
import random
4+
import codecs
5+
"""
6+
compute single evaluation matrix for task1,task2 and task3:
7+
compute f1 score(micro,macro) for accusation & relevant article, and score for penalty
8+
"""
9+
10+
# small constant added to the denominators in compute_f1 so that they are never zero
small_value=0.00001
11+
# size of the range the debug prints draw from: a print fires only when the random draw equals 1
random_number=500
12+
def compute_confuse_matrix_batch(y_targetlabel_list,y_logits_array,label_dict,name='default'):
    """
    accumulate TP/FP/FN counts over a whole batch by calling compute_confuse_matrix once per example.
    :param y_targetlabel_list: a list; each element is the multi-hot target of one example, e.g. [1,0,0,1,...]
    :param y_logits_array: indexable by example position; row i holds the scores that belong to target i
    :param label_dict: {label:(TP, FP, FN)} counts accumulated so far
    :param name: a string handed on to compute_confuse_matrix for debug purpose
    :return: label_dict: {label:(TP, FP, FN)} as returned by the last call of compute_confuse_matrix
    """
    for row, target_multihot in enumerate(y_targetlabel_list):
        # rows of the score array are looked up by position, so both inputs must be aligned
        scores_row = y_logits_array[row]
        label_dict = compute_confuse_matrix(target_multihot, scores_row, label_dict, name=name)
    return label_dict
24+
25+
def compute_confuse_matrix(y_targetlabel_list_single,y_logit_array_single,label_dict,name='default'):
    """
    update the per-label counts of true positive(TP), false positive(FP) and false negative(FN) for one example.
    :param y_targetlabel_list_single: multi-hot target of one example, like [0,0,1,0,1,...]
    :param y_logit_array_single: scores of one example, one value per class; a class counts as predicted when its score is >=0.5
    :param label_dict: {label:(TP,FP,FN)}. updated in place; it must already hold an entry for every label that shows up
                       in the target or in the prediction, otherwise a KeyError is raised
    :param name: a string used as prefix of the debug print
    :return: label_dict: {label:(TP,FP,FN)}, the same dict object that was passed in
    """
    # 1.get target label and predict label
    y_target_labels = get_target_label_short(y_targetlabel_list_single)  # e.g. [2,12,88]
    y_predict_labels = [i for i, score in enumerate(y_logit_array_single) if score >= 0.50]  # e.g. [2,12,13,10]
    if not y_predict_labels:
        # no score reached the threshold: fall back to the single best scoring class
        y_predict_labels = [np.argmax(y_logit_array_single)]

    # occasional debug print; randrange avoids building a list of random_number elements on every call
    if random.randrange(random_number) == 1:
        print(name+".y_target_labels:",y_target_labels,";y_predict_labels:",y_predict_labels)  # debug purpose

    # 2.count number of TP,FP,FN for each class that is either expected or predicted
    target_set = set(y_target_labels)
    predict_set = set(y_predict_labels)
    for label in target_set | predict_set:  # e.g. label=2
        TP, FP, FN = label_dict[label]
        if label in predict_set:
            if label in target_set:  # predict=1,truth=1 (TP)
                TP = TP + 1
            else:  # predict=1,truth=0 (FP)
                FP = FP + 1
        else:  # predict=0,truth=1 (FN); the label is in the union, so it must be a target
            FN = FN + 1
        label_dict[label] = (TP, FP, FN)
    return label_dict
57+
58+
59+
def compute_penalty_score_batch(target_deaths, predict_deaths,target_lifeimprisons, predict_lifeimprisons,target_imprsions, predict_imprisons):
    """
    compute penalty score(task 3) for a batch: the mean of compute_penalty_score over all examples.
    all six inputs are indexed by example position and must be aligned; the batch size is taken from target_deaths.
    :param target_deaths: a list. element i is the death target of example i
    :param predict_deaths: indexable by example. element i is the death prediction of example i
    :param target_lifeimprisons: a list. element i is the life imprisonment target of example i
    :param predict_lifeimprisons: indexable by example. element i is the life imprisonment prediction of example i
    :param target_imprsions: a list. element i is the imprisonment target of example i
    :param predict_imprisons: indexable by example. element i is the imprisonment prediction of example i
    :return: score_batch: a scalar, average score for that batch; 0.0 when the batch is empty
    """
    length = len(target_deaths)
    if length == 0:
        # an empty batch has no average; return 0.0 instead of dividing by zero
        return 0.0
    score_total = 0.0
    for i in range(length):
        score_total += compute_penalty_score(target_deaths[i], predict_deaths[i],
                                             target_lifeimprisons[i], predict_lifeimprisons[i],
                                             target_imprsions[i], predict_imprisons[i])
    return score_total / float(length)
77+
78+
def compute_penalty_score(target_death, predict_death,target_lifeimprison, predict_lifeimprison,target_imprsion, predict_imprison):
    """
    compute penalty score(task 3) for a single data: mean of the three partial scores, scaled by 100.
    :param target_death: target handed to compute_death_lifeimprisonment_score
    :param predict_death: prediction handed to compute_death_lifeimprisonment_score
    :param target_lifeimprison: target handed to compute_death_lifeimprisonment_score
    :param predict_lifeimprison: prediction handed to compute_death_lifeimprisonment_score
    :param target_imprsion: target handed to compute_imprisonment_score
    :param predict_imprison: prediction handed to compute_imprisonment_score
    :return: score: a scalar,score for this data
    """
    death_part = compute_death_lifeimprisonment_score(target_death, predict_death)
    life_part = compute_death_lifeimprisonment_score(target_lifeimprison, predict_lifeimprison)
    imprison_part = compute_imprisonment_score(target_imprsion, predict_imprison)
    # same summation order as before, then average over the three parts and scale by 100
    mean_of_parts = (death_part + life_part + imprison_part) / 3.0
    return mean_of_parts * (100.0)
94+
95+
def compute_death_lifeimprisonment_score(target,predict):
    """
    compute score for death or life imprisonment: 1.0 when the arg-max position of target equals
    the arg-max position of predict, otherwise 0.0.
    :param target: a list; only the position of its largest value is used
    :param predict: an array; only the position of its largest value is used
    :return: score: a scalar, 1.0 or 0.0
    """
    # keep the inputs untouched and use separate names for the arg-max positions
    target_index = np.argmax(target)
    predict_index = np.argmax(predict)
    # occasional debug print; randrange avoids building a list of random_number elements on every call
    if random.randrange(random_number) == 1:
        print("death_lifeimprisonment_score.target:", target_index, ";predict:", predict_index)
    score = 1.0 if target_index == predict_index else 0.0
    if random.randrange(random_number) == 1:
        print("death_lifeimprisonment_score:",score)
    return score
111+
112+
def compute_imprisonment_score(target_value,predict_value):
    """
    compute imprisonment score from v=|log(predict_value+1)-log(target_value+1)|:
    v<=0.2 -> 1.0, v<=0.4 -> 0.8, v<=0.6 -> 0.6, v<=0.8 -> 0.4, v<=1.0 -> 0.2, anything else -> 0.0
    :param target_value: a scalar
    :param predict_value:a scalar
    :return: score: a scalar
    """
    # occasional debug print; randrange avoids building a list of random_number elements on every call
    if random.randrange(random_number) == 1:
        print("x.imprisonment_score.target_value:",target_value,";predict_value:",predict_value)
    v = np.abs(np.log(predict_value+1.0)-np.log(target_value+1.0))
    # (upper bound of v, score) pairs in ascending order; the first bound that holds wins
    score_steps = ((0.2, 1.0), (0.4, 0.8), (0.6, 0.6), (0.8, 0.4), (1.0, 0.2))
    score = 0.0  # used when v is above every bound (or is not comparable, e.g. nan)
    for upper_bound, step_score in score_steps:
        if v <= upper_bound:
            score = step_score
            break
    if random.randrange(random_number) == 1:
        print("imprisonment_score:",score)
    return score
136+
137+
def compute_micro_macro(label_dict):
    """
    compute both f1 flavours from the same counts.
    :param label_dict: {label:(TP,FP,FN)}
    :return: f1_micro,f1_macro: scalar, scalar (micro is computed first)
    """
    micro_score = compute_f1_micro_use_TFFPFN(label_dict)
    macro_score = compute_f1_macro_use_TFFPFN(label_dict)
    return micro_score, macro_score
146+
147+
def compute_f1_micro_use_TFFPFN(label_dict):
    """
    compute f1_micro: sum TP,FP,FN over all labels first, then compute one f1 from the sums.
    :param label_dict: {label:(TP,FP,FN)}
    :return: f1_micro: a scalar
    """
    tp_sum, fp_sum, fn_sum = compute_TF_FP_FN_micro(label_dict)
    return compute_f1(tp_sum, fp_sum, fn_sum, 'micro')
156+
157+
def compute_f1_macro_use_TFFPFN(label_dict):
    """
    compute f1_macro: the plain average of the per-label f1 scores.
    :param label_dict: {label:(TP,FP,FN)}
    :return: f1_macro: a scalar; 0.0 when label_dict is empty
    """
    num_classes = len(label_dict)
    if num_classes == 0:
        # nothing to average; return 0.0 instead of dividing by zero
        return 0.0
    f1_score_sum = 0.0
    # only the counts are needed here, the label itself plays no role in the average
    for TP, FP, FN in label_dict.values():
        f1_score_sum = f1_score_sum + compute_f1(TP, FP, FN, 'macro')
    return f1_score_sum / float(num_classes)
174+
175+
#[this function is for debug purpose only]
176+
def compute_f1_score_write_for_debug(label_dict,label2index):
    """
    compute f1 score for each label and append the scores, sorted ascending, to 'debug_accuracy.txt'
    in the current working directory. basically you can also use other function to get result.
    :param label_dict: {label:(TP,FP,FN)}
    :param label2index: a dict. key is label name, value is label index; every key of label_dict must appear as a value
    :return: a dict. key is label name, value is f1 score.
    """
    # the reverse lookup does not depend on the loop variable, so build it once up front
    accusation_index2label = {index: label_name for label_name, index in label2index.items()}

    # 1. compute f1 score for each accusation.
    f1score_dict = {}
    for label, (TP, FP, FN) in label_dict.items():
        f1_score_single = compute_f1(TP, FP, FN, 'normal_f1_score')
        f1score_dict[accusation_index2label[label]] = f1_score_single

    # 2. write to file system for debug purpose; lowest f1 score first.
    tuple_list = sorted(f1score_dict.items(), key=lambda x: x[1], reverse=False)
    f1score_file = 'debug_accuracy.txt'
    # the with-block closes the file even when a write fails
    with codecs.open(f1score_file, mode='a', encoding='utf-8') as write_object:
        write_object.write("\n\n")
        for label_name, f1_score in tuple_list:
            write_object.write(label_name+":"+str(f1_score)+"\n")
    return f1score_dict
204+
205+
def compute_f1(TP,FP,FN,compute_type):
    """
    compute f1 from raw counts. small_value is added to every denominator, so all-zero counts give 0.0
    instead of a division by zero.
    :param TP: number of true positives. e.g. 200
    :param FP: number of false positives. e.g. 200
    :param FN: number of false negatives. e.g. 200
    :param compute_type: a string that only shows up in the debug print, e.g. 'micro' or 'macro'
    :return: f1_score: a scalar
    """
    precison = TP/(TP+FP+small_value)
    recall = TP/(TP+FN+small_value)
    f1_score = (2*precison*recall)/(precison+recall+small_value)

    # occasional debug print; uses the module level random_number like the other functions of this file,
    # and randrange avoids building a list on every call
    if random.randrange(random_number) == 1:
        print(compute_type,"precison:",str(precison),";recall:",str(recall),";f1_score:",f1_score)

    return f1_score
220+
221+
def compute_TF_FP_FN_micro(label_dict):
    """
    compute micro TP,FP,FN: the sums of the per-label counts over all labels.
    :param label_dict: a dict. {label:(TP, FP, FN)}
    :return:TP_micro,FP_micro,FN_micro: three floats; all 0.0 when label_dict is empty
    """
    TP_micro, FP_micro, FN_micro = 0.0, 0.0, 0.0
    # only the counts are needed, so iterate over the values and skip the labels
    for TP, FP, FN in label_dict.values():
        TP_micro += TP
        FP_micro += FP
        FN_micro += FN
    return TP_micro, FP_micro, FN_micro
234+
235+
def init_label_dict(num_classes):
    """
    init label dict. this dict will be used to save TP,FP,FN
    :param num_classes: number of labels; keys are 0..num_classes-1
    :return: label_dict: a dict. {label_index:(0,0,0)}
    """
    return {label_index: (0, 0, 0) for label_index in range(num_classes)}
245+
246+
def get_target_label_short(y_mulitihot):
    """
    get target label: the positions of all elements that are equal to 1.
    :param y_mulitihot: [0,0,1,0,1,0,...]
    :return: taget_list.e.g. [2,4]
    """
    return [i for i, element in enumerate(y_mulitihot) if element == 1]

model/__init__.py

Whitespace-only changes.
177 Bytes
Binary file not shown.
3.55 KB
Binary file not shown.
8.21 KB
Binary file not shown.
878 Bytes
Binary file not shown.
843 Bytes
Binary file not shown.
4.31 KB
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

model/base_model.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# -*- coding: utf-8 -*-
2+
import tensorflow as tf
3+
from model.multi_head_attention import MultiHeadAttention
4+
from model.poistion_wise_feed_forward import PositionWiseFeedFoward
5+
from model.layer_norm_residual_conn import LayerNormResidualConnection
6+
class BaseClass(object):
    """
    base class that holds some common fields and the sub layer functions built on top of them.
    """
    def __init__(self,d_model,d_k,d_v,sequence_length,h,batch_size,num_layer=6,decoder_sent_length=None):
        """
        keep the given settings on the instance; nothing else happens here.
        :param d_model: handed to the feed forward and attention sub layers as d_model
        :param d_k: handed to MultiHeadAttention
        :param d_v: handed to MultiHeadAttention
        :param sequence_length: handed to MultiHeadAttention
        :param h: handed to MultiHeadAttention (presumably the number of heads - TODO confirm)
        :param batch_size: stored only
        :param num_layer: stored only
        :param decoder_sent_length: stored only
        """
        self.d_model, self.d_k, self.d_v = d_model, d_k, d_v
        self.sequence_length = sequence_length
        self.h = h
        self.num_layer = num_layer
        self.batch_size = batch_size
        self.decoder_sent_length = decoder_sent_length

    def sub_layer_postion_wise_feed_forward(self, x, layer_index):  # COMMON FUNCTION
        """
        position-wise feed forward sub layer, built with PositionWiseFeedFoward and d_ff=4*d_model.
        :param x: input of the sub layer. expected shape:[batch_size,sequence_length,d_model]
        :param layer_index: index of layer number
        :return: result of position_wise_feed_forward_fn(). expected shape:[batch_size,sequence_length,d_model]
        """
        # the layer index is part of the scope name, to make sure each layer has different parameters.
        scope_name = "sub_layer_postion_wise_feed_forward" + str(layer_index)
        with tf.variable_scope(scope_name):
            feed_forward = PositionWiseFeedFoward(x, layer_index, d_model=self.d_model, d_ff=self.d_model*4)
            feed_forward_output = feed_forward.position_wise_feed_forward_fn()
        return feed_forward_output

    def sub_layer_multi_head_attention(self ,layer_index ,Q ,K_s,V_s,mask=None,is_training=None,dropout_keep_prob=0.9):  # COMMON FUNCTION
        """
        multi head attention as sub layer, built with MultiHeadAttention.
        :param layer_index: index of layer number; part of the variable scope name
        :param Q: expected shape: [batch_size,sequence_length,embed_size]
        :param K_s: expected shape: [batch_size,sequence_length,embed_size]
        :param V_s: handed to MultiHeadAttention unchanged
        :param mask: handed to MultiHeadAttention unchanged. when a mask is used, illegal connections are meant to be
                     masked with a huge negative value so that their possibility becomes zero.
        :param is_training: handed to MultiHeadAttention unchanged
        :param dropout_keep_prob: MultiHeadAttention receives dropout_rate=1.0-dropout_keep_prob
        :return: output of multi head attention. expected shape:[batch_size,sequence_length,d_model]
        """
        scope_name = "base_mode_sub_layer_multi_head_attention_" + str(layer_index)
        with tf.variable_scope(scope_name):
            # the attention class expects a drop rate, while callers of this function pass a keep probability
            drop_rate = (1.0-dropout_keep_prob)
            attention = MultiHeadAttention(Q, K_s, V_s, self.d_model, self.d_k, self.d_v, self.sequence_length, self.h,
                                           is_training=is_training, mask=mask, dropout_rate=drop_rate)
            attention_output = attention.multi_head_attention_fn()
        return attention_output  # [batch_size,sequence_length,d_model]

    def sub_layer_layer_norm_residual_connection(self,layer_input ,layer_output,layer_index,dropout_keep_prob=0.9,use_residual_conn=True,sub_layer_name='layer1'):  # COMMON FUNCTION
        """
        layer norm & residual connection, built with LayerNormResidualConnection.
        :param layer_input: input of the wrapped sub layer. expected shape:[batch_size,sequence_length,d_model]
        :param layer_output: output of the wrapped sub layer. expected shape:[batch_size,sequence_length,d_model]
        :param layer_index: index of layer number; part of the variable scope name
        :param dropout_keep_prob: LayerNormResidualConnection receives residual_dropout=1-dropout_keep_prob
        :param use_residual_conn: handed to LayerNormResidualConnection unchanged
        :param sub_layer_name: suffix of the variable scope name, so several sub layers of one layer get their own scope
        :return: result of layer_norm_residual_connection(). expected shape:[batch_size,sequence_length,d_model]
        """
        scope_name = "sub_layer_layer_norm_residual_connection_" + str(layer_index) + '_' + sub_layer_name
        with tf.variable_scope(scope_name):
            norm_residual = LayerNormResidualConnection(layer_input, layer_output, layer_index,
                                                        residual_dropout=(1-dropout_keep_prob),
                                                        use_residual_conn=use_residual_conn)
            output = norm_residual.layer_norm_residual_connection()
        return output  # [batch_size,sequence_length,d_model]

0 commit comments

Comments
 (0)