ngram_test_generate_txt_file.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import json
import sys

# Python 2 hack: make utf-8 the process-wide default encoding so that the
# implicit str/unicode conversions below do not raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf-8')
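
# This script builds bigram counts over a GB18030-encoded, POS-tagged corpus.
# Judging from the parsing logic below, each line is assumed to follow the
# 1998 People's Daily annotation style: "word/POS" tokens separated by two
# spaces, with '[' opening a compound annotation.  Three tables are filled:
#   token_dictionary  --  "w1 w2"    ordinary word bigrams
#   sen_start_dict    --  "<BOS> w"  sentence-initial bigrams
#   sen_end_dict      --  "w <EOS>"  sentence-final bigrams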


def get_file_list(file_path):
    # List the files in a corpus directory.
    return os.listdir(file_path)


def get_n_gram(dir_path, file_path, token_dictionary, sen_start_dict, sen_end_dict, sen_flag):
    # Scan one corpus file and accumulate bigram counts in place.
    if os.path.isdir(os.path.join(dir_path, file_path)):
        return
    with open(os.path.join(dir_path, file_path)) as f:
        for line in f:
            # Decoding to unicode up front makes everything downstream
            # much simpler.
            line = line.decode('gb18030')
            start = 0
            end = 0
            whitespace = 0
            pre_previous = None  # token two positions back
            previous = None      # token one position back
            current = None       # token just parsed
            pre_pre_is_sentence = False
            pre_is_sentence = True  # the first token of a line opens a sentence
            cur_is_sentence = False
            for i in range(len(line)):
                if line[i] == '[':
                    # Skip the '[' that opens a compound annotation.
                    start = i + 1
                if line[i] == ' ':
                    # Tokens are separated by two spaces, so the next token
                    # starts two characters after the first space.
                    whitespace += 1
                    if whitespace == 1:
                        start = i + 2
                    if whitespace == 2:
                        whitespace = 0
                # Disabled: give the last word of a line an <EOS> bigram even
                # when it is not followed by sentence-final punctuation.
                # if i == len(line) - 1 and line[i-2] not in sen_flag:
                #     current = line[start:len(line) - 2]
                #     if current + " " + "<EOS>" not in sen_end_dict:
                #         sen_end_dict[current + " " + "<EOS>"] = 1
                #     else:
                #         sen_end_dict[current + " " + "<EOS>"] += 1
                if line[i] == '/':
                    # '/' separates a word from its POS tag.
                    end = i
                    if end > start:
                        if line[i-1] in sen_flag:
                            # The word ends with sentence-final punctuation.
                            cur_is_sentence = True
                        current = line[start:end]
                        if previous is not None and current is not None:
                            if previous + " " + current not in token_dictionary:
                                token_dictionary[previous + " " + current] = 1
                            else:
                                token_dictionary[previous + " " + current] += 1
                            if pre_pre_is_sentence is True:
                                # `previous` was the first word of a sentence.
                                if "<BOS>" + " " + previous not in sen_start_dict:
                                    sen_start_dict["<BOS>" + " " + previous] = 1
                                else:
                                    sen_start_dict["<BOS>" + " " + previous] += 1
                        if previous is not None and cur_is_sentence is True and pre_previous is not None:
                            # `current` closes the sentence, so `previous`
                            # was its last word.
                            if previous + " " + "<EOS>" not in sen_end_dict:
                                sen_end_dict[previous + " " + "<EOS>"] = 1
                            else:
                                sen_end_dict[previous + " " + "<EOS>"] += 1
                        # Shift the token window and sentence-boundary flags.
                        pre_previous = previous
                        previous = current
                        pre_pre_is_sentence = pre_is_sentence
                        pre_is_sentence = cur_is_sentence
                        start = end
                        cur_is_sentence = False
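
# A hypothetical worked example (illustration only, not taken from the
# corpus): given the line
#     迈向/v  充满/v  希望/n  。/w
# the loop above counts the word bigrams u"迈向 充满", u"充满 希望" and
# u"希望 。" in token_dictionary, counts u"<BOS> 迈向" in sen_start_dict,
# and, because "。" is in sen_flag, counts u"希望 <EOS>" in sen_end_dict.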


def count_token_from_file_list(dir_path1, dir_path2, dir_path3, files1, files2, files3, token_dictionary,
                               sen_start_dict, sen_end_dict, sen_flag):
    # Accumulate counts over the files of three corpus directories at once.
    for file_path1 in files1:
        get_n_gram(dir_path1, file_path1, token_dictionary, sen_start_dict, sen_end_dict, sen_flag)
    for file_path2 in files2:
        get_n_gram(dir_path2, file_path2, token_dictionary, sen_start_dict, sen_end_dict, sen_flag)
    for file_path3 in files3:
        get_n_gram(dir_path3, file_path3, token_dictionary, sen_start_dict, sen_end_dict, sen_flag)
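
# Example usage, mirroring the per-directory loops in __main__ below:
#   count_token_from_file_list(path1, path2, path3, files1, files2, files3,
#                              token_dictionary, sen_start_dict, sen_end_dict,
#                              sen_flag)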


if __name__ == '__main__':
    path1 = "/Users/datamining/Desktop/CL_4th/test"
    path2 = "/Users/datamining/Desktop/CL_4th/train"
    path3 = "/Users/datamining/Desktop/CL_4th/valid"
    files1 = get_file_list(path1)
    files2 = get_file_list(path2)
    files3 = get_file_list(path3)
    token_dictionary = {}
    sen_start_dict = {}
    sen_end_dict = {}
    # Sentence-final punctuation: full stop, question mark, exclamation
    # mark, colon, semicolon.
    sen_flag = "。?!:;".decode('utf-8')
    for elem in sen_flag:
        print elem
    # count_token_from_file_list(path1, path2, path3, files1, files2, files3,
    #                            token_dictionary, sen_start_dict, sen_end_dict, sen_flag)
    for file_pa in files3:
        get_n_gram(path3, file_pa, token_dictionary, sen_start_dict, sen_end_dict, sen_flag)
    for file_pa in files2:
        get_n_gram(path2, file_pa, token_dictionary, sen_start_dict, sen_end_dict, sen_flag)
    print "len of token_dictionary is:", len(token_dictionary), "\n"
    print "len of sen_start_dict is:", len(sen_start_dict), "\n"
    print "len of sen_end_dict is:", len(sen_end_dict), "\n"
    print json.dumps(sen_start_dict, ensure_ascii=False, encoding='utf-8')
    print json.dumps(sen_end_dict, ensure_ascii=False, encoding='utf-8')
    sen_start_count = sum(sen_start_dict.itervalues())
    sen_end_count = sum(sen_end_dict.itervalues())
    print "sen_start_count is:", sen_start_count, "\n"
    print "sen_end_count is:", sen_end_count, "\n"
    # dict = sorted(token_dictionary.iteritems(), key=lambda d: d[1], reverse=True)
    # print dict[0:100]
    # print json.dumps(token_dictionary, ensure_ascii=False, encoding='utf-8')
    # Sort the boundary bigrams by frequency, most frequent first.
    sen_start = sorted(sen_start_dict.iteritems(), key=lambda d: d[1], reverse=True)
    sen_end = sorted(sen_end_dict.iteritems(), key=lambda d: d[1], reverse=True)
    print sen_start[0:100]
    # Dump one "<bigram> <count>" line per entry; the keys are unicode, so
    # encode explicitly before writing.
    with open('/Users/datamining/Desktop/11.txt', 'w') as f:
        for ite in sen_start:
            f.write((ite[0] + ' ' + str(ite[1]) + '\n').encode('utf-8'))
    with open('/Users/datamining/Desktop/12.txt', 'w') as f:
        for ite in sen_end:
            f.write((ite[0] + ' ' + str(ite[1]) + '\n').encode('utf-8'))
    # Spot-check two specific boundary bigrams.
    temp = ("<BOS>" + " " + "扶贫").decode('utf-8')
    print temp, ":", sen_start_dict[temp]
    temp2 = ("成绩" + " " + "<EOS>").decode('utf-8')
    print temp2, ":", sen_end_dict[temp2]
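

# A minimal sketch (not part of the original script) of turning the collected
# counts into MLE bigram probabilities.  `unigram_counts` is a hypothetical
# table of single-word frequencies that this script does not build; it would
# have to be collected alongside token_dictionary.
def bigram_mle(bigram, bigram_counts, unigram_counts):
    # P(w2 | w1) = count(w1 w2) / count(w1)
    w1 = bigram.split(" ")[0]
    return float(bigram_counts[bigram]) / unigram_counts[w1]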