text_strcutured.py
import logging
import argparse
from collections import defaultdict

import jieba
import numpy as np
import pandas as pd
from gensim import corpora, models

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# the source corpus path is ~/Downloads/cnews/cnews.train.txt
def preprocess_data(f_text, f_stopwords, slic):
    """Tokenize the corpus, drop stopwords and hapaxes, and fit a TF-IDF model."""
    with open(f_text) as f:
        doc = f.readlines()
    # each line is '<label>\t<content>'; keep the content of the first `slic` docs
    documents = [d.strip().split('\t')[1] for d in doc][:slic]
    # remove common words and tokenize
    with open(f_stopwords) as f:
        stoplist = [x.strip() for x in f.readlines()]
    texts = [[word for word in jieba.cut(document) if word not in stoplist]
             for document in documents]
    # remove words that appear only once
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]
    dictionary = corpora.Dictionary(texts)
    # dictionary.save('/tmp/deerwester.dict')
    n_token = len(dictionary.token2id)
    print('total number of tokens:', n_token)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus)
    # to save the transformed corpus to disk:
    # corpora.MmCorpus.serialize('/tmp/corpus.mm', tfidf[corpus])
    return tfidf, n_token, texts, dictionary
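# Illustration (hypothetical toy data, not from the cnews corpus):
# for texts = [['比赛', '精彩'], ['比赛', '新闻']], dictionary.doc2bow(texts[0])
# returns sparse (token_id, count) pairs such as [(0, 1), (1, 1)], and
# tfidf[bow] rescales them into (token_id, tf-idf weight) pairs; main()
# below scatters those pairs into a dense row of the document matrix.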
def main():
    parser = argparse.ArgumentParser(description='add arguments to the program')
    parser.add_argument("--f_text", help="raw corpus file")
    parser.add_argument("--stop_text", help="stopwords file")
    parser.add_argument("--slic", type=int, help="number of documents to keep from the raw corpus")
    args = parser.parse_args()
    if not args.f_text or not args.stop_text or not args.slic:
        # fall back to the built-in defaults when any argument is missing
        doc_path = '/Users/ajmd/Downloads/cnews/cnews.test.txt'
        stopwords_path = '/Users/ajmd/data/stopwords/CNstopwords.txt'
        slic = 500
    else:
        doc_path = args.f_text
        stopwords_path = args.stop_text
        slic = args.slic
    tfidf, n_token, texts, dictionary = preprocess_data(doc_path, stopwords_path, slic)
    # re-vectorize each document in the raw file as a dense TF-IDF row
    mshape = (tfidf.num_docs, n_token)
    real_matrix = np.zeros(shape=mshape)
    for i in range(len(texts)):
        real_vec = real_matrix[i, ]
        vec = dictionary.doc2bow(texts[i])
        vector = tfidf[vec]  # sparse (token_id, tf-idf weight) pairs
        for idx, val in vector:
            real_vec[idx] = val
    print('saving to csv format... with shape of {}'.format(real_matrix.shape))
    # order the header tokens by dictionary id so columns line up with the matrix
    header = np.array([dictionary[i] for i in range(n_token)]).reshape(1, n_token)
    print('shape of header: {}'.format(header.shape))
    final_array = np.vstack((header, real_matrix))
    data = pd.DataFrame(final_array)
    data.to_csv('./data1.csv', encoding='utf-8')
    # alternatively: np.savetxt('./text_feature.csv', final_array, delimiter=',', fmt='%s')
    return data
if __name__ == "__main__":
    main()
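# Example invocation (a sketch; the cnews corpus and CNstopwords.txt paths
# mirror the machine-specific defaults above and are assumptions):
#   python text_strcutured.py --f_text ~/Downloads/cnews/cnews.test.txt \
#       --stop_text ~/data/stopwords/CNstopwords.txt --slic 500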