-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
118 lines (95 loc) · 6.29 KB
/
Copy pathpreprocessing.py
File metadata and controls
118 lines (95 loc) · 6.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os, glob
import re
import numpy as np
import math
import matplotlib.pyplot as plt
import matplotlib.tri as tri
from tempfile import TemporaryFile
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import sklearn
from sklearn.model_selection import train_test_split
from scipy.spatial import distance
from collections import Counter
from ouvertureFichiers import lecture_fichier
# English stop words loaded from NLTK's 'stopwords' corpus; preprocessing()
# discards any token found in this list.
# NOTE(review): the triple-quoted string that follows is a bare expression
# (not assigned to anything), so it has no runtime effect; it looks like an
# earlier hand-written stop-word list kept for reference - confirm before removing.
stop_words_list = stopwords.words('english')
"""stop_words_list = ["a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount", "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as", "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", 
"some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the"]"""
def preprocessing(T, stop_words=None):
    """Tokenise raw documents into lists of lowercase, filtered words.

    Each document is split into maximal runs of ASCII letters. A word is kept
    when it is longer than two characters and its lowercase form is not a
    stop word.

    Args:
        T: iterable of strings, one string per document (the full text of a
            file).
        stop_words: optional iterable of lowercase words to discard. When
            omitted, the module-level ``stop_words_list`` is used, which is
            the original behaviour.

    Returns:
        A list with one entry per document; each entry is the list of kept
        words, lowercased, in order of appearance.
    """
    if stop_words is None:
        stop_words = stop_words_list
    # Build the lookup set once: testing membership against the raw list
    # would rescan the whole list for every single token.
    excluded = set(stop_words)
    find_words = re.compile(r'[a-zA-Z]+').findall

    processed = []
    for texte in T:
        lowered = (mot.lower() for mot in find_words(texte))
        processed.append(
            [mot for mot in lowered if len(mot) > 2 and mot not in excluded]
        )
    return processed
def BoW(m):
    """Build a bag-of-words count matrix and its vocabulary.

    Args:
        m: iterable of documents, each document being an iterable of words.
            It is materialised once, so generators are accepted.

    Returns:
        A tuple ``(X, Dico)``:
        X is a float NumPy array of shape (number of documents, vocabulary
        size) where ``X[d, c]`` is the number of occurrences in document ``d``
        of the word whose column is ``c``;
        Dico maps each word to its column index, numbered in order of first
        appearance across the documents.
    """
    # Two passes are needed (vocabulary, then counts), so copy the input
    # rather than iterating the caller's object twice.
    docs = [list(doc) for doc in m]

    Dico = {}
    for doc in docs:
        for mot in doc:
            if mot not in Dico:
                # The next free column is the current vocabulary size.
                Dico[mot] = len(Dico)

    X = np.zeros((len(docs), len(Dico)))
    for row, doc in enumerate(docs):
        for mot in doc:
            X[row, Dico[mot]] += 1
    return X, Dico
def document_frequency(X):
    """Return, for every column of X, the column total divided by the row count.

    Args:
        X: 2-D matrix (NumPy array or list of rows), one row per document and
            one column per vocabulary word.

    Returns:
        A list of floats, one per column. An empty X yields an empty list.

    NOTE(review): the value is the mean count per document. It equals the
    fraction of documents containing the word only when X is binary
    (0/1 entries); with raw counts it can exceed 1 - confirm which is intended.
    """
    n_docs = len(X)
    if n_docs == 0:
        return []
    # One vectorised column sum instead of a Python-level sum per column.
    totals = np.asarray(X, dtype=float).sum(axis=0)
    return (totals / n_docs).tolist()
def inverse_doc_frequency(X):
    """Return a per-column weight ``(total / n) * log(n / total)``.

    ``total`` is the sum of a column of X and ``n`` is the number of rows.

    Args:
        X: 2-D matrix (NumPy array or list of rows), one row per document and
            one column per vocabulary word.

    Returns:
        A list of floats, one per column. A column whose total is zero gets
        the weight 0.0 instead of triggering a division by zero. An empty X
        yields an empty list.
    """
    n_docs = len(X)
    if n_docs == 0:
        return []
    # Column totals are computed once and reused for both factors.
    totals = np.asarray(X, dtype=float).sum(axis=0).tolist()

    poids = []
    for total in totals:
        if total == 0:
            # Word absent from every document: log(n / 0) is undefined.
            poids.append(0.0)
        else:
            poids.append((total / n_docs) * math.log(n_docs / total))
    return poids
def remove_keys(d, keys):
    """Return a copy of ``d`` without the entries whose key is in ``keys``.

    Args:
        d: source dictionary; it is not modified.
        keys: iterable of hashable keys to drop. Keys absent from ``d`` are
            ignored.

    Returns:
        A new dict holding the remaining items in the same iteration order as
        ``d`` (no round-trip through a set, which would scramble the order).
    """
    to_remove = set(keys)
    return {key: value for key, value in d.items() if key not in to_remove}
def matriceX_Dico_processed(X, D, docFreq, seuil_haut=0.9, min_docs=3):
    """Drop the columns (words) of X whose frequency is too high or too low.

    A column is removed when its entry in ``docFreq`` is strictly greater
    than ``seuil_haut`` or strictly smaller than ``min_docs / len(X)``.

    Args:
        X: 2-D matrix, one row per document and one column per word.
        D: dict mapping each word to its column index in X.
        docFreq: sequence with one frequency per column of X.
        seuil_haut: upper frequency threshold (default 0.9, the value that
            was previously hard-coded).
        min_docs: numerator of the lower threshold (default 3, the value
            that was previously hard-coded).

    Returns:
        A tuple ``(X, D)``: X without the removed columns, and a new dict
        containing only the surviving words, re-indexed so that each word
        maps to its column in the returned X. The input D is not modified.
        When X has no rows, both inputs are returned unchanged.
    """
    n_docs = len(X)
    if n_docs == 0:
        # No documents: the lower threshold min_docs / 0 is undefined.
        return X, D

    freqs = np.asarray(docFreq, dtype=float)
    # float() keeps this a true division even under Python 2 semantics.
    seuil_bas = float(min_docs) / n_docs
    dropped = np.flatnonzero((freqs > seuil_haut) | (freqs < seuil_bas))
    X = np.delete(X, dropped, axis=1)

    # np.delete shifts the remaining columns to the left, so the stored
    # indices must be renumbered or D would point at the wrong columns.
    dropped_set = set(dropped.tolist())
    survivors = sorted(
        (item for item in D.items() if item[1] not in dropped_set),
        key=lambda item: item[1],
    )
    D = {mot: new_col for new_col, (mot, _) in enumerate(survivors)}
    return X, D
def matriceX_Dico_processed_inverse(X, D, invDocFreq, proportion=0.1):
    """Drop the columns (words) of X with the lowest ``invDocFreq`` scores.

    The ``int(len(invDocFreq) * proportion)`` columns with the smallest
    scores are removed.

    Args:
        X: 2-D matrix, one row per document and one column per word.
        D: dict mapping each word to its column index in X.
        invDocFreq: sequence with one score per column of X.
        proportion: fraction of the columns to remove (default 0.1, the
            value that was previously hard-coded).

    Returns:
        A tuple ``(X, D)``: X without the removed columns, and a new dict
        containing only the surviving words, re-indexed so that each word
        maps to its column in the returned X. The input D is not modified.
    """
    scores = np.asarray(invDocFreq)
    n_drop = int(len(scores) * proportion)
    dropped = np.argsort(scores)[:n_drop]
    X = np.delete(X, dropped, axis=1)

    # np.delete shifts the remaining columns to the left, so the stored
    # indices must be renumbered or D would point at the wrong columns.
    dropped_set = set(dropped.tolist())
    survivors = sorted(
        (item for item in D.items() if item[1] not in dropped_set),
        key=lambda item: item[1],
    )
    D = {mot: new_col for new_col, (mot, _) in enumerate(survivors)}
    return X, D
def mots_recurrents(X, D, invDocFreq, proportion=0.1):
    """Return the words with the lowest ``invDocFreq`` scores.

    Args:
        X: unused; kept so existing callers keep working.
        D: dict mapping each word to its column index.
        invDocFreq: sequence with one score per column.
        proportion: fraction of the vocabulary to return (default 0.1, the
            value that was previously hard-coded).

    Returns:
        A list of ``int(len(invDocFreq) * proportion)`` words, ordered by
        increasing score.
    """
    scores = np.asarray(invDocFreq)
    lowest = np.argsort(scores)[:int(len(scores) * proportion)]
    # Order the words by their stored column index rather than trusting the
    # dict's iteration order, which need not follow the column numbering.
    mots_par_colonne = sorted(D, key=D.get)
    return [mots_par_colonne[i] for i in lowest]
def matrice_agreg(X, Y, nbserie, colonne_serie=2):
    """Average the rows of X per series.

    Row ``i`` of X belongs to series ``s`` when ``Y[i, colonne_serie] == s``.

    Args:
        X: 2-D array, one row per document.
        Y: 2-D array whose column ``colonne_serie`` holds the series number
            of each row of X.
        nbserie: number of series; series are numbered 0 .. nbserie - 1.
        colonne_serie: column of Y holding the series number (default 2,
            the value that was previously hard-coded).

    Returns:
        A float array of shape (nbserie, number of columns of X) whose row
        ``s`` is the column-wise mean of the rows of X in series ``s``.
        A series with no rows is left as zeros instead of dividing by zero.
    """
    X = np.asarray(X)
    # The label column does not depend on the series, so read it once.
    labels = np.asarray(Y)[:, colonne_serie]

    Xa = np.zeros((nbserie, X.shape[1]))
    for s in range(nbserie):
        rows = np.where(labels == s)[0]
        if len(rows) > 0:
            Xa[s] = X[rows].mean(axis=0)
    return Xa