-
Notifications
You must be signed in to change notification settings - Fork 24
/
docsim.py
183 lines (136 loc) · 7.59 KB
/
docsim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
"""
Semantic document similarity using Gensim
@author: 4oh4
28/03/2020
This module is based on the Gensim Soft Cosine Tutorial notebook:
https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
"""
from re import sub
import threading
from multiprocessing import cpu_count
import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity
from gensim.models.keyedvectors import Word2VecKeyedVectors
# Import and download the most up-to-date stopwords from NLTK
# from nltk import download
# from nltk.corpus import stopwords
# download('stopwords') # Download stopwords list.
# nltk_stop_words = set(stopwords.words("english"))
# Or use a hard-coded list of English stopwords
nltk_stop_words = {'a','about','above','after','again','against','ain','all','am','an','and','any','are','aren',"aren't",'as','at','be','because','been','before','being','below','between','both','but','by','can','couldn',"couldn't",'d','did','didn',"didn't",'do','does','doesn',"doesn't",'doing','don',"don't",'down','during','each','few','for','from','further','had','hadn',"hadn't",'has','hasn',"hasn't",'have','haven',"haven't",'having','he','her','here','hers','herself','him','himself','his','how','i','if','in','into','is','isn',"isn't",'it',"it's",'its','itself','just','ll','m','ma','me','mightn',"mightn't",'more','most','mustn',"mustn't",'my','myself','needn',"needn't",'no','nor','not','now','o','of','off','on','once','only','or','other','our','ours','ourselves','out','over','own','re','s','same','shan',"shan't",'she',"she's",'should',"should've",'shouldn',"shouldn't",'so','some','such','t','than','that',"that'll",'the','their','theirs','them','themselves','then','there','these','they','this','those','through','to','too','under','until','up','ve','very','was','wasn',"wasn't",'we','were','weren',"weren't",'what','when','where','which','while','who','whom','why','will','with','won',"won't",'wouldn',"wouldn't",'y','you',"you'd","you'll","you're","you've",'your','yours','yourself','yourselves'}
class NotReadyError(Exception):
    """Raised when a similarity query is attempted before the word embedding model has finished loading."""
class DocSim:
    """
    Find documents that are similar to a query string.

    Similarity is calculated using word similarity (Soft Cosine Measure)
    of word embedding vectors.

    Example usage:

    # Use default model (glove-wiki-gigaword-50)
    docsim = DocSim()
    docsim.similarity_query(query_string, documents)

    # Or, specify a preferred, pre-existing Gensim model with custom stopwords and verbose mode
    docsim = DocSim(model='glove-twitter-25', stopwords=['the', 'and', 'are'], verbose=True)
    docsim.similarity_query(query_string, documents)

    # Or, supply a custom pre-initialised model in gensim.models.keyedvectors.Word2VecKeyedVectors format
    docsim = DocSim(model=myModel)
    docsim.similarity_query(query_string, documents)
    """

    # Gensim downloader name of the model used when none is supplied
    default_model = "glove-wiki-gigaword-50"
    # Becomes True once the model and similarity index are built.
    # Only really relevant to the threaded sub-class, where loading is async.
    model_ready = False

    def __init__(self, model=None, stopwords=None, verbose=False):
        """
        Arguments:
            model: a Word2VecKeyedVectors instance, the name of a Gensim
                downloader model (str), or None to use the default model
            stopwords: iterable of stopwords used by preprocess(); defaults
                to the bundled NLTK English stopword list
            verbose: (bool) if True, print progress messages while loading
        """
        self.verbose = verbose
        self._load_model(model)
        # Fall back to the module-level NLTK stopword list when none supplied
        self.stopwords = nltk_stop_words if stopwords is None else stopwords

    def _load_model(self, model):
        # Pass through to _setup_model (overridden in the threaded sub-class
        # to run the long model load in a background thread instead).
        self._setup_model(model)

    def _setup_model(self, model):
        """
        Determine which model to use, download/load it, and create the
        word-embedding similarity index from it.

        Raises:
            ValueError: if `model` is not a Word2VecKeyedVectors, str or None
        """
        if isinstance(model, Word2VecKeyedVectors):
            # Use supplied, pre-initialised model as-is
            self.model = model
        elif isinstance(model, str):
            # Try to download the named model via the Gensim downloader
            if self.verbose:
                print(f'Loading word vector model: {model}')
            self.model = api.load(model)
            if self.verbose:
                print('Model loaded')
        elif model is None:
            # Download/use the default GloVe model
            if self.verbose:
                print(f'Loading default GloVe word vector model: {self.default_model}')
            self.model = api.load(self.default_model)
            if self.verbose:
                print('Model loaded')
        else:
            raise ValueError('Unable to load word vector model')
        self.similarity_index = WordEmbeddingSimilarityIndex(self.model)
        self.model_ready = True

    def preprocess(self, doc: str):
        """
        Clean up an input document string, remove stopwords, and tokenize.

        HTML image tags and URLs are replaced with placeholder tokens
        (" image_token ", " url_token ") so they survive tokenization as
        single units; other HTML-like tags and [img_assist ...] blocks are
        stripped.

        Returns:
            list of lower-cased tokens with stopwords removed
        """
        doc = sub(r'<img[^<>]+(>|$)', " image_token ", doc)
        doc = sub(r'<[^<>]+(>|$)', " ", doc)
        doc = sub(r'\[img_assist[^]]*?\]', " ", doc)
        doc = sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token ", doc)
        # min_len=0 / max_len=inf: keep every token length, unlike the
        # simple_preprocess defaults
        return [token for token in simple_preprocess(doc, min_len=0, max_len=float("inf")) if token not in self.stopwords]

    def _softcossim(self, query: str, documents: list):
        """
        Compute the Soft Cosine Measure between the tokenized query and each
        of the tokenized documents.

        Requires self.dictionary, self.tfidf and self.similarity_matrix to
        have been prepared by similarity_query().

        Returns:
            array of similarity scores, one per document
        """
        query = self.tfidf[self.dictionary.doc2bow(query)]
        index = SoftCosineSimilarity(
            self.tfidf[[self.dictionary.doc2bow(document) for document in documents]],
            self.similarity_matrix)
        similarities = index[query]
        return similarities

    def similarity_query(self, query_string: str, documents: list):
        """
        Run a new similarity ranking of query_string against each of the documents.

        Arguments:
            query_string: (str) the query text
            documents: (list) of string documents to compare query_string against

        Returns:
            list: similarity scores for each of the documents

        Raises:
            NotReadyError: if the word embedding model is not ready/available
            ValueError: if the preprocessed query's vocabulary exactly equals
                the corpus vocabulary (a degenerate case for the ranking)
        """
        if not self.model_ready:
            raise NotReadyError('Word embedding model is not ready.')
        corpus = [self.preprocess(document) for document in documents]
        query = self.preprocess(query_string)
        if set(query) == set(word for document in corpus for word in document):
            raise ValueError('query_string fully overlaps content of document corpus')
        if self.verbose:
            print(f'{len(corpus)} documents loaded into corpus')
        # Dictionary and tf-idf model must cover the query as well as the corpus
        self.dictionary = Dictionary(corpus + [query])
        self.tfidf = TfidfModel(dictionary=self.dictionary)
        # Term similarity matrix built from word embeddings, weighted by tf-idf
        self.similarity_matrix = SparseTermSimilarityMatrix(self.similarity_index,
                                                            self.dictionary, self.tfidf)
        scores = self._softcossim(query, corpus)
        return scores.tolist()
class DocSim_threaded(DocSim):
    """
    Threaded version: loads the model (a long running process) in a background
    thread. Everything else is the same as the standard version.

    Find documents that are similar to a query string.
    Calculated using word similarity (Soft Cosine Similarity) of word embedding vectors.

    Note: `model_ready` stays False until the background load completes, and
    similarity_query() raises NotReadyError while it is False.

    Example usage:
    docsim = DocSim_threaded()
    docsim.similarity_query(query_string, documents)
    """
    def _load_model(self, model):
        """
        Set up the model in a separate thread so construction returns
        immediately while the download/load runs in the background.
        """
        # daemon=True so an unfinished load does not block interpreter exit.
        # Set via the constructor: Thread.setDaemon() is deprecated and was
        # removed in Python 3.13.
        self.thread = threading.Thread(target=self._setup_model, args=[model], daemon=True)
        self.thread.start()