-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexplore-w2v.py
56 lines (40 loc) · 1.59 KB
/
explore-w2v.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 13 11:18:22 2021
@author: cbadenes
"""
import worker.eval as workers
from gensim.models import Word2Vec
import pysolr
#########################################################################################################
## {"doc": "ibc-FGT-2201", "strategy": "l3_h0", "ref-labels": ["D009042"], "inf-labels": ["D061485"]}
#########################################################################################################
doc_id = "ibc-FGT-2201"
model_level = 3
#########################################################################################################
#########################################################################################################
# Create a client instance. The timeout and authentication options are not required.
solr = pysolr.Solr('http://librairy.linkeddata.es/data/mesinesp', always_commit=True, timeout=50)
model = Word2Vec.load("../models/word2vec.model")

# Fetch the document under inspection; stop with a clear message instead of
# an IndexError when the query returns no hits.
articles = solr.search("id:"+doc_id)
if not articles.docs:
    raise SystemExit("document not found in Solr: " + doc_id)
article = articles.docs[0]

# The code below treats 'bow_t' as a space-separated list of "word=freq" tokens.
bow = article['bow_t']

# Expansion terms are collected separately and joined once at the end, so
# every added term is separated from its neighbours by exactly one space
# (appending "term " directly to the string glued the first added term onto
# the last original token).
new_terms = []
for term in bow.split(" "):
    print(">>>>",term)
    if "=" not in term:
        continue
    pieces = term.split("=")
    word, freq = pieces[0], pieces[1]
    try:
        # most_similar raises KeyError for words missing from the vocabulary.
        sims = model.wv.most_similar(word, topn=1)
    except KeyError:
        print("similar words not found for",word)
        continue
    for sim in sims:
        print(sim)
        # sim is a (word, similarity) pair; the neighbour inherits the
        # frequency of the term it was derived from.
        new_term = sim[0] + "=" + freq
        new_terms.append(new_term)
        print("added",new_term)

# Classify the expanded bag of words with the model for the selected level.
bow = " ".join([bow] + new_terms)
topics = workers.get_topics(model_level,bow)
print(topics)