import nltk
import os
import string
from math import log

# This requires nltk to be installed: "pip install nltk".
# The downloads below fetch the English stopword list and the Punkt sentence
# tokenizer; NLTK caches them locally, so they only download once.
nltk.download('stopwords')
nltk.download('punkt')


def answerQuery(query, dataFolder="Data", FILE_MATCH=1, SENT_MATCH=1):
    """ Given 'dataFolder' (a directory of files with information about space
    topics), 'query' (the question the user has asked), 'FILE_MATCH' (the
    number of files to draw sentences from), and 'SENT_MATCH' (the number of
    sentences used to answer the query), return a list of the top-ranked
    answer sentences. """
    # Calculate IDF values across files
    fileDict = readFiles(dataFolder)
    docsMap = {
        filename: tokenize(fileDict[filename])
        for filename in fileDict
    }
    query = set(tokenize(query))
    # Extract sentences from the top files according to tf-idf
    filenames = topFiles(query, docsMap, computeIDFS(docsMap), FILE_MATCH)
    sentences = {}
    for filename in filenames:
        for passage in fileDict[filename].split("\n"):
            for sentence in nltk.sent_tokenize(passage):
                tokens = tokenize(sentence)
                if tokens:
                    sentences[sentence] = tokens
    # Determine the top sentence matches
    return topSentences(query, sentences, computeIDFS(sentences), SENT_MATCH)
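
# Usage sketch (illustrative; assumes a "Data" folder of plain-text files
# about space topics sits next to this script):
#   answerQuery("When did Apollo 11 land on the moon?", SENT_MATCH=2)
# returns the two sentences whose tokens best match the query.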


def readFiles(directory):
    """ Return a map from filename to file contents for every file in the
    given directory. """
    fileDict = {}
    for filename in os.listdir(directory):  # loop through all files
        with open(os.path.join(directory, filename), encoding="utf-8") as fString:
            fileDict[filename] = fString.read()
    return fileDict


def tokenize(doc):
    """ Given a document (as a string), return an ordered list of its words.
    Converts all words to lowercase and removes any punctuation or English
    stopwords. """
    # Tokenize the document so we can iterate through its words
    word_document = nltk.tokenize.word_tokenize(doc.lower())
    punct = string.punctuation
    stop_words = nltk.corpus.stopwords.words('english')
    words = []  # the final list of words
    for word in word_document:
        # keep the word only if it is neither punctuation nor a stopword
        if word not in punct and word not in stop_words:
            words.append(word)
    return words
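
# Example (illustrative): with NLTK's English stopword list,
#   tokenize("The Moon orbits the Earth.")  ->  ['moon', 'orbits', 'earth']
# "the" is dropped as a stopword and "." as punctuation.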


def computeIDFS(docsMap):
    """ Given a map of file names to lists of words (docsMap), return a
    dictionary that maps every word to its IDF value. """
    num_docs = len(docsMap)
    occurrence = {}  # word -> number of documents containing it
    for doc in docsMap:  # loop through every file
        for word in set(docsMap[doc]):
            occurrence[word] = occurrence.get(word, 0) + 1
    IDFS = {}  # find the IDF value of each word
    for word in occurrence:
        IDFS[word] = log(num_docs / occurrence[word])
    return IDFS
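
# Worked example (illustrative): with three documents where "apollo" appears
# in exactly one, its IDF is log(3/1) ~= 1.10; a word that appears in all
# three gets log(3/3) = 0, i.e. no discriminating power.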


def topFiles(query, docsMap, IDFS, n):
    """ Given a 'query' (a set of words), 'docsMap' (a dictionary mapping
    names of files to a list of their words), and 'IDFS' (a dictionary
    mapping words to their IDF values), return a list of the filenames of
    the 'n' top files that match the query, ranked according to tf-idf. """
    fileIDFS = {}
    for doc in docsMap:
        fileIDFS[doc] = 0
        for word in query:
            tf = docsMap[doc].count(word)  # term frequency in this file
            if word in IDFS:
                fileIDFS[doc] += IDFS[word] * tf
    rank = sorted(docsMap, key=lambda doc: fileIDFS[doc], reverse=True)
    return rank[:n]  # the top n files
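
# Scoring sketch (illustrative): for the query {"moon", "landing"}, a file in
# which "moon" occurs 4 times (IDF 0.5) and "landing" twice (IDF 1.1) scores
# 4*0.5 + 2*1.1 = 4.2; files are ranked by this sum and the top n returned.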


def topSentences(query, sentences, IDFS, n):
    """ Given a 'query' (a set of words), 'sentences' (a dictionary mapping
    sentences to a list of their words), and 'IDFS' (a dictionary mapping words
    to their IDF values), return a list of the 'n' top sentences that match
    the query, ranked according to IDF. Ties are broken in favor of sentences
    with a higher query term density. """
    sentTraits = {}  # sentence -> [total IDF, query term density]
    for sentence in sentences:
        sentTraits[sentence] = [0, 0]
        length = len(sentences[sentence])
        for word in query:  # total the IDF and query matches in each sentence
            # if the query word exists in the sentence
            if word in sentences[sentence]:
                sentTraits[sentence][0] += IDFS[word]
                sentTraits[sentence][1] += sentences[sentence].count(word)
        sentTraits[sentence][1] /= length  # matches per word = query term density
    sorted_sentences = sorted(sentTraits.keys(), key=lambda sentence: (
        sentTraits[sentence][0], sentTraits[sentence][1]), reverse=True)
    return sorted_sentences[:n]  # return the top n sentences
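
# Tie-break sketch (illustrative): if two sentences match the same query words
# and so have equal total IDF, the shorter one has the higher query term
# density (matches / sentence length) and sorts first under the (IDF, density) key.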


if __name__ == "__main__":
    print(answerQuery(input("Enter a query: "))[0])