forked from sd16spring/ToolBox-WordFrequency
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfrequency.py
More file actions
54 lines (44 loc) · 2.31 KB
/
frequency.py
File metadata and controls
54 lines (44 loc) · 2.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""
Author = Cedric Kim
Analyzes the word frequencies in a Tale of Two Cities """
import string
from string import maketrans
#import TaleofTwoCities.txt
def get_word_list(file_name):
"""reads the specified project Gutenberg book. Header comments,
punctuation, and whitespace are stripped away. The function
returns a list of the words used in the book as a list.
All words are converted to lower case."""
f = open(file_name, 'r')
lines = f.readlines()
end_line = 0
start_line = 0
while lines[start_line].find('START OF THIS PROJECT GUTENBERG EBOOK') == -1: ##finds the line which this is located
start_line += 1
while lines[end_line].find('END OF THIS PROJECT GUTENBERG EBOOK') == -1: ##fined the line which end is located
end_line +=1
lines = lines[start_line+1:end_line-2] ##takes the rest of the book
s = ''
word_string = s.join(lines) ##joins the list into a long string
for char in string.punctuation:
word_string = word_string.replace(char, '') ##removes all puncuation
word_list = word_string.split() ##splits the string into a list of words
word_list = [word.lower() for word in word_list] ##changes uppercase into lowercase
return word_list
def get_top_n_words(word_list, n):
"""Takes a list of words as input and returns a list of the n most frequently
occurring words ordered from most to least frequently occurring.
word_list: a list of words (assumed to all be in lower case with no
punctuation
n: the number of words to return
returns: a list of n most frequently occurring words ordered from most
frequently to least frequentlyoccurrin"""
word_count = dict() ##create a new dictionary
for word in word_list:
word_count[word] = 1 + word_count.get(word, 0) ##creates key value pair for the words based on their frequency
ordered_by_frequency = sorted(word_count, key=word_count.get, reverse=True) ##sorts the words
return ordered_by_frequency[:n] ##takes the n most frequent words
if __name__ == '__main__':
word_list = get_word_list('TaleofTwoCities.txt')
top_frequency_words = get_top_n_words(word_list, 100)
print top_frequency_words