Commit ca5ce3d

Author: Karan
Commit message: code files added
1 parent 87919b2 commit ca5ce3d


247 files changed: 218482 additions and 0 deletions
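
Note on prerequisites (a hedged sketch, not part of the commit itself): the scripts below rely on several NLTK corpora and models that are not bundled with the library. A one-time download along the following lines is assumed before running them; the exact set of packages needed depends on which scripts you run.

import nltk

# Corpora and models used across the Chapter 10 scripts
for package in ['brown', 'names', 'movie_reviews', 'punkt', 'wordnet', 'stopwords']:
    nltk.download(package)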


Chapter 10/code/bag_of_words.py

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import brown
from text_chunker import chunker

# Read the data from the Brown corpus
input_data = ' '.join(brown.words()[:5400])

# Number of words in each chunk
chunk_size = 800

text_chunks = chunker(input_data, chunk_size)

# Convert to dict items
chunks = []
for count, chunk in enumerate(text_chunks):
    d = {'index': count, 'text': chunk}
    chunks.append(d)

# Extract the document term matrix
count_vectorizer = CountVectorizer(min_df=7, max_df=20)
document_term_matrix = count_vectorizer.fit_transform([chunk['text'] for chunk in chunks])

# Extract the vocabulary and display it
vocabulary = np.array(count_vectorizer.get_feature_names())
print("\nVocabulary:\n", vocabulary)

# Generate names for chunks
chunk_names = []
for i in range(len(text_chunks)):
    chunk_names.append('Chunk-' + str(i+1))

# Print the document term matrix
print("\nDocument term matrix:")
formatted_text = '{:>12}' * (len(chunk_names) + 1)
print('\n', formatted_text.format('Word', *chunk_names), '\n')
for word, item in zip(vocabulary, document_term_matrix.T):
    # 'item' is a 'csr_matrix' data structure
    output = [word] + [str(freq) for freq in item.data]
    print(formatted_text.format(*output))
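
Compatibility note (not part of the commit): newer scikit-learn releases removed CountVectorizer.get_feature_names() in favour of get_feature_names_out(). If the script fails at that line, a hedged fallback such as the following keeps it running on both old and new versions.

# Version-tolerant vocabulary extraction (sketch)
try:
    vocabulary = np.array(count_vectorizer.get_feature_names_out())
except AttributeError:
    vocabulary = np.array(count_vectorizer.get_feature_names())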

Chapter 10/code/category_predictor.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Define the category map
category_map = {'talk.politics.misc': 'Politics', 'rec.autos': 'Autos',
        'rec.sport.hockey': 'Hockey', 'sci.electronics': 'Electronics',
        'sci.med': 'Medicine'}

# Get the training dataset
training_data = fetch_20newsgroups(subset='train',
        categories=category_map.keys(), shuffle=True, random_state=5)

# Build a count vectorizer and extract term counts
count_vectorizer = CountVectorizer()
train_tc = count_vectorizer.fit_transform(training_data.data)
print("\nDimensions of training data:", train_tc.shape)

# Create the tf-idf transformer
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_tc)

# Define test data
input_data = [
    'You need to be careful with cars when you are driving on slippery roads',
    'A lot of devices can be operated wirelessly',
    'Players need to be careful when they are close to goal posts',
    'Political debates help us understand the perspectives of both sides'
]

# Train a Multinomial Naive Bayes classifier
classifier = MultinomialNB().fit(train_tfidf, training_data.target)

# Transform input data using count vectorizer
input_tc = count_vectorizer.transform(input_data)

# Transform vectorized data using tfidf transformer
input_tfidf = tfidf.transform(input_tc)

# Predict the output categories
predictions = classifier.predict(input_tfidf)

# Print the outputs
for sent, category in zip(input_data, predictions):
    print('\nInput:', sent, '\nPredicted category:',
        category_map[training_data.target_names[category]])
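
An optional follow-up sketch (an assumption, not in the original script): the same pipeline can be scored on the held-out 20 Newsgroups test split to get a quantitative accuracy figure alongside the example predictions.

# Evaluate on the official test subset (sketch)
test_data = fetch_20newsgroups(subset='test',
        categories=list(category_map.keys()), shuffle=True, random_state=5)
test_tc = count_vectorizer.transform(test_data.data)
test_tfidf = tfidf.transform(test_tc)
print('Accuracy on the test subset:', classifier.score(test_tfidf, test_data.target))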

Chapter 10/code/data.txt

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
The Roman empire expanded very rapidly and it was the biggest empire in the world for a long time.
An algebraic structure is a set with one or more finitary operations defined on it that satisfies a list of axioms.
Renaissance started as a cultural movement in Italy in the Late Medieval period and later spread to the rest of Europe.
The line of demarcation between prehistoric and historical times is crossed when people cease to live only in the present.
Mathematicians seek out patterns and use them to formulate new conjectures.
A notational symbol that represents a number is called a numeral in mathematics.
The process of extracting the underlying essence of a mathematical concept is called abstraction.
Historically, people have frequently waged wars against each other in order to expand their empires.
Ancient history indicates that various outside influences have helped formulate the culture and traditions of Eastern Europe.
Mappings between sets which preserve structures are of special interest in many fields of mathematics.

Chapter 10/code/gender_identifier.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
import random

from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import names

# Extract last N letters from the input word
# and that will act as our "feature"
def extract_features(word, N=2):
    last_n_letters = word[-N:]
    return {'feature': last_n_letters.lower()}

if __name__=='__main__':
    # Create training data using labeled names available in NLTK
    male_list = [(name, 'male') for name in names.words('male.txt')]
    female_list = [(name, 'female') for name in names.words('female.txt')]
    data = (male_list + female_list)

    # Seed the random number generator
    random.seed(5)

    # Shuffle the data
    random.shuffle(data)

    # Create test data
    input_names = ['Alexander', 'Danielle', 'David', 'Cheryl']

    # Define the number of samples used for train and test
    num_train = int(0.8 * len(data))

    # Iterate through different lengths to compare the accuracy
    for i in range(1, 6):
        print('\nNumber of end letters:', i)
        features = [(extract_features(n, i), gender) for (n, gender) in data]
        train_data, test_data = features[:num_train], features[num_train:]
        classifier = NaiveBayesClassifier.train(train_data)

        # Compute the accuracy of the classifier
        accuracy = round(100 * nltk_accuracy(classifier, test_data), 2)
        print('Accuracy = ' + str(accuracy) + '%')

        # Predict outputs for input names using the trained classifier model
        for name in input_names:
            print(name, '==>', classifier.classify(extract_features(name, i)))
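
A small usage sketch (an assumption, not in the original file): once the loop finishes, the classifier left in scope is the one trained on the last setting (five end letters), so it can be reused directly on any new name.

# Classify a name that was not in the test list (sketch)
print(classifier.classify(extract_features('Jordan', 5)))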

Chapter 10/code/lemmatizer.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
from nltk.stem import WordNetLemmatizer

input_words = ['writing', 'calves', 'be', 'branded', 'horse', 'randomize',
        'possibly', 'provision', 'hospital', 'kept', 'scratchy', 'code']

# Create lemmatizer object
lemmatizer = WordNetLemmatizer()

# Create a list of lemmatizer names for display
lemmatizer_names = ['NOUN LEMMATIZER', 'VERB LEMMATIZER']
formatted_text = '{:>24}' * (len(lemmatizer_names) + 1)
print('\n', formatted_text.format('INPUT WORD', *lemmatizer_names),
        '\n', '='*75)

# Lemmatize each word and display the output
for word in input_words:
    output = [word, lemmatizer.lemmatize(word, pos='n'),
            lemmatizer.lemmatize(word, pos='v')]
    print(formatted_text.format(*output))
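
A minor extension sketch (an assumption, not part of the commit): WordNetLemmatizer also accepts adjective ('a') and adverb ('r') part-of-speech tags, which can be added as extra columns in the same way.

# Lemmatize with other WordNet POS tags (sketch)
print(lemmatizer.lemmatize('scratchy', pos='a'))
print(lemmatizer.lemmatize('possibly', pos='r'))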

Chapter 10/code/sentiment_analyzer.py

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy

# Extract features from the input list of words
def extract_features(words):
    return dict([(word, True) for word in words])

if __name__=='__main__':
    # Load the reviews from the corpus
    fileids_pos = movie_reviews.fileids('pos')
    fileids_neg = movie_reviews.fileids('neg')

    # Extract the features from the reviews
    features_pos = [(extract_features(movie_reviews.words(
            fileids=[f])), 'Positive') for f in fileids_pos]
    features_neg = [(extract_features(movie_reviews.words(
            fileids=[f])), 'Negative') for f in fileids_neg]

    # Define the train and test split (80% and 20%)
    threshold = 0.8
    num_pos = int(threshold * len(features_pos))
    num_neg = int(threshold * len(features_neg))

    # Create training and test datasets
    features_train = features_pos[:num_pos] + features_neg[:num_neg]
    features_test = features_pos[num_pos:] + features_neg[num_neg:]

    # Print the number of datapoints used
    print('\nNumber of training datapoints:', len(features_train))
    print('Number of test datapoints:', len(features_test))

    # Train a Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(features_train)
    print('\nAccuracy of the classifier:', nltk_accuracy(
            classifier, features_test))

    N = 15
    print('\nTop ' + str(N) + ' most informative words:')
    for i, item in enumerate(classifier.most_informative_features()):
        print(str(i+1) + '. ' + item[0])
        if i == N - 1:
            break

    # Test input movie reviews
    input_reviews = [
        'The costumes in this movie were great',
        'I think the story was terrible and the characters were very weak',
        'People say that the director of the movie is amazing',
        'This is such an idiotic movie. I will not recommend it to anyone.'
    ]

    print("\nMovie review predictions:")
    for review in input_reviews:
        print("\nReview:", review)

        # Compute the probabilities
        probabilities = classifier.prob_classify(extract_features(review.split()))

        # Pick the maximum value
        predicted_sentiment = probabilities.max()

        # Print outputs
        print("Predicted sentiment:", predicted_sentiment)
        print("Probability:", round(probabilities.prob(predicted_sentiment), 2))
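
A hedged alternative to the manual loop above: NLTK's NaiveBayesClassifier can print the informative features together with their likelihood ratios in a single built-in call.

# Equivalent built-in report (sketch)
classifier.show_most_informative_features(N)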

Chapter 10/code/stemmer.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

input_words = ['writing', 'calves', 'be', 'branded', 'horse', 'randomize',
        'possibly', 'provision', 'hospital', 'kept', 'scratchy', 'code']

# Create various stemmer objects
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')

# Create a list of stemmer names for display
stemmer_names = ['PORTER', 'LANCASTER', 'SNOWBALL']
formatted_text = '{:>16}' * (len(stemmer_names) + 1)
print('\n', formatted_text.format('INPUT WORD', *stemmer_names),
        '\n', '='*68)

# Stem each word and display the output
for word in input_words:
    output = [word, porter.stem(word),
            lancaster.stem(word), snowball.stem(word)]
    print(formatted_text.format(*output))
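
An optional sketch (assumption): the Snowball stemmer is multilingual, and the language names it supports can be inspected before constructing one.

# Languages bundled with the Snowball stemmer (sketch)
print(SnowballStemmer.languages)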

Chapter 10/code/text_chunker.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
import numpy as np
from nltk.corpus import brown

# Split the input text into chunks, where
# each chunk contains N words
def chunker(input_data, N):
    input_words = input_data.split(' ')
    output = []

    cur_chunk = []
    count = 0
    for word in input_words:
        cur_chunk.append(word)
        count += 1
        if count == N:
            output.append(' '.join(cur_chunk))
            count, cur_chunk = 0, []

    output.append(' '.join(cur_chunk))

    return output

if __name__=='__main__':
    # Read the first 12000 words from the Brown corpus
    input_data = ' '.join(brown.words()[:12000])

    # Define the number of words in each chunk
    chunk_size = 700

    chunks = chunker(input_data, chunk_size)
    print('\nNumber of text chunks =', len(chunks), '\n')
    for i, chunk in enumerate(chunks):
        print('Chunk', i+1, '==>', chunk[:50])
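
A minimal usage sketch (illustrative input, not from the commit): chunker works on any whitespace-separated string, returning N-word pieces plus a shorter final piece for the remainder. Note that when the word count is an exact multiple of N, the final unconditional append adds an empty trailing chunk; adding an 'if cur_chunk:' guard would avoid that edge case.

from text_chunker import chunker

print(chunker('one two three four five', 2))
# Expected output: ['one two', 'three four', 'five']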

Chapter 10/code/tokenizer.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
from nltk.tokenize import sent_tokenize, \
        word_tokenize, WordPunctTokenizer

# Define input text
input_text = "Do you know how tokenization works? It's actually quite interesting! Let's analyze a couple of sentences and figure it out."

# Sentence tokenizer
print("\nSentence tokenizer:")
print(sent_tokenize(input_text))

# Word tokenizer
print("\nWord tokenizer:")
print(word_tokenize(input_text))

# WordPunct tokenizer
print("\nWord punct tokenizer:")
print(WordPunctTokenizer().tokenize(input_text))

Chapter 10/code/topic_modeler.py

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim import models, corpora

# Load input data
def load_data(input_file):
    data = []
    with open(input_file, 'r') as f:
        for line in f.readlines():
            data.append(line[:-1])

    return data

# Processor function for tokenizing, removing stop
# words, and stemming
def process(input_text):
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Create a Snowball stemmer
    stemmer = SnowballStemmer('english')

    # Get the list of stop words
    stop_words = stopwords.words('english')

    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words
    tokens = [x for x in tokens if not x in stop_words]

    # Perform stemming on the tokenized words
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return tokens_stemmed

if __name__=='__main__':
    # Load input data
    data = load_data('data.txt')

    # Create a list for sentence tokens
    tokens = [process(x) for x in data]

    # Create a dictionary based on the sentence tokens
    dict_tokens = corpora.Dictionary(tokens)

    # Create a document-term matrix
    doc_term_mat = [dict_tokens.doc2bow(token) for token in tokens]

    # Define the number of topics for the LDA model
    num_topics = 2

    # Generate the LDA model
    ldamodel = models.ldamodel.LdaModel(doc_term_mat,
            num_topics=num_topics, id2word=dict_tokens, passes=25)

    num_words = 5
    print('\nTop ' + str(num_words) + ' contributing words to each topic:')
    for item in ldamodel.print_topics(num_topics=num_topics, num_words=num_words):
        print('\nTopic', item[0])

        # Print the contributing words along with their relative contributions
        list_of_strings = item[1].split(' + ')
        for text in list_of_strings:
            weight = text.split('*')[0]
            word = text.split('*')[1]
            print(word, '==>', str(round(float(weight) * 100, 2)) + '%')
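
One parsing caveat (an assumption about the installed gensim version): print_topics() typically renders each term as weight*"word", so the word extracted above keeps its surrounding quotes. If cleaner output is wanted, they can be stripped.

# Hypothetical cleanup of the extracted term (sketch)
word = text.split('*')[1].strip().strip('"')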
