Commit ca5ce3d

Author: Karan
Commit message: code files added
1 parent 87919b2 commit ca5ce3d


247 files changed: 218482 additions and 0 deletions
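
Note on prerequisites (a hedged sketch, not part of the commit itself): the scripts below rely on several NLTK corpora and models that are not bundled with the library. A one-time download along the following lines is assumed before running them; the exact set of packages needed depends on which scripts you run.

import nltk

# Corpora and models used across the Chapter 10 scripts
for package in ['brown', 'names', 'movie_reviews', 'punkt', 'wordnet', 'stopwords']:
    nltk.download(package)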


Chapter 10/code/bag_of_words.py

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import brown
from text_chunker import chunker

# Read the data from the Brown corpus
input_data = ' '.join(brown.words()[:5400])

# Number of words in each chunk
chunk_size = 800

text_chunks = chunker(input_data, chunk_size)

# Convert to dict items
chunks = []
for count, chunk in enumerate(text_chunks):
    d = {'index': count, 'text': chunk}
    chunks.append(d)

# Extract the document term matrix
count_vectorizer = CountVectorizer(min_df=7, max_df=20)
document_term_matrix = count_vectorizer.fit_transform([chunk['text'] for chunk in chunks])

# Extract the vocabulary and display it
vocabulary = np.array(count_vectorizer.get_feature_names())
print("\nVocabulary:\n", vocabulary)

# Generate names for chunks
chunk_names = []
for i in range(len(text_chunks)):
    chunk_names.append('Chunk-' + str(i+1))

# Print the document term matrix
print("\nDocument term matrix:")
formatted_text = '{:>12}' * (len(chunk_names) + 1)
print('\n', formatted_text.format('Word', *chunk_names), '\n')
for word, item in zip(vocabulary, document_term_matrix.T):
    # 'item' is a 'csr_matrix' data structure
    output = [word] + [str(freq) for freq in item.data]
    print(formatted_text.format(*output))
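
Compatibility note (not part of the commit): newer scikit-learn releases removed CountVectorizer.get_feature_names() in favour of get_feature_names_out(). If the script fails at that line, a hedged fallback such as the following keeps it running on both old and new versions.

# Version-tolerant vocabulary extraction (sketch)
try:
    vocabulary = np.array(count_vectorizer.get_feature_names_out())
except AttributeError:
    vocabulary = np.array(count_vectorizer.get_feature_names())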

Chapter 10/code/category_predictor.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Define the category map
category_map = {'talk.politics.misc': 'Politics', 'rec.autos': 'Autos',
        'rec.sport.hockey': 'Hockey', 'sci.electronics': 'Electronics',
        'sci.med': 'Medicine'}

# Get the training dataset
training_data = fetch_20newsgroups(subset='train',
        categories=category_map.keys(), shuffle=True, random_state=5)

# Build a count vectorizer and extract term counts
count_vectorizer = CountVectorizer()
train_tc = count_vectorizer.fit_transform(training_data.data)
print("\nDimensions of training data:", train_tc.shape)

# Create the tf-idf transformer
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_tc)

# Define test data
input_data = [
    'You need to be careful with cars when you are driving on slippery roads',
    'A lot of devices can be operated wirelessly',
    'Players need to be careful when they are close to goal posts',
    'Political debates help us understand the perspectives of both sides'
]

# Train a Multinomial Naive Bayes classifier
classifier = MultinomialNB().fit(train_tfidf, training_data.target)

# Transform input data using count vectorizer
input_tc = count_vectorizer.transform(input_data)

# Transform vectorized data using tfidf transformer
input_tfidf = tfidf.transform(input_tc)

# Predict the output categories
predictions = classifier.predict(input_tfidf)

# Print the outputs
for sent, category in zip(input_data, predictions):
    print('\nInput:', sent, '\nPredicted category:',
        category_map[training_data.target_names[category]])
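
An optional follow-up sketch (an assumption, not in the original script): the same pipeline can be scored on the held-out 20 Newsgroups test split to get a quantitative accuracy figure alongside the example predictions.

# Evaluate on the official test subset (sketch)
test_data = fetch_20newsgroups(subset='test',
        categories=list(category_map.keys()), shuffle=True, random_state=5)
test_tc = count_vectorizer.transform(test_data.data)
test_tfidf = tfidf.transform(test_tc)
print('Accuracy on the test subset:', classifier.score(test_tfidf, test_data.target))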

Chapter 10/code/data.txt

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
The Roman empire expanded very rapidly and it was the biggest empire in the world for a long time.
An algebraic structure is a set with one or more finitary operations defined on it that satisfies a list of axioms.
Renaissance started as a cultural movement in Italy in the Late Medieval period and later spread to the rest of Europe.
The line of demarcation between prehistoric and historical times is crossed when people cease to live only in the present.
Mathematicians seek out patterns and use them to formulate new conjectures.
A notational symbol that represents a number is called a numeral in mathematics.
The process of extracting the underlying essence of a mathematical concept is called abstraction.
Historically, people have frequently waged wars against each other in order to expand their empires.
Ancient history indicates that various outside influences have helped formulate the culture and traditions of Eastern Europe.
Mappings between sets which preserve structures are of special interest in many fields of mathematics.

Chapter 10/code/gender_identifier.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
import random

from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import names

# Extract last N letters from the input word
# and that will act as our "feature"
def extract_features(word, N=2):
    last_n_letters = word[-N:]
    return {'feature': last_n_letters.lower()}

if __name__=='__main__':
    # Create training data using labeled names available in NLTK
    male_list = [(name, 'male') for name in names.words('male.txt')]
    female_list = [(name, 'female') for name in names.words('female.txt')]
    data = (male_list + female_list)

    # Seed the random number generator
    random.seed(5)

    # Shuffle the data
    random.shuffle(data)

    # Create test data
    input_names = ['Alexander', 'Danielle', 'David', 'Cheryl']

    # Define the number of samples used for train and test
    num_train = int(0.8 * len(data))

    # Iterate through different lengths to compare the accuracy
    for i in range(1, 6):
        print('\nNumber of end letters:', i)
        features = [(extract_features(n, i), gender) for (n, gender) in data]
        train_data, test_data = features[:num_train], features[num_train:]
        classifier = NaiveBayesClassifier.train(train_data)

        # Compute the accuracy of the classifier
        accuracy = round(100 * nltk_accuracy(classifier, test_data), 2)
        print('Accuracy = ' + str(accuracy) + '%')

        # Predict outputs for input names using the trained classifier model
        for name in input_names:
            print(name, '==>', classifier.classify(extract_features(name, i)))
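
A small usage sketch (an assumption, not in the original file): once the loop finishes, the classifier left in scope is the one trained on the last setting (five end letters), so it can be reused directly on any new name.

# Classify a name that was not in the test list (sketch)
print(classifier.classify(extract_features('Jordan', 5)))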

Chapter 10/code/lemmatizer.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
from nltk.stem import WordNetLemmatizer

input_words = ['writing', 'calves', 'be', 'branded', 'horse', 'randomize',
        'possibly', 'provision', 'hospital', 'kept', 'scratchy', 'code']

# Create lemmatizer object
lemmatizer = WordNetLemmatizer()

# Create a list of lemmatizer names for display
lemmatizer_names = ['NOUN LEMMATIZER', 'VERB LEMMATIZER']
formatted_text = '{:>24}' * (len(lemmatizer_names) + 1)
print('\n', formatted_text.format('INPUT WORD', *lemmatizer_names),
        '\n', '='*75)

# Lemmatize each word and display the output
for word in input_words:
    output = [word, lemmatizer.lemmatize(word, pos='n'),
            lemmatizer.lemmatize(word, pos='v')]
    print(formatted_text.format(*output))
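
A minor extension sketch (an assumption, not part of the commit): WordNetLemmatizer also accepts adjective ('a') and adverb ('r') part-of-speech tags, which can be added as extra columns in the same way.

# Lemmatize with other WordNet POS tags (sketch)
print(lemmatizer.lemmatize('scratchy', pos='a'))
print(lemmatizer.lemmatize('possibly', pos='r'))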

Chapter 10/code/sentiment_analyzer.py

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy

# Extract features from the input list of words
def extract_features(words):
    return dict([(word, True) for word in words])

if __name__=='__main__':
    # Load the reviews from the corpus
    fileids_pos = movie_reviews.fileids('pos')
    fileids_neg = movie_reviews.fileids('neg')

    # Extract the features from the reviews
    features_pos = [(extract_features(movie_reviews.words(
            fileids=[f])), 'Positive') for f in fileids_pos]
    features_neg = [(extract_features(movie_reviews.words(
            fileids=[f])), 'Negative') for f in fileids_neg]

    # Define the train and test split (80% and 20%)
    threshold = 0.8
    num_pos = int(threshold * len(features_pos))
    num_neg = int(threshold * len(features_neg))

    # Create training and test datasets
    features_train = features_pos[:num_pos] + features_neg[:num_neg]
    features_test = features_pos[num_pos:] + features_neg[num_neg:]

    # Print the number of datapoints used
    print('\nNumber of training datapoints:', len(features_train))
    print('Number of test datapoints:', len(features_test))

    # Train a Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(features_train)
    print('\nAccuracy of the classifier:', nltk_accuracy(
            classifier, features_test))

    N = 15
    print('\nTop ' + str(N) + ' most informative words:')
    for i, item in enumerate(classifier.most_informative_features()):
        print(str(i+1) + '. ' + item[0])
        if i == N - 1:
            break

    # Test input movie reviews
    input_reviews = [
        'The costumes in this movie were great',
        'I think the story was terrible and the characters were very weak',
        'People say that the director of the movie is amazing',
        'This is such an idiotic movie. I will not recommend it to anyone.'
    ]

    print("\nMovie review predictions:")
    for review in input_reviews:
        print("\nReview:", review)

        # Compute the probabilities
        probabilities = classifier.prob_classify(extract_features(review.split()))

        # Pick the maximum value
        predicted_sentiment = probabilities.max()

        # Print outputs
        print("Predicted sentiment:", predicted_sentiment)
        print("Probability:", round(probabilities.prob(predicted_sentiment), 2))
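
A hedged alternative to the manual loop above: NLTK's NaiveBayesClassifier can print the informative features together with their likelihood ratios in a single built-in call.

# Equivalent built-in report (sketch)
classifier.show_most_informative_features(N)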

Chapter 10/code/stemmer.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

input_words = ['writing', 'calves', 'be', 'branded', 'horse', 'randomize',
        'possibly', 'provision', 'hospital', 'kept', 'scratchy', 'code']

# Create various stemmer objects
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')

# Create a list of stemmer names for display
stemmer_names = ['PORTER', 'LANCASTER', 'SNOWBALL']
formatted_text = '{:>16}' * (len(stemmer_names) + 1)
print('\n', formatted_text.format('INPUT WORD', *stemmer_names),
        '\n', '='*68)

# Stem each word and display the output
for word in input_words:
    output = [word, porter.stem(word),
            lancaster.stem(word), snowball.stem(word)]
    print(formatted_text.format(*output))
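
An optional sketch (assumption): the Snowball stemmer is multilingual, and the language names it supports can be inspected before constructing one.

# Languages bundled with the Snowball stemmer (sketch)
print(SnowballStemmer.languages)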

Chapter 10/code/text_chunker.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
import numpy as np
from nltk.corpus import brown

# Split the input text into chunks, where
# each chunk contains N words
def chunker(input_data, N):
    input_words = input_data.split(' ')
    output = []

    cur_chunk = []
    count = 0
    for word in input_words:
        cur_chunk.append(word)
        count += 1
        if count == N:
            output.append(' '.join(cur_chunk))
            count, cur_chunk = 0, []

    output.append(' '.join(cur_chunk))

    return output

if __name__=='__main__':
    # Read the first 12000 words from the Brown corpus
    input_data = ' '.join(brown.words()[:12000])

    # Define the number of words in each chunk
    chunk_size = 700

    chunks = chunker(input_data, chunk_size)
    print('\nNumber of text chunks =', len(chunks), '\n')
    for i, chunk in enumerate(chunks):
        print('Chunk', i+1, '==>', chunk[:50])
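
A minimal usage sketch (illustrative input, not from the commit): chunker works on any whitespace-separated string, returning N-word pieces plus a shorter final piece for the remainder. Note that when the word count is an exact multiple of N, the final unconditional append adds an empty trailing chunk; adding an 'if cur_chunk:' guard would avoid that edge case.

from text_chunker import chunker

print(chunker('one two three four five', 2))
# Expected output: ['one two', 'three four', 'five']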

Chapter 10/code/tokenizer.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
from nltk.tokenize import sent_tokenize, \
        word_tokenize, WordPunctTokenizer

# Define input text
input_text = "Do you know how tokenization works? It's actually quite interesting! Let's analyze a couple of sentences and figure it out."

# Sentence tokenizer
print("\nSentence tokenizer:")
print(sent_tokenize(input_text))

# Word tokenizer
print("\nWord tokenizer:")
print(word_tokenize(input_text))

# WordPunct tokenizer
print("\nWord punct tokenizer:")
print(WordPunctTokenizer().tokenize(input_text))

Chapter 10/code/topic_modeler.py

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim import models, corpora

# Load input data
def load_data(input_file):
    data = []
    with open(input_file, 'r') as f:
        for line in f.readlines():
            data.append(line[:-1])

    return data

# Processor function for tokenizing, removing stop
# words, and stemming
def process(input_text):
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Create a Snowball stemmer
    stemmer = SnowballStemmer('english')

    # Get the list of stop words
    stop_words = stopwords.words('english')

    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words
    tokens = [x for x in tokens if not x in stop_words]

    # Perform stemming on the tokenized words
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return tokens_stemmed

if __name__=='__main__':
    # Load input data
    data = load_data('data.txt')

    # Create a list for sentence tokens
    tokens = [process(x) for x in data]

    # Create a dictionary based on the sentence tokens
    dict_tokens = corpora.Dictionary(tokens)

    # Create a document-term matrix
    doc_term_mat = [dict_tokens.doc2bow(token) for token in tokens]

    # Define the number of topics for the LDA model
    num_topics = 2

    # Generate the LDA model
    ldamodel = models.ldamodel.LdaModel(doc_term_mat,
            num_topics=num_topics, id2word=dict_tokens, passes=25)

    num_words = 5
    print('\nTop ' + str(num_words) + ' contributing words to each topic:')
    for item in ldamodel.print_topics(num_topics=num_topics, num_words=num_words):
        print('\nTopic', item[0])

        # Print the contributing words along with their relative contributions
        list_of_strings = item[1].split(' + ')
        for text in list_of_strings:
            weight = text.split('*')[0]
            word = text.split('*')[1]
            print(word, '==>', str(round(float(weight) * 100, 2)) + '%')
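
One parsing caveat (an assumption about the installed gensim version): print_topics() typically renders each term as weight*"word", so the word extracted above keeps its surrounding quotes. If cleaner output is wanted, they can be stripped.

# Hypothetical cleanup of the extracted term (sketch)
word = text.split('*')[1].strip().strip('"')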
