|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +""" |
| 3 | +Created on Thu Jan 17 21:36:10 2019 |
| 4 | +
|
| 5 | +@author: Koffi Moïse AGBENYA |
| 6 | +
|
| 7 | +CONTENT-BASED FILTERING |
| 8 | +
|
| 9 | +Recommendation systems are a collection of algorithms used to recommend items |
| 10 | +to users based on information taken from the user. These systems have become |
| 11 | +ubiquitous and can be commonly seen in online stores, movie databases and job |
| 12 | +finders. In this notebook, we will explore Content-based recommendation systems |
| 13 | +and implement a simple version of one using Python and the Pandas library. |
| 14 | +
|
| 15 | +ABOUT DATASET |
| 16 | +
|
| 17 | +This dataset (ml-latest) describes 5-star rating and free-text tagging activity |
| 18 | +from [MovieLens](http://movielens.org), a movie recommendation service. It |
| 19 | +contains 22884377 ratings and 586994 tag applications across 34208 movies. |
| 20 | +These data were created by 247753 users between January 09, 1995 and January |
| 21 | +29, 2016. This dataset was generated on January 29, 2016. |
| 22 | +
|
| 23 | +Users were selected at random for inclusion. All selected users had rated at |
| 24 | +least 1 movie. No demographic information is included. Each user is |
| 25 | +represented by an id, and no other information is provided. |
| 26 | +
|
| 27 | +The data are contained in four files, `links.csv`, `movies.csv`, `ratings.csv` |
| 28 | +and `tags.csv`. More details about the contents and use of all these files |
| 29 | +follow. |
| 30 | +
|
| 31 | +This is a *development* dataset. As such, it may change over time and is not an |
| 32 | +appropriate dataset for shared research results. |
| 33 | +
|
| 34 | +""" |
| 35 | + |
| 36 | +#Dataframe manipulation library |
| 37 | +import pandas as pd |
| 38 | +#Math functions, we'll only need the sqrt function so let's import only that |
| 39 | +from math import sqrt |
| 40 | +import numpy as np |
| 41 | +import matplotlib.pyplot as plt |
| 42 | + |
# Load the two MovieLens tables used below: movie metadata goes into
# movies_df and the per-user ratings into ratings_df. Both files are read
# from the current working directory, movies first.
movies_df, ratings_df = (
    pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv')
)
# Peek at the first rows of the movie table (head() returns 5 rows by default).
movies_df.head()
| 49 | + |
# Split "Title (YYYY)" into a clean title and a separate 'year' column.
#
# The patterns are raw strings so the backslashes reach the regex engine
# untouched; a plain '\(' or '\d' is an invalid string escape that newer
# Python versions warn about. The parentheses are part of the match, so a
# year that is merely part of the title text is not picked up; only the four
# digits inside the parentheses are captured. Titles without such a marker
# get NaN as their year.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)

# Remove the "(YYYY)" marker from the title. regex=True is required here:
# since pandas 2.0, str.replace treats the pattern as a literal string by
# default, which would leave every title unchanged and break the title
# matching done further down.
movies_df['title'] = movies_df['title'].str.replace(r'\(\d{4}\)', '', regex=True)
# Trim the whitespace left behind where the year used to be.
movies_df['title'] = movies_df['title'].str.strip()
movies_df.head()

# Genres arrive as one '|'-separated string per movie; turn each into a list.
movies_df['genres'] = movies_df['genres'].str.split('|')
movies_df.head()
| 67 | + |
| 68 | +""" |
| 69 | +
|
| 70 | +Since keeping genres in a list format isn't optimal for the content-based |
| 71 | +recommendation system technique, we will use the One Hot Encoding technique to |
| 72 | +convert the list of genres to a vector where each column corresponds to one |
| 73 | +possible value of the feature. This encoding is needed for feeding categorical |
| 74 | +data. In this case, we store every different genre in columns that contain |
| 75 | +either 1 or 0. 1 shows that a movie has that genre and 0 shows that it doesn't. |
| 76 | +Let's also store this dataframe in another variable since genres won't be |
| 77 | +important for our first recommendation system. |
| 78 | +
|
| 79 | +""" |
| 80 | + |
# Build a one-hot encoded version of the movie table in a new dataframe:
# the original columns plus one indicator column per genre, holding 1.0 when
# the movie has that genre and 0.0 when it does not. movies_df itself is not
# modified.
#
# The genre lists are re-joined with '|' so that str.get_dummies can create
# all indicator columns in one vectorised pass. This replaces a row-by-row
# loop that wrote a single cell at a time, and it removes the need for a
# blanket fillna(0) over the whole frame, which also overwrote missing 'year'
# values with 0. Indicator columns come out in sorted genre order.
genre_indicators = movies_df['genres'].str.join('|').str.get_dummies(sep='|')
moviesWithGenres_df = pd.concat(
    [movies_df, genre_indicators.astype(float)], axis=1
)
moviesWithGenres_df.head()
| 94 | + |
# Take a first look at the ratings dataframe.
ratings_df.head()

# Every row in the ratings dataframe has a user id associated with a movie,
# a rating and a timestamp showing when it was reviewed. The timestamp is not
# used anywhere below, so drop it to save memory.

# The column is named through the columns= keyword: passing the axis as a
# second positional argument (drop('timestamp', 1)) is rejected by pandas 2.0
# and later.
ratings_df = ratings_df.drop(columns='timestamp')
ratings_df.head()
| 105 | + |
| 106 | +#Content-Based recommendation system |
| 107 | + |
| 108 | +""" |
| 109 | +
|
| 110 | +Now, let's take a look at how to implement Content-Based or Item-Item |
| 111 | +recommendation systems. This technique attempts to figure out what a user's |
| 112 | +favourite aspects of an item are, and then recommends items that present those |
| 113 | +aspects. In our case, we're going to try to figure out the input's favorite |
| 114 | +genres from the movies and ratings given. |
| 115 | +
|
| 116 | +Let's begin by creating an input user to recommend movies to: |
| 117 | +
|
| 118 | +Notice: To add more movies, simply increase the amount of elements in the |
| 119 | +userInput. Feel free to add more in! Just be sure to write it in with capital |
| 120 | +letters and if a movie starts with a "The", like "The Matrix" then write it in |
| 121 | +like this: 'Matrix, The' . |
| 122 | +
|
| 123 | +""" |
| 124 | + |
# Example input user, given as (title, rating) pairs and expanded into one
# {'title': ..., 'rating': ...} record per movie. Titles with a leading
# article are written with the article at the end, e.g. 'Breakfast Club, The'.
userInput = [
    {'title': movie_title, 'rating': score}
    for movie_title, score in (
        ('Breakfast Club, The', 5),
        ('Toy Story', 3.5),
        ('Jumanji', 2),
        ('Pulp Fiction', 5),
        ('Akira', 4.5),
    )
]
# One row per rated movie, with a 'title' and a 'rating' column.
inputMovies = pd.DataFrame(data=userInput)
inputMovies
| 134 | + |
| 135 | +""" |
| 136 | +
|
| 137 | +Add movieId to input user |
| 138 | +With the input complete, let's extract the input movies' IDs from the movies |
| 139 | +dataframe and add them into it. |
| 140 | +
|
| 141 | +We can achieve this by first filtering out the rows that contain the input |
| 142 | +movies' title and then merging this subset with the input dataframe. We also |
| 143 | +drop unnecessary columns for the input to save memory space. |
| 144 | +
|
| 145 | +""" |
| 146 | + |
# Select the rows of the movie table whose title is one of the input titles.
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
# Join the user's ratings onto those rows to obtain the movieId of each input
# movie. The join key is spelled out rather than left to pandas' default of
# "all shared column names", so an extra shared column cannot change the join.
inputMovies = pd.merge(inputId, inputMovies, on='title')
# Drop the columns the input dataframe does not need. They are named through
# columns= because pandas 2.0 and later no longer accept the axis as a second
# positional argument to drop().
inputMovies = inputMovies.drop(columns=['genres', 'year'])
# Final input dataframe.
# If a movie added to userInput is missing here, it is either absent from the
# movie table or spelled differently there; check the capitalisation.
inputMovies
| 157 | + |
# Start learning the input's preferences: take the one-hot encoded rows of
# the movies the input user has rated.
userMovies = moviesWithGenres_df[moviesWithGenres_df['movieId'].isin(inputMovies['movieId'].tolist())]
userMovies

# Only the genre indicator columns are needed from here on.

# Relabel the rows 0..n-1 so they no longer carry their positions from the
# full movie table.
userMovies = userMovies.reset_index(drop=True)
# Drop the non-genre columns to save memory. They are named through columns=
# because pandas 2.0 and later no longer accept the axis as a second
# positional argument to drop().
userGenreTable = userMovies.drop(columns=['movieId', 'title', 'genres', 'year'])
userGenreTable
| 173 | + |
| 174 | +""" |
| 175 | +
|
| 176 | +Now we're ready to start learning the input's preferences! |
| 177 | +
|
| 178 | +To do this, we're going to turn each genre into weights. We can do this by |
| 179 | +using the input's reviews and multiplying them into the input's genre table and |
| 180 | +then summing up the resulting table by column. This operation is actually a dot |
| 181 | +product between a matrix and a vector, so we can simply accomplish this by calling |
| 182 | +Pandas's "dot" function. |
| 183 | +
|
| 184 | +""" |
| 185 | + |
# The ratings given by the input user, one per matched movie.
inputMovies['rating']

# User profile: one weight per genre, obtained as the matrix product of the
# transposed genre table (genres x movies) with the rating vector.
userProfile = userGenreTable.T @ inputMovies['rating']
# Show the resulting weights.
userProfile
| 192 | + |
# The user profile holds one weight per genre. To score every movie against
# it, extract the genre indicator table of the whole catalogue.

# Index the one-hot encoded table by movieId. set_index with a column name
# also removes that column from the data, so no separate drop is needed.
genreTable = moviesWithGenres_df.set_index('movieId')
# Drop the remaining non-genre columns. They are named through columns=
# because pandas 2.0 and later no longer accept the axis as a second
# positional argument to drop().
genreTable = genreTable.drop(columns=['title', 'genres', 'year'])
genreTable.head()

genreTable.shape
| 205 | + |
# Score every movie against the user profile: scale each genre column by the
# matching profile weight, add the scaled values up per movie, and normalise
# by the total profile weight to get a weighted average.
recommendationTable_df = (
    genreTable.mul(userProfile, axis='columns')
    .sum(axis='columns')
    .div(userProfile.sum())
)
recommendationTable_df.head()

# Put the best-scoring movies first.
recommendationTable_df = recommendationTable_df.sort_values(ascending=False)
# Quick look at the highest scores.
recommendationTable_df.head()
| 218 | + |
# The final recommendation table: the 20 best-scoring movies, listed from the
# strongest match to the weakest.
#
# Filtering movies_df with isin() would return the rows in movies_df order and
# lose the ranking computed above, so the ids are looked up by label in score
# order instead; reset_index() turns movieId back into a regular column.
movies_df.set_index('movieId').loc[
    recommendationTable_df.head(20).index
].reset_index()
| 223 | + |
| 224 | + |
0 commit comments