# data_preparation.py
#
# Builds the movie list and the cosine-similarity matrix from the TMDB 5000
# CSV files and pickles both under model/.
import pandas as pd
import json
import pickle
import os
import warnings
import pandas as pd
# Silence pandas' SettingWithCopyWarning for the whole script.
# Newer pandas releases expose the class from ``pandas.errors`` and no longer
# from ``pandas.core.common``, so try the public location first and fall back
# to the legacy one. If neither import works, skip the filter instead of
# aborting the script on an ImportError.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None
if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
# Input data: the TMDB 5000 movie metadata dataset from Kaggle
# https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata/code?datasetId=138&sortBy=voteCount
# Both CSV files are read from the local data/ directory.
credits = pd.read_csv(filepath_or_buffer='data/tmdb_5000_credits.csv')
movies = pd.read_csv(filepath_or_buffer='data/tmdb_5000_movies.csv')
def data_preparation(dataframe, column_list):
    """Rewrite JSON-encoded list columns as the string form of their names.

    Every cell of each column in ``column_list`` is parsed with
    ``json.loads`` and must yield an iterable of objects carrying a
    ``'name'`` key. The cell is replaced by ``str([...names...])``, for
    example ``"['Action', 'Drama']"``.

    Args:
        dataframe: pandas DataFrame to convert; it is modified in place.
        column_list: iterable of column labels to convert.

    Returns:
        None. The result is written back into ``dataframe``.

    Raises:
        TypeError / json.JSONDecodeError: if a cell is not valid JSON text.
        KeyError: if a parsed entry has no ``'name'`` key.
    """
    def _names_as_text(raw):
        # Parse one cell and keep only the 'name' of each entry.
        return str([entry['name'] for entry in json.loads(raw)])

    for column in column_list:
        # One pass per column. Each cell is converted independently, so the
        # result is correct even when the index contains duplicate labels
        # (a label-based write-back would overwrite every row sharing it).
        dataframe[column] = dataframe[column].apply(_names_as_text)
# Flatten the JSON list columns to plain name lists (stored as text):
# three descriptive columns on the movies table, the cast on the credits table.
data_preparation(
    dataframe=movies,
    column_list=['genres', 'keywords', 'spoken_languages'],
)
data_preparation(dataframe=credits, column_list=['cast'])
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    ``x`` is an iterable of crew mappings with ``'job'`` and ``'name'`` keys.
    The result is ``None`` when no entry has the job ``'Director'``.
    """
    # Lazy scan: stops at the first match, exactly like an early return.
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, None)
# Parse the JSON-encoded crew list and collapse it to the director's name
# (None when no crew entry has the job 'Director').
credits['crew'] = credits['crew'].apply(json.loads).apply(get_director)
credits.rename(columns={'crew': 'director'}, inplace=True)

# Left join: every movie is kept even if it has no matching credits row.
movies = movies.merge(credits, left_on='id', right_on='movie_id', how='left')

# Work on an explicit copy so adding 'tags' is not a chained assignment
# on a slice of `movies`.
df = movies[['id', 'original_title', 'overview', 'genres', 'cast',
             'vote_average', 'director', 'keywords']].copy()

# One free-text field per movie; it is NaN whenever any part is missing
# (for example no overview or no director).
df['tags'] = df['overview'] + df['genres'] + df['keywords'] + df['cast'] + df['director']

# Drop incomplete rows, then renumber 0..n-1 so that a row's index label
# equals its position. Any matrix built row-by-row from this frame can then
# be addressed with those labels.
# NOTE(review): confirm no consumer relies on the pre-dropna index labels.
final_data = df[['id', 'original_title', 'tags']].dropna().reset_index(drop=True)
#################################
# 1. Building the TF-IDF matrix for the tag text
#################################
tfidf = TfidfVectorizer(stop_words='english')
# Guard against missing tags so the vectorizer only ever receives strings.
final_data['tags'] = final_data['tags'].fillna('')
tfidf_matrix = tfidf.fit_transform(final_data['tags'])

#################################
# 2. Building the cosine similarity matrix
#################################
# Pairwise similarity between every pair of rows of the TF-IDF matrix.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Persist both artifacts. The context managers make sure each file is
# flushed and closed, even if pickling fails part-way through.
os.makedirs('model', exist_ok=True)
with open('model/movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(final_data, movie_list_file)
with open('model/cosine_sim.pkl', 'wb') as cosine_sim_file:
    pickle.dump(cosine_sim, cosine_sim_file)