-
Notifications
You must be signed in to change notification settings - Fork 65
/
utils.py
91 lines (73 loc) · 2.62 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Utilities to process movies and ratings.
# They will output only popular and recent movies for a better experience.
import re
import csv
year_regex = re.compile('\d{4}')
title_regex = re.compile('(.*)\(\d{4}\)')
def get_movie_year(movie):
match = year_regex.search(movie)
if match is None:
return None
return int(match.group(0))
def get_appreciation(score):
score = float(score)
if score >= 4:
return True
if score <= 2:
return False
return None
def transform_movies():
movies_list = []
movies_file = open('./ml-latest-small/movies.csv', 'rt')
movies = csv.reader(movies_file, delimiter=',')
is_first = True
movies_list.append(["id", "title", "year", "genre"])
for movie in movies:
if is_first:
is_first = False
continue
year = get_movie_year(movie[1])
title = title_regex.match(movie[1]).group(1).strip() if year else movie[1]
movies_list.append([int(movie[0]), title, year, movie[2]])
popular_movies_file = open('./ml-latest-small/movies-clean.csv', 'wt')
popular_movies_writer = csv.writer(popular_movies_file, delimiter=',')
for row in movies_list:
popular_movies_writer.writerow(row)
movies_dict = dict()
is_first = True
for movie in movies_list:
if is_first:
is_first = False
continue
movies_dict[movie[0]] = movie
return movies_dict
def transform_ratings(movies_dict):
ratings_list = []
popular_ratings = []
movies_occurences = dict()
ratings_file = open('./ml-latest-small/ratings.csv', 'rt')
ratings = csv.reader(ratings_file, delimiter=',')
is_first = True
for rating in ratings:
if is_first:
is_first = False
continue
is_appreciated = get_appreciation(rating[2])
if is_appreciated is not None:
ratings_list.append([int(rating[0]), int(rating[1]), rating[2]])
for rating in ratings_list:
movie = rating[1]
if movie in movies_occurences:
movies_occurences[movie] += 1
else:
movies_occurences[movie] = 1
popular_ratings.append(["user", "movie", "score"])
for rating in ratings_list:
if movies_occurences[rating[1]] > 25 and movies_dict[rating[1]][2] > 2000:
popular_ratings.append(rating)
popular_ratings_file = open('./ml-latest-small/ratings-popular.csv', 'wt')
popular_ratings_writer = csv.writer(popular_ratings_file, delimiter=',')
for row in popular_ratings:
popular_ratings_writer.writerow(row)
movies = transform_movies()
transform_ratings(movies)