Importing the data
First, we import pandas and read in the assignment data.
import pandas as pd
# Load the movie titles.  The file is read with header=None (its first line is
# treated as data) and its first column becomes the index.
titles = pd.read_csv(
    'pa1/recsys%2Fdata%2Fmovie-titles.csv',
    names=['title'],
    index_col=0,
    header=None,
)
# Mirror the index values into a regular ``movieId`` column so they can be
# mapped over like any other column.
titles['movieId'] = titles.index.values
titles.head(n=5)
title | movieId | |
---|---|---|
11 | Star Wars: Episode IV - A New Hope (1977)" | 11 |
12 | Finding Nemo (2003)" | 12 |
13 | Forrest Gump (1994)" | 13 |
14 | American Beauty (1999)" | 14 |
22 | Pirates of the Caribbean: The Curse of the Bla... | 22 |
# Load the rating rows as (userId, movieId, rating); header=None means the
# first line of the file is treated as data, not as column names.
ratings = pd.read_csv(
    'pa1/recsys%2Fdata%2Fratings.csv',
    names=['userId', 'movieId', 'rating'],
    header=None,
)
ratings.head(n=5)
userId | movieId | rating | |
---|---|---|---|
0 | 1 | 809 | 4.0 |
1 | 1 | 601 | 5.0 |
2 | 1 | 238 | 5.0 |
3 | 1 | 664 | 4.5 |
4 | 1 | 3049 | 3.0 |
# Load the user table as (userId, uniqueId); header=None means the first line
# of the file is treated as data, not as column names.
users = pd.read_csv(
    'pa1/recsys%2Fdata%2Fusers.csv',
    names=['userId', 'uniqueId'],
    header=None,
)
users.head(n=5)
userId | uniqueId | |
---|---|---|
0 | 1000 | kmh1234-wd4321-iamawesome6789 |
1 | 1001 | ca6faa08-232e-4ac1-a364-13c062fd3ae4 |
2 | 1002 | rdb |
3 | 1003 | skins428 |
4 | 1004 | ergunner |
# Ids of the movies assigned to me; every solution below is computed for these.
pa1_inputs = [
    122,
    558,
    788,
]
pa1_inputs
[122, 558, 788]
Users who watched my movies
I check the ratings for one of my assigned movies (122).
# First few rating rows for the first assigned movie.
ratings.loc[ratings['movieId'] == pa1_inputs[0]].head(n=5)
userId | movieId | rating | |
---|---|---|---|
20 | 1 | 122 | 3.5 |
89 | 2 | 122 | 4.5 |
160 | 4 | 122 | 2.0 |
237 | 5 | 122 | 3.5 |
333 | 6 | 122 | 3.0 |
Ratings for my movies
Let’s gather the ratings for all my assigned movies.
# One DataFrame of rating rows per assigned movie, in the order of pa1_inputs.
# Built as real lists (not ``map``) so the result can be displayed and
# re-iterated regardless of Python version, and without shadowing ``id``.
myRatings = [ratings[ratings.movieId == movie_id] for movie_id in pa1_inputs]
[frame.head(5) for frame in myRatings]
[ userId movieId rating
20 1 122 3.5
89 2 122 4.5
160 4 122 2.0
237 5 122 3.5
333 6 122 3.0,
userId movieId rating
33 1 558 3.0
264 5 558 2.5
368 7 558 2.5
406 8 558 3.5
437 9 558 3.5,
userId movieId rating
22 1 788 4.0
118 3 788 3.0
171 4 788 3.0
213 5 788 2.5
322 6 788 2.0]
I then explore the case of movie 122 first, without generalizing, in order to figure out if I have the math right. I first get the users who watched movie 122, and get the count.
# Number of rating rows (hence raters) recorded for movie 122.
ratings.loc[ratings['movieId'] == 122, 'userId'].count()
4393
# Ids of the users with a rating row for movie 122, as a NumPy array.
users122 = ratings.loc[ratings['movieId'] == 122, 'userId'].values
# Kept as a float so that later ratios use true division.
count122 = float(len(users122))
count122
4393.0
I then get all the movie ratings of users who watched movie 122.
# Boolean mask over the rating rows: True where the row's user is one of the
# users who rated movie 122.  ``isin`` performs the membership test in a
# single vectorised call instead of scanning the ``users122`` array once per
# rating row.
filter122 = ratings.userId.isin(users122)
filter122.describe()
count 338355
mean 0.8752878
std 0.3303928
min False
25% 1
50% 1
75% 1
max True
dtype: object
# Every rating row that belongs to a user who also rated movie 122.
ratings122 = ratings.loc[filter122]
ratings122.head(n=5)
userId | movieId | rating | |
---|---|---|---|
0 | 1 | 809 | 4.0 |
1 | 1 | 601 | 5.0 |
2 | 1 | 238 | 5.0 |
3 | 1 | 664 | 4.5 |
4 | 1 | 3049 | 3.0 |
And finally perform the rating calculation.
def rec122(id):
    """Return the percentage of movie-122 raters who also rated movie ``id``.

    Counts the rows of ``ratings122`` whose movieId equals ``id`` and divides
    by ``count122`` (the number of users who rated movie 122), times 100.
    """
    watched_both = (ratings122.movieId == id).sum()
    return float(watched_both) / count122 * 100
# Score every title against movie 122.
titles['rec122'] = titles.movieId.map(rec122)
# Rows 1..5 of the descending ranking; row 0 is skipped because it is expected
# to be movie 122 itself (which scores 100).  ``sort_values`` / ``.loc`` replace
# the removed ``DataFrame.sort`` / ``.ix`` APIs.
titles.sort_values('rec122', ascending=False)[1:6].loc[:, ['movieId', 'rec122', 'title']]
movieId | rec122 | title | |
---|---|---|---|
120 | 120 | 95.128614 | The Lord of the Rings: The Fellowship of the R... |
121 | 121 | 94.627817 | The Lord of the Rings: The Two Towers (2002)" |
603 | 603 | 93.922149 | The Matrix (1999)" |
597 | 597 | 89.050763 | Titanic (1997)" |
604 | 604 | 88.071933 | The Matrix Reloaded (2003)" |
I got suspicious when I saw the top recommendations, so just to double check that 122 is actually LOTR3, I look at the titles info.
# Label-based lookup of the row whose index (movie id) is 122; ``.loc``
# replaces the removed ``.ix`` indexer.
titles.loc[122]
title The Lord of the Rings: The Return of the King ...
movieId 122
rec122 100
Name: 122, dtype: object
Generalize the exploration
I now abstract what I gathered from the exploration and put it into functions. The names are slightly verbose but it made it easier for me to follow what was going on.
def usersWhoWatched(movieId):
    """Return a frozenset of the user ids that have a rating row for ``movieId``."""
    is_movie = ratings.movieId == movieId
    return frozenset(ratings.userId[is_movie].values)
# Every distinct user id that appears in the ratings table.
allUsers = frozenset(ratings['userId'].values)
def moviesWatchedByUser(userId):
    """Return a frozenset of the movie ids that ``userId`` has a rating row for."""
    is_user = ratings.userId == userId
    return frozenset(ratings.movieId[is_user].values)
def usersWhoDidntWatch(movieId):
    """Return the users in ``allUsers`` that have no rating row for ``movieId``."""
    watchers = usersWhoWatched(movieId)
    return allUsers - watchers
def ratingsByUsers(users):
    """Return the rows of ``ratings`` whose userId is one of ``users``.

    Parameters
    ----------
    users : iterable of user ids
        Typically the frozenset returned by ``usersWhoWatched`` or
        ``usersWhoDidntWatch``; any iterable of ids is accepted.

    Returns
    -------
    pandas.DataFrame
        The matching subset of ``ratings`` (empty when ``users`` is empty).
    """
    # ``isin`` does the membership test in one vectorised call rather than
    # invoking a Python lambda per rating row.  Materialising to a list keeps
    # the call independent of the container type that was passed in.
    return ratings[ratings.userId.isin(list(users))]
def recommendationsForMovie(movieId):
    """Rank every titled movie by how many raters of ``movieId`` also rated it.

    The score of a movie Y is
    (users who rated both ``movieId`` and Y) / (users who rated ``movieId``).

    Parameters
    ----------
    movieId : int
        Id of the reference movie.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``titles`` with columns ``rating`` (the score), ``id``
        (the index values as uint32) and ``title``, sorted by ``rating`` in
        descending order.  The row of ``movieId`` itself is excluded.

    Raises
    ------
    ZeroDivisionError
        If no user has rated ``movieId`` (and ``titles`` is not empty).
    """
    users = usersWhoWatched(movieId)
    userCount = float(len(users))
    # Count the co-ratings of every movie once, instead of re-filtering the
    # whole ratings subset for each title.
    countsBoth = ratingsByUsers(users).movieId.value_counts()

    def recHelper(id):
        return float(countsBoth.get(id, 0)) / userCount

    # Build the frame from a dict: passing a *named* Series together with
    # ``columns=`` makes pandas look for a column of that name and yields NaN.
    data = pd.DataFrame({'rating': titles.movieId.map(recHelper)})
    data['id'] = data.index.values.astype('uint32')
    data['title'] = titles.title
    # Drop the reference movie explicitly rather than assuming that it is the
    # first row after sorting (ties, or a movie missing from ``titles``, would
    # otherwise remove a genuine recommendation).
    data = data[data.id != movieId]
    return data.sort_values('rating', ascending=False)
def recommendationsAdvancedForMovie(movieId):
    """Rank every titled movie by a ratio of co-rating rates for ``movieId``.

    The score of a movie Y is

        (raters of both ``movieId`` and Y / raters of ``movieId``)
        -----------------------------------------------------------
        (raters of Y but not ``movieId`` / non-raters of ``movieId``)

    Parameters
    ----------
    movieId : int
        Id of the reference movie.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``titles`` with columns ``rating`` (the score), ``id``
        (the index values as uint32) and ``title``, sorted by ``rating`` in
        descending order.  The row of ``movieId`` itself is excluded.

    Raises
    ------
    ZeroDivisionError
        If nobody, or everybody, has rated ``movieId`` (and ``titles`` is not
        empty).
    """
    users = usersWhoWatched(movieId)
    userCount = float(len(users))
    notUsers = usersWhoDidntWatch(movieId)
    notUserCount = float(len(notUsers))
    # Count the ratings of every movie once per user group, instead of
    # re-filtering both rating subsets for each title.
    countsXY = ratingsByUsers(users).movieId.value_counts()
    countsNotXY = ratingsByUsers(notUsers).movieId.value_counts()

    def recHelper(id):
        countXY = float(countsXY.get(id, 0))
        # The small constant keeps the denominator non-zero for movies that no
        # non-rater of ``movieId`` has rated.
        countNotXY = float(countsNotXY.get(id, 0)) + 0.001
        return (countXY / userCount) / (countNotXY / notUserCount)

    # Build the frame from a dict: passing a *named* Series together with
    # ``columns=`` makes pandas look for a column of that name and yields NaN.
    data = pd.DataFrame({'rating': titles.movieId.map(recHelper)})
    data['id'] = data.index.values.astype('uint32')
    data['title'] = titles.title
    # Drop the reference movie explicitly rather than assuming that it is the
    # first row after sorting.
    data = data[data.id != movieId]
    return data.sort_values('rating', ascending=False)
These functions now create the output for the assignment.
def pa1Solve(id):
    """Return the top five simple recommendations for movie ``id`` as a flat list.

    The list alternates movie id and score (rounded to two decimals):
    ``[id1, score1, id2, score2, ...]``.
    """
    top = recommendationsForMovie(id).head(5)
    flat = []
    for movie, score in zip(top.id, top.rating.values):
        flat.append(movie)
        flat.append(round(score, 2))
    return flat
def pa1Solver(ids, f):
    """Apply ``f`` to every id and collect the results as rows of a DataFrame.

    Parameters
    ----------
    ids : iterable
        The ids to solve for; they become the index of the result.
    f : callable
        Called once per id, in order; must return a list-like row.

    Returns
    -------
    pandas.DataFrame
        One row per id (row ``i`` is ``f(ids[i])``), indexed by the ids, with
        default integer column labels.
    """
    # Materialise once so that one-shot iterables can feed both the rows and
    # the index.
    ids = list(ids)
    rows = [f(item) for item in ids]
    return pd.DataFrame(rows, index=ids)
def pa1SolveAll(ids):
    """Run ``pa1Solve`` for every id in ``ids`` and return the table built by ``pa1Solver``."""
    solution = pa1Solver(ids, pa1Solve)
    return solution
def pa1_b_Solve(id):
    """Return the top five advanced recommendations for movie ``id`` as a flat list.

    The list alternates movie id and score (rounded to two decimals):
    ``[id1, score1, id2, score2, ...]``.
    """
    top = recommendationsAdvancedForMovie(id).head(5)
    flat = []
    for movie, score in zip(top.id, top.rating.values):
        flat.append(movie)
        flat.append(round(score, 2))
    return flat
def pa1_b_SolveAll(ids):
    """Run ``pa1_b_Solve`` for every id in ``ids`` and return the table built by ``pa1Solver``."""
    solution = pa1Solver(ids, pa1_b_Solve)
    return solution
And victory, let’s create the solutions and save them to CSV.
# Simple-formula solution table for the assigned movies (one row per movie).
pa1Solution = pa1SolveAll(ids=pa1_inputs)
pa1Solution
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
122 | 120 | 0.95 | 121 | 0.95 | 603 | 0.94 | 597 | 0.89 | 604 | 0.88 |
558 | 603 | 0.93 | 557 | 0.93 | 597 | 0.91 | 607 | 0.89 | 604 | 0.88 |
788 | 603 | 0.94 | 329 | 0.91 | 607 | 0.91 | 13 | 0.91 | 597 | 0.90 |
pa1Solution.to_csv('pa1/pa1Solution1.csv', header=None)
!cat pa1/pa1Solution1.csv
122,120,0.95,121,0.95,603,0.94,597,0.89,604,0.88
558,603,0.93,557,0.93,597,0.91,607,0.89,604,0.88
788,603,0.94,329,0.91,607,0.91,13,0.91,597,0.9
pa1_b_SolveAll(pa1_inputs).to_csv('pa1/pa1_b_Solution.csv', header=None)
!cat pa1/pa1_b_Solution.csv
122,121,4.86,243,3.69,120,3.69,2164,3.56,1894,3.17
558,36658,3.85,414,2.99,786,2.58,557,2.56,9331,2.45
788,9331,4.85,243,4.68,786,3.85,134,3.63,1900,3.44
import sys
# Simple-formula solution for another set of movie ids, printed as CSV to
# standard output instead of being written to a file.
pa1SolveAll(ids=[11, 121, 8587]).to_csv(path_or_buf=sys.stdout, header=None)
11,603,0.96,1892,0.94,1891,0.94,120,0.93,1894,0.93
121,120,0.95,122,0.95,603,0.94,597,0.89,604,0.88
8587,603,0.92,597,0.9,607,0.87,120,0.86,13,0.86