Manuel Odendahl bio photo

Manuel Odendahl

wesen

Email

Importing the data

First, we import pandas and read in the assignment data.

import pandas as pd
# Load the movie titles. The CSV has no header row; index_col=0 makes the
# first CSV column the index and the remaining column is named 'title'.
titles = pd.read_csv('pa1/recsys%2Fdata%2Fmovie-titles.csv', header=None,index_col=0, names=['title'])
# Copy the index into a regular 'movieId' column so it can be mapped over later.
titles['movieId'] = titles.index.values
titles.head(5)
title movieId
11 Star Wars: Episode IV - A New Hope (1977)" 11
12 Finding Nemo (2003)" 12
13 Forrest Gump (1994)" 13
14 American Beauty (1999)" 14
22 Pirates of the Caribbean: The Curse of the Bla... 22
# Load the ratings. The CSV has no header row, so the three columns are
# named explicitly: userId, movieId, rating.
ratings = pd.read_csv('pa1/recsys%2Fdata%2Fratings.csv', header=None, names=['userId', 'movieId', 'rating'])
ratings.head(5)
userId movieId rating
0 1 809 4.0
1 1 601 5.0
2 1 238 5.0
3 1 664 4.5
4 1 3049 3.0
# Load the user table. The CSV has no header row, so the two columns are
# named explicitly: userId, uniqueId.
users = pd.read_csv('pa1/recsys%2Fdata%2Fusers.csv', header=None, names=['userId','uniqueId'])
users.head(5)
userId uniqueId
0 1000 kmh1234-wd4321-iamawesome6789
1 1001 ca6faa08-232e-4ac1-a364-13c062fd3ae4
2 1002 rdb
3 1003 skins428
4 1004 ergunner
pa1_inputs = [122, 558, 788]
pa1_inputs
[122, 558, 788]

Users who watched my movies

I check the ratings for one of my assigned movies (122).

ratings[ratings.movieId == pa1_inputs[0]].head(5)
userId movieId rating
20 1 122 3.5
89 2 122 4.5
160 4 122 2.0
237 5 122 3.5
333 6 122 3.0

Ratings for my movies

Let’s gather the ratings for all my assigned movies.

# One ratings sub-frame per assigned movie. Built as a real list so it can be
# iterated more than once (on Python 3, map() returns a one-shot iterator).
myRatings = [ratings[ratings.movieId == movie_id] for movie_id in pa1_inputs]
[movie_ratings.head(5) for movie_ratings in myRatings]
[     userId  movieId  rating
20        1      122     3.5
89        2      122     4.5
160       4      122     2.0
237       5      122     3.5
333       6      122     3.0,
      userId  movieId  rating
33        1      558     3.0
264       5      558     2.5
368       7      558     2.5
406       8      558     3.5
437       9      558     3.5,
      userId  movieId  rating
22        1      788     4.0
118       3      788     3.0
171       4      788     3.0
213       5      788     2.5
322       6      788     2.0]

I then explore the case of movie 122 first, without generalizing, in order to figure out if I have the math right. I first get the users who watched movie 122, and get the count.

(ratings[ratings.movieId == 122]).userId.count()
4393
users122 = ratings[ratings.movieId == 122].userId.values
count122 = float(len(users122))
count122
4393.0

I then get all the movie ratings of users who watched movie 122.

# Boolean mask over the ratings rows: True where the row's user is one of the
# users who rated movie 122. isin() does the membership test in one vectorized
# call instead of scanning the users122 array once per row.
filter122 = ratings.userId.isin(users122)
filter122.describe()
count       338355
mean     0.8752878
std      0.3303928
min          False
25%              1
50%              1
75%              1
max           True
dtype: object
ratings122 = ratings[filter122]
ratings122.head(5)
userId movieId rating
0 1 809 4.0
1 1 601 5.0
2 1 238 5.0
3 1 664 4.5
4 1 3049 3.0

And finally perform the rating calculation.

def rec122(id):
    """Return the percentage of movie-122 raters who also rated movie `id`.

    Reads the module-level `ratings122` frame (ratings by users who rated
    movie 122) and `count122` (how many such users there are).
    """
    alsoRated = ratings122.movieId == id
    overlap = float(alsoRated.sum())
    return overlap / count122 * 100
titles['rec122'] = titles.movieId.map(rec122)
# Rank by score, skip the first row of the ranking and show the next five.
# sort_values() and .loc replace DataFrame.sort(column) and .ix, which no
# longer exist in current pandas.
titles.sort_values('rec122', ascending=False)[1:6].loc[:, ['movieId','rec122','title']]
movieId rec122 title
120 120 95.128614 The Lord of the Rings: The Fellowship of the R...
121 121 94.627817 The Lord of the Rings: The Two Towers (2002)"
603 603 93.922149 The Matrix (1999)"
597 597 89.050763 Titanic (1997)"
604 604 88.071933 The Matrix Reloaded (2003)"

I got suspicious when I saw the top recommendations, so just to double check that 122 is actually LOTR3, I look at the titles info.

# Label-based lookup of the row whose index (movie id) is 122; .loc replaces
# the removed .ix indexer.
titles.loc[122]
title      The Lord of the Rings: The Return of the King ...
movieId                                                  122
rec122                                                   100
Name: 122, dtype: object

Generalize the exploration

I now abstract what I gathered from the exploration and put it into functions. The names are slightly verbose but it made it easier for me to follow what was going on.

def usersWhoWatched(movieId):
    """Return a frozenset of the userIds that have a rating row for `movieId`."""
    forThisMovie = ratings.movieId == movieId
    return frozenset(ratings.userId[forThisMovie].values)
# Every distinct userId that appears in the ratings table.
allUsers = frozenset(ratings['userId'].values)
def moviesWatchedByUser(userId):
    """Return a frozenset of the movieIds that `userId` has a rating row for."""
    byThisUser = ratings.userId == userId
    return frozenset(ratings.movieId[byThisUser].values)
def usersWhoDidntWatch(movieId):
    """Return a frozenset of the known userIds with no rating row for `movieId`."""
    watchers = usersWhoWatched(movieId)
    return allUsers - watchers
def ratingsByUsers(users):
    """Return the rows of `ratings` whose userId is contained in `users`.

    `users` is a collection of user ids; the callers in this file pass
    frozensets. The membership test is done with a single vectorized
    `isin` call rather than a Python-level lambda per row.
    """
    # list() hands isin() a plain list-like whatever container was passed in.
    _filter = ratings.userId.isin(list(users))
    return ratings[_filter]
def recommendationsForMovie(movieId):
    """Rank every title by how often raters of `movieId` also rated it.

    A title's score is the fraction of the users who rated `movieId` that
    also have a rating row for that title.

    Returns a DataFrame indexed like `titles` with the columns 'rating'
    (the score), 'id' (the index value as uint32) and 'title', sorted by
    descending score. The row for `movieId` itself is excluded.

    If no user rated `movieId`, the per-title division raises
    ZeroDivisionError.
    """
    users = usersWhoWatched(movieId)
    userCount = float(len(users))
    # Count rating rows per movie once, instead of re-filtering the whole
    # ratings frame for every title.
    countsByMovie = ratingsByUsers(users).movieId.value_counts()

    def recHelper(id):
        countBoth = float(countsByMovie.get(id, 0))
        return countBoth / userCount

    # Build the frame from a dict so the column is named 'rating' regardless
    # of the name carried by the mapped Series.
    data = pd.DataFrame({'rating': titles.movieId.map(recHelper)})
    data['id'] = data.index.values.astype('uint32')
    data['title'] = titles.title
    # Exclude the queried movie by id rather than by dropping the top-ranked
    # row: a tie at the top, or a movie missing from `titles`, would
    # otherwise remove a genuine recommendation.
    data = data[data.id != movieId]
    return data.sort_values('rating', ascending=False)
def recommendationsAdvancedForMovie(movieId):
    """Rank every title by how much more often raters of `movieId` rated it.

    A title's score is the ratio of two fractions:
    (share of `movieId` raters who also rated the title) divided by
    (share of non-raters of `movieId` who rated the title).

    Returns a DataFrame indexed like `titles` with the columns 'rating'
    (the score), 'id' (the index value as uint32) and 'title', sorted by
    descending score. The row for `movieId` itself is excluded.

    If nobody rated `movieId`, or everybody did, the per-title division
    raises ZeroDivisionError.
    """
    users = usersWhoWatched(movieId)
    userCount = float(len(users))

    notUsers = usersWhoDidntWatch(movieId)
    notUserCount = float(len(notUsers))

    # Count rating rows per movie once for each group, instead of
    # re-filtering both frames for every title.
    countsAmongUsers = ratingsByUsers(users).movieId.value_counts()
    countsAmongNotUsers = ratingsByUsers(notUsers).movieId.value_counts()

    def recHelper(id):
        countXY = float(countsAmongUsers.get(id, 0))
        # The small constant keeps the denominator non-zero for titles that
        # no non-rater of `movieId` has rated.
        countNotXY = float(countsAmongNotUsers.get(id, 0)) + 0.001
        return (countXY / userCount) / (countNotXY / notUserCount)

    # Build the frame from a dict so the column is named 'rating' regardless
    # of the name carried by the mapped Series.
    data = pd.DataFrame({'rating': titles.movieId.map(recHelper)})
    data['id'] = data.index.values.astype('uint32')
    data['title'] = titles.title
    # Exclude the queried movie by id rather than by dropping the top-ranked
    # row: a tie at the top, or a movie missing from `titles`, would
    # otherwise remove a genuine recommendation.
    data = data[data.id != movieId]
    return data.sort_values('rating', ascending=False)

These functions now create the output for the assignment.

def pa1Solve(id):
    """Return the top five simple recommendations for movie `id` as a flat list.

    The list alternates movie id and score rounded to two decimals:
    [id1, score1, id2, score2, ...].
    """
    top = recommendationsForMovie(id).head(5)
    flat = []
    for movie, score in zip(top.id, top.rating.values):
        flat.append(movie)
        flat.append(round(score, 2))
    return flat
def pa1Solver(ids, f):
    """Apply `f` to each id in `ids` and tabulate the results.

    Each call `f(id)` supplies one row; the returned DataFrame has one row
    per id, in input order, and uses the ids as its index.
    """
    _ids = list(ids)
    rows = [f(movie) for movie in _ids]
    table = pd.DataFrame(rows)
    table.index = _ids
    return table
def pa1SolveAll(ids):
    """Tabulate the simple top-five recommendations for every movie id in `ids`."""
    return pa1Solver(ids, f=pa1Solve)
def pa1_b_Solve(id):
    """Return the top five advanced recommendations for movie `id` as a flat list.

    The list alternates movie id and score rounded to two decimals:
    [id1, score1, id2, score2, ...].
    """
    top = recommendationsAdvancedForMovie(id).head(5)
    rounded = [round(score, 2) for score in top.rating.values]
    answer = []
    for pair in zip(top.id, rounded):
        answer.extend(pair)
    return answer
def pa1_b_SolveAll(ids):
    """Tabulate the advanced top-five recommendations for every movie id in `ids`."""
    return pa1Solver(ids, f=pa1_b_Solve)

And victory, let’s create the solutions and save them to CSV.

pa1Solution  = pa1SolveAll(pa1_inputs)
pa1Solution
0 1 2 3 4 5 6 7 8 9
122 120 0.95 121 0.95 603 0.94 597 0.89 604 0.88
558 603 0.93 557 0.93 597 0.91 607 0.89 604 0.88
788 603 0.94 329 0.91 607 0.91 13 0.91 597 0.90
pa1Solution.to_csv('pa1/pa1Solution1.csv', header=None)
!cat pa1/pa1Solution1.csv
122,120,0.95,121,0.95,603,0.94,597,0.89,604,0.88
558,603,0.93,557,0.93,597,0.91,607,0.89,604,0.88
788,603,0.94,329,0.91,607,0.91,13,0.91,597,0.9
pa1_b_SolveAll(pa1_inputs).to_csv('pa1/pa1_b_Solution.csv', header=None)
!cat pa1/pa1_b_Solution.csv
122,121,4.86,243,3.69,120,3.69,2164,3.56,1894,3.17
558,36658,3.85,414,2.99,786,2.58,557,2.56,9331,2.45
788,9331,4.85,243,4.68,786,3.85,134,3.63,1900,3.44
import sys
pa1SolveAll([11,121,8587]).to_csv(sys.stdout, header=None)
11,603,0.96,1892,0.94,1891,0.94,120,0.93,1894,0.93
121,120,0.95,122,0.95,603,0.94,597,0.89,604,0.88
8587,603,0.92,597,0.9,607,0.87,120,0.86,13,0.86