-
Notifications
You must be signed in to change notification settings - Fork 1.6k
/
skeleton.py
53 lines (40 loc) · 1.59 KB
/
skeleton.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import numpy as np
from mrjob.job import MRJob
from itertools import combinations, permutations
from scipy.stats.stats import pearsonr
class RestaurantSimilarities(MRJob):
    """Two-step map-reduce job that scores pairs of restaurants.

    Every input line must hold five comma-separated fields:
    ``user_id,business_id,stars,business_avg,user_avg``.

    Step 1 groups the ratings by user.  Step 2 emits, for every user, each
    pair of businesses that user rated, and then reduces every pair to the
    Pearson correlation of the user-mean-centred star ratings together with
    the number of users the two businesses have in common.
    """

    def steps(self):
        """Return the map-reduce steps in execution order."""
        # NOTE(review): ``self.mr`` is the legacy step constructor; newer
        # mrjob releases are believed to expect ``mrjob.step.MRStep``
        # instead -- confirm against the installed mrjob version.
        return [
            self.mr(mapper=self.line_mapper,
                    reducer=self.users_items_collector),
            self.mr(mapper=self.pair_items_mapper,
                    reducer=self.calc_sim_collector),
        ]

    def line_mapper(self, _, line):
        """Split one CSV line and key its rating information by user.

        Yields ``user_id, (business_id, stars, business_avg, user_avg)``;
        all fields are still strings at this point.
        """
        user_id, business_id, stars, business_avg, user_avg = line.split(',')
        yield user_id, (business_id, stars, business_avg, user_avg)

    def users_items_collector(self, user_id, values):
        """Collect every rating made by one user into a single list.

        ``values`` iterates over the 4-field records yielded by
        ``line_mapper``.  Only the fields needed downstream are kept.

        Yields ``user_id, [(business_id, (stars, user_avg)), ...]``.
        """
        ratings = [
            (business_id, (stars, user_avg))
            for business_id, stars, _business_avg, user_avg in values
        ]
        yield user_id, ratings

    def pair_items_mapper(self, user_id, values):
        """Emit every pair of businesses rated by the same user.

        The ``user_id`` key is ignored.  ``values`` is the rating list
        produced by ``users_items_collector``.

        Yields ``(biz_a, biz_b), (rating_a, rating_b)`` where each rating is
        the ``(stars, user_avg)`` record of the matching business.
        """
        for (biz1, rating1), (biz2, rating2) in combinations(values, 2):
            # Order the ids so (A, B) and (B, A) land on the same reducer
            # key; the ratings are swapped along with the ids.
            if biz1 <= biz2:
                yield (biz1, biz2), (rating1, rating2)
            else:
                yield (biz2, biz1), (rating2, rating1)

    def calc_sim_collector(self, key, values):
        """Reduce one business pair to its similarity.

        ``key`` is the ``(rest1, rest2)`` pair and ``values`` iterates over
        the ``((stars1, user_avg1), (stars2, user_avg2))`` records emitted
        by ``pair_items_mapper``, one per common user.

        Yields ``(rest1, rest2), (rho, n_common)`` where ``rho`` is the
        Pearson correlation of the centred ratings (``stars - user_avg``)
        and ``n_common`` is the number of common users.  ``rho`` is 0.0
        whenever the correlation is undefined.
        """
        rest1, rest2 = key
        diffs1 = []
        diffs2 = []
        for (stars1, user_avg1), (stars2, user_avg2) in values:
            # The fields arrive as strings from the CSV split in step 1.
            diffs1.append(float(stars1) - float(user_avg1))
            diffs2.append(float(stars2) - float(user_avg2))
        n_common = len(diffs1)

        rho = 0.0
        # Pearson correlation needs at least two points to be defined.
        if n_common >= 2:
            rho = float(pearsonr(diffs1, diffs2)[0])
            # Constant input has zero variance, which produces NaN.
            if np.isnan(rho):
                rho = 0.0
        yield (rest1, rest2), (rho, n_common)
# Script entry point: the job is launched only when this file is executed
# directly.  This guard must stay in place for the job to run.
if __name__ == '__main__':
    RestaurantSimilarities.run()