find_model_collaborative.py
#!/usr/bin/env python
"""
Copyright Google Inc. 2016
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import print_function
import sys
import itertools
from math import sqrt
from operator import add
from os.path import join, isfile, dirname
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
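# Example invocation (hypothetical values; the instance IP, database name, user and
# password below are placeholders). The job can be submitted with spark-submit or
# `gcloud dataproc jobs submit pyspark`, and the MySQL JDBC driver must be available
# on the classpath (for example via --jars) for the JDBC reads further down:
#   spark-submit --jars /path/to/mysql-connector-java.jar \
#       find_model_collaborative.py 10.0.0.1 recommendation_spark sparkuser secret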
CLOUDSQL_INSTANCE_IP = sys.argv[1]
CLOUDSQL_DB_NAME = sys.argv[2]
CLOUDSQL_USER = sys.argv[3]
CLOUDSQL_PWD = sys.argv[4]
conf = SparkConf().setAppName("app_collaborative")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
jdbcUrl = 'jdbc:mysql://%s:3306/%s?user=%s&password=%s' % (CLOUDSQL_INSTANCE_IP, CLOUDSQL_DB_NAME, CLOUDSQL_USER, CLOUDSQL_PWD)
#[START how_far]
def howFarAreWe(model, against, sizeAgainst):
    # Ignore the rating column
    againstNoRatings = against.map(lambda x: (int(x[0]), int(x[1])))
    # Keep the rating to compare against
    againstWiRatings = against.map(lambda x: ((int(x[0]), int(x[1])), int(x[2])))
    # Make a prediction and key it for the later join
    # The key has to be (user, product), not (product, user)
    predictions = model.predictAll(againstNoRatings).map(lambda p: ((p[0], p[1]), p[2]))
    # Join into (prediction, rating) pairs
    predictionsAndRatings = predictions.join(againstWiRatings).values()
    # Return the root mean squared error (RMSE) over the comparison set
    return sqrt(predictionsAndRatings.map(lambda s: (s[0] - s[1]) ** 2).reduce(add) / float(sizeAgainst))
#[END how_far]
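# For illustration: if the joined (prediction, rating) pairs were (3.0, 4.0) and
# (5.0, 5.0), the value returned above would be
# sqrt(((3.0 - 4.0)**2 + (5.0 - 5.0)**2) / 2) = sqrt(0.5) ~= 0.71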
# Read the ratings data from Cloud SQL into a DataFrame
dfRates = sqlContext.read.jdbc(url=jdbcUrl, table='Rating')
rddUserRatings = dfRates.filter(dfRates.userId == 0).rdd
print(rddUserRatings.count())
# Split the data in 3 different sets : training, validating, testing
# 60% 20% 20%
rddRates = dfRates.rdd
rddTraining, rddValidating, rddTesting = rddRates.randomSplit([6,2,2])
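# randomSplit normalizes the weights, so [6, 2, 2] is equivalent to [0.6, 0.2, 0.2]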
# Add the current user's ratings to the training set
# (union returns a new RDD, so the result must be reassigned)
rddTraining = rddTraining.union(rddUserRatings)
nbValidating = rddValidating.count()
nbTesting = rddTesting.count()
print("Training: %d, validation: %d, test: %d" % (rddTraining.count(), nbValidating, rddTesting.count()))
# Hyperparameter grid to search: rank, regularization and iteration count
ranks = [5, 10, 15, 20]
reguls = [0.1, 1, 10]
iters = [5, 10, 20]
finalModel = None
finalRank = 0
finalRegul = float(0)
finalIter = -1
finalDist = float(100)
#[START train_model]
for cRank, cRegul, cIter in itertools.product(ranks, reguls, iters):
    # Train on the training set with the current rank, iteration count and regularization
    model = ALS.train(rddTraining, cRank, cIter, float(cRegul))
    # Score the model on the validation set
    dist = howFarAreWe(model, rddValidating, nbValidating)
    if dist < finalDist:
        print("Best so far:%f" % dist)
        finalModel = model
        finalRank = cRank
        finalRegul = cRegul
        finalIter = cIter
        finalDist = dist
#[END train_model]
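# Optional follow-up (not part of the original flow): the winning model could be
# persisted for later scoring with finalModel.save(sc, "gs://<bucket>/model") and
# reloaded via MatrixFactorizationModel.load(sc, "gs://<bucket>/model"); the bucket
# path here is a placeholder.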
print("Rank %i" % finalRank)
print("Regul %f" % finalRegul)
print("Iter %i" % finalIter)
print("Dist %f" % finalDist)