Fix the deprecated BaseEstimator import (scikits.learn → sklearn) and the renamed _set_params method, and implement a hybrid method combining Collaborative Filtering with Global Baseline estimates to deal with cold start #104

Open · wants to merge 3 commits into master
6 changes: 5 additions & 1 deletion README.md
@@ -5,6 +5,10 @@
to provide a rich set of components from which you can construct a customized recommender system from a
set of algorithms.

In this fork, I (gabrielspmoreira) have made the following changes:
* Fixed compatibility issues with the latest scikit-learn version (the deprecated `scikits.learn.base` import of `BaseEstimator` and the renamed `_set_params` method)
* Implemented a hybrid method that combines Collaborative Filtering with Global Baseline estimates (based on user and item average preferences), inspired by the [Implementing Collaborative Filtering lecture](https://class.coursera.org/mmds-001/lecture/95) from the [Coursera Mining Massive Datasets course](https://www.coursera.org/course/mmds); a usage sketch follows below.
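
A minimal usage sketch of the new hybrid recommender (the ratings below are made up, and the import path assumes the class stays in `scikits.crab.recommenders.knn.classes`, where this fork defines it):

```python
from scikits.crab.models import MatrixPreferenceDataModel
from scikits.crab.metrics import pearson_correlation
from scikits.crab.similarities import UserSimilarity
from scikits.crab.recommenders.knn.classes import \
    UserBasedRecommenderCombinedWithGlobalBaseline

# Toy ratings: {user_id: {item_id: preference}}
ratings = {
    'alice': {'item1': 4.0, 'item2': 3.0, 'item3': 3.5},
    'bob':   {'item1': 5.0, 'item2': 2.5, 'item4': 2.0},
    'carol': {'item2': 3.5, 'item3': 4.5, 'item4': 4.0},
}

model = MatrixPreferenceDataModel(ratings)
similarity = UserSimilarity(model, pearson_correlation)
recommender = UserBasedRecommenderCombinedWithGlobalBaseline(
    model, similarity, with_preference=True)

# 'alice' has never rated 'item4'; the hybrid estimate falls back on
# baseline information when neighbor data is thin.
print recommender.recommend('alice')
```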

## Usage

For usage and instructions, check out the [Crab Wiki](https://github.com/muricoca/crab/wiki)
@@ -14,7 +18,7 @@
The project was started in 2010 by Marcel Caraciolo as an M.Sc.-related project, and since then many interested people have joined to help with the project.
It is currently maintained by a team of volunteers, members of the Muriçoca Labs.

- ## Authors
+ ## Original Authors

Marcel Caraciolo (marcel@muricoca.com)

2 changes: 1 addition & 1 deletion scikits/crab/base.py
@@ -8,7 +8,7 @@
# Bruno Melo <bruno@muricoca.com>
# License: BSD Style.

- from scikits.learn.base import BaseEstimator
+ from sklearn.base import BaseEstimator


class BaseRecommender(BaseEstimator):
2 changes: 1 addition & 1 deletion scikits/crab/metrics/classes.py
@@ -21,7 +21,7 @@
from metrics import recall_score
from metrics import f1_score
from sampling import SplitSampling
- from scikits.learn.base import clone
+ from sklearn.base import clone
from ..models.utils import ItemNotFoundError, UserNotFoundError


133 changes: 131 additions & 2 deletions scikits/crab/recommenders/knn/classes.py
@@ -135,7 +135,7 @@ def recommend(self, user_id, how_many=None, **params):
Desired number of recommendations (default=None ALL)

'''
-        self._set_params(**params)
+        self.set_params(**params)

candidate_items = self.all_other_items(user_id)

@@ -597,7 +597,7 @@ def recommend(self, user_id, how_many=None, **params):

'''

-        self._set_params(**params)
+        self.set_params(**params)

candidate_items = self.all_other_items(user_id, **params)

@@ -706,3 +706,132 @@ def recommended_because(self, user_id, item_id, how_many=None, **params):
for ind in sorted_preferences]

return top_n_recs


#=====================
# User-based recommender combined with Global Baseline estimates.
# Based on the Collaborative Filtering lecture from the Coursera Mining
# Massive Datasets course, available at:
# https://class.coursera.org/mmds-001/lecture/95
class UserBasedRecommenderCombinedWithGlobalBaseline(UserBasedRecommender):
    # Cached global mean over all known preferences (computed lazily).
    global_preferences_mean = None

    def _get_global_preferences_mean(self):
        if self.global_preferences_mean is None:
            # self.model.index holds the raw preference matrix with NaN for
            # missing ratings, so nanmean gives the global rating average.
            self.global_preferences_mean = np.nanmean(self.model.index)
        return self.global_preferences_mean


    def get_global_baseline_estimate(self, user_id, item_id):
        '''
        Parameters
        ----------
        user_id: int or string
            User for whom the estimate is to be computed.

        item_id: int or string
            ID of the item for which to find the estimated preference.

        Returns
        -------
        Returns an estimated preference based on the Global Baseline
        strategy, where the user's average preference and the item's
        average preference are combined with the global preference
        average to produce an estimate for the user-item preference.
        '''
#print "GB for user",user_id," / item",item_id
global_mean = self._get_global_preferences_mean()
#print "global_mean:",global_mean
user_preferences = self.model.preferences_from_user(user_id, order_by_id=False)
#print "user_preferences:",user_preferences
item_preferences = self.model.preferences_for_item(item_id, order_by_id=False)
#print "item_preferences:",item_preferences
user_preferences_mean = sum(map(lambda i: i[1], user_preferences)) / float(len(user_preferences))
#print "user_preferences_mean:",user_preferences_mean
item_preferences_mean = sum(map(lambda i: i[1], item_preferences)) / float(len(item_preferences))
#print "item_preferences_mean:",item_preferences_mean

baseline_estimate = global_mean + (user_preferences_mean - global_mean) + (item_preferences_mean - global_mean)
#print "baseline_estimate",baseline_estimate

return baseline_estimate
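
    # A quick worked example of the baseline formula above (hypothetical
    # numbers): with a global mean of 3.5, a user whose ratings average 4.0
    # (deviation +0.5) and an item whose ratings average 3.0 (deviation
    # -0.5), the estimate is 3.5 + 0.5 - 0.5 = 3.5.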


    def estimate_preference(self, user_id, item_id, **params):
        '''
        Parameters
        ----------
        user_id: int or string
            User for whom the estimate is to be computed.

        item_id: int or string
            ID of the item for which to find the estimated preference.

        Returns
        -------
        Returns an estimated preference that combines Collaborative
        Filtering with the Global Baseline estimate for the given user
        and item. This hybrid method copes better with the user and item
        cold-start problems.
        '''
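
        # Sketch of the combination computed below, following the Coursera
        # MMDS lecture referenced above; b(u, i) is the global baseline
        # estimate and v ranges over the nearest neighbors of user u:
        #
        #   r_hat(u, i) = b(u, i) + sum_v[s(u, v) * (r(v, i) - b(v, i))]
        #                           / sum_v[s(u, v)]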

preference = self.model.preference_value(user_id, item_id)
if not np.isnan(preference):
return preference

n_similarity = params.pop('n_similarity', 'user_similarity')
distance = params.pop('distance', self.similarity.distance)
nhood_size = params.pop('nhood_size', None)

nearest_neighbors = self.neighborhood_strategy.user_neighborhood(user_id,
self.model, n_similarity, distance, nhood_size, **params)

similarities = np.array([self.similarity.get_similarity(user_id, to_user_id)
for to_user_id in nearest_neighbors]).flatten()

prefs = np.array([self.model.preference_value(to_user_id, item_id)
for to_user_id in nearest_neighbors])

prefs_baseline = np.array([self.get_global_baseline_estimate(to_user_id, item_id)
for to_user_id in nearest_neighbors])

        # Keep only neighbors with both a known preference for the item and
        # a defined similarity, applying the same mask to all three arrays
        # so they stay aligned.
        mask = ~np.isnan(prefs) & ~np.isnan(similarities)
        prefs = prefs[mask]
        prefs_baseline = prefs_baseline[mask]
        similarities = similarities[mask]

        # Similarity-weighted sum of the neighbors' deviations from their
        # own baseline estimates.
        prefs_sim_gb = np.sum((prefs - prefs_baseline) * similarities)

total_similarity = np.sum(similarities)

        # Throw out the estimate if it was based on no data points: with no
        # neighbor contributing both a rating for the item and a defined
        # similarity, the weighted deviation above is meaningless.
        if total_similarity == 0.0 or not similarities.size:
            return np.nan

        # Start from this user's own baseline estimate and add the
        # similarity-weighted average of the neighbors' deviations from
        # their baselines.
        baseline = self.get_global_baseline_estimate(user_id, item_id)
        estimated_with_gb = baseline + (prefs_sim_gb / total_similarity)

        if self.capper:
            # Clamp the estimate to the model's preference range.
            max_p = self.model.maximum_preference_value()
            min_p = self.model.minimum_preference_value()
            estimated_with_gb = min(max(estimated_with_gb, min_p), max_p)

return estimated_with_gb
2 changes: 1 addition & 1 deletion scikits/crab/recommenders/svd/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def recommend(self, user_id, how_many=None, **params):
Desired number of recommendations (default=None ALL)

'''
-        self._set_params(**params)
+        self.set_params(**params)

candidate_items = self.all_other_items(user_id)
