Commit e2209e6: Merge branch 'release/v0.5.2'

evfro committed Mar 11, 2018
2 parents 07b5b89 + d624ac7
Showing 20 changed files with 357 additions and 368 deletions.
240 changes: 120 additions & 120 deletions examples/Example_ML1M.ipynb

Large diffs are not rendered by default.

13 changes: 8 additions & 5 deletions polara/datasets/bookcrossing.py
@@ -1,5 +1,5 @@
+from io import BytesIO
 import pandas as pd
-from StringIO import StringIO

 try:
     from pandas.io.common import ZipFile
@@ -13,7 +13,7 @@ def get_bx_data(local_file=None, get_ratings=True, get_users=False, get_books=Fa
         from requests import get
         zip_file_url = 'http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip'
         zip_response = get(zip_file_url)
-        zip_contents = StringIO(zip_response.content)
+        zip_contents = BytesIO(zip_response.content)
     else:
         zip_contents = local_file

@@ -26,21 +26,24 @@ def get_bx_data(local_file=None, get_ratings=True, get_users=False, get_books=Fa
     delimiter = ';'
     if get_ratings:
         zdata = zfile.read(zip_file)
-        ratings = pd.read_csv(StringIO(zdata), sep=delimiter, header=0, engine='c')
+        ratings = pd.read_csv(BytesIO(zdata), sep=delimiter, header=0,
+                              engine='c', encoding='unicode_escape')

     if get_users:
         zip_file = zip_files[zip_files.str.contains('users', flags=2)].iat[0]
         with zfile.open(zip_file) as zdata:
-            users = pd.read_csv(zdata, sep=delimiter, header=0, engine='c',)
+            users = pd.read_csv(zdata, sep=delimiter, header=0, engine='c',
+                                encoding='unicode_escape')

     if get_books:
         zip_file = zip_files[zip_files.str.contains('books', flags=2)].iat[0]
         with zfile.open(zip_file) as zdata:
             books = pd.read_csv(zdata, sep=delimiter, header=0, engine='c',
-                                quoting=1, escapechar='\\',
+                                quoting=1, escapechar='\\', encoding='unicode_escape',
                                 usecols=['ISBN', 'Book-Author', 'Publisher'])

     res = [data.rename(columns=lambda x: x.lower().replace('book-', '')
                        .replace('-id', 'id'), copy=False)
            for data in [ratings, users, books] if data is not None]
     if len(res)==1: res = res[0]
     return res
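
Note: the StringIO-to-BytesIO swap above is the core Python 3 fix in this file. On Python 3, both requests' response.content and ZipFile.read() return bytes, which StringIO no longer accepts. A minimal sketch of the pattern, with made-up in-memory data (not part of this commit):

    from io import BytesIO
    from zipfile import ZipFile

    # build an in-memory zip as a stand-in for zip_response.content (bytes)
    buf = BytesIO()
    with ZipFile(buf, 'w') as zf:
        zf.writestr('BX-Book-Ratings.csv', 'User-ID;ISBN;Book-Rating\n1;0001;5\n')
    raw = buf.getvalue()

    # StringIO(raw) would raise TypeError on Python 3; BytesIO wraps bytes cleanly
    with ZipFile(BytesIO(raw)) as zfile:
        print(zfile.namelist())  # ['BX-Book-Ratings.csv']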
24 changes: 16 additions & 8 deletions polara/datasets/movielens.py
@@ -6,8 +6,9 @@
 except ImportError:
     from zipfile import ZipFile

+
 def get_movielens_data(local_file=None, get_ratings=True, get_genres=False,
-                        split_genres=True, mdb_mapping=False):
+                       split_genres=True, mdb_mapping=False):
     '''Downloads movielens data and stores it in pandas dataframe.
     '''
     if not local_file:
@@ -25,21 +26,28 @@ def get_movielens_data(local_file=None, get_ratings=True, get_genres=False,
         zip_files = pd.Series(zfile.namelist())
         zip_file = zip_files[zip_files.str.contains('ratings')].iat[0]
         is_latest = 'latest' in zip_file
-        header = 0 if is_latest else None
+        is_20m = '20m' in zip_file
+        delimiter = ','
+        header = 0 if (is_latest or is_20m) else None
         if get_ratings:
             zdata = zfile.read(zip_file)
-            delimiter = ','
-            zdata = zdata.replace(b'::', delimiter.encode()) # makes data compatible with pandas c-engine
+            zdata = zdata.replace(b'::', delimiter.encode())
+            # makes data compatible with pandas c-engine
+            # returns string objects instead of bytes in that case
             ml_data = pd.read_csv(BytesIO(zdata), sep=delimiter, header=header, engine='c',
                                   names=['userid', 'movieid', 'rating', 'timestamp'],
                                   usecols=['userid', 'movieid', 'rating'])

         if get_genres:
             zip_file = zip_files[zip_files.str.contains('movies')].iat[0]
-            with zfile.open(zip_file) as zdata:
-                delimiter = ',' if is_latest else '::'
-                genres_data = pd.read_csv(zdata, sep=delimiter, header=header, engine='python',
-                                          names=['movieid', 'movienm', 'genres'])
+            zdata = zfile.read(zip_file)
+            if not is_latest:
+                # make data compatible with pandas c-engine
+                # pandas returns string objects instead of bytes in that case
+                zdata = zdata.replace(b'::', delimiter.encode())
+            genres_data = pd.read_csv(BytesIO(zdata), sep=delimiter, header=header,
+                                      engine='c', encoding='unicode_escape',
+                                      names=['movieid', 'movienm', 'genres'])

         ml_genres = get_split_genres(genres_data) if split_genres else genres_data

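Note: the rewritten loader standardizes on the fast pandas C engine. The older '::'-separated MovieLens dumps are rewritten at the byte level before parsing, which is what the zdata.replace(b'::', delimiter.encode()) lines above do. A small self-contained sketch; the sample bytes are made up, not taken from the dataset:

    from io import BytesIO
    import pandas as pd

    zdata = b'1::1193::5::978300760\n1::661::3::978302109\n'  # ML-1M style rows
    zdata = zdata.replace(b'::', b',')  # make the data digestible by the C engine
    ml_data = pd.read_csv(BytesIO(zdata), header=None, engine='c',
                          names=['userid', 'movieid', 'rating', 'timestamp'],
                          usecols=['userid', 'movieid', 'rating'])
    print(ml_data)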
8 changes: 4 additions & 4 deletions polara/datasets/netflix.py
@@ -17,14 +17,14 @@ def get_netflix_data(gz_file):
         movie_data.append(df[movieid])

     data = pd.concat(movie_data, keys=movie_name)
-    data = data.reset_index().iloc[:, :3].rename(columns={'level_0':'movieid',
-                                                          'level_1':'userid',
-                                                          'level_2':'rating'})
+    data = data.reset_index().iloc[:, :3].rename(columns={'level_0': 'movieid',
+                                                          'level_1': 'userid',
+                                                          'level_2': 'rating'})
     return data


 def filter_by_length(data, session_length=20):
     sz = data.groupby('userid', sort=False).size()
     valid_users = sz.index[(sz > session_length)]
-    new_data = data[data.userid.isin(valid_users)]
+    new_data = data[data.userid.isin(valid_users)]
     return new_data
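
Note: filter_by_length keeps only users with more than session_length recorded interactions. A quick usage sketch on a toy frame (the import path matches this file, but the data is invented):

    import pandas as pd
    from polara.datasets.netflix import filter_by_length

    data = pd.DataFrame({'userid': [1]*25 + [2]*5,
                         'movieid': list(range(30)),
                         'rating': [5]*30})
    filtered = filter_by_length(data, session_length=20)
    print(filtered.userid.unique())  # [1] -- user 2 has only 5 interactions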
4 changes: 2 additions & 2 deletions polara/evaluation/evaluation_engine.py
@@ -37,7 +37,7 @@ def evaluate_models(models, metrics, topk=None):
     for metric in metrics:
         model_scores = []
         for model in models:
-            #print('model {}'.format(model.method))
+            # print('model {}'.format(model.method))
             scores = model.evaluate(method=metric, topk=topk)
             model_scores.append(scores)
         metric_scores.append(pd.DataFrame(model_scores, index=[model.method for model in models]).T)
@@ -62,7 +62,7 @@ def consolidate(scores, params, metrics):
     return res


-def consolidate_folds(scores, folds, metrics, index_names = ['fold', 'top-n']):
+def consolidate_folds(scores, folds, metrics, index_names=['fold', 'top-n']):
     res = {}
     for metric in metrics:
         data = pd.concat([scores[j][metric] for j in folds], keys=folds)
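Note: consolidate_folds stitches per-fold scores into one frame via pd.concat with keys, which is what produces the ('fold', 'top-n') MultiIndex named by index_names. (The mutable list default is a classic Python gotcha, but it is harmless as long as the function never mutates it.) A minimal sketch with made-up scores, not from any real run:

    import pandas as pd

    folds = [1, 2]
    scores = {1: {'precision': pd.DataFrame({'SVD': [0.10, 0.08]}, index=[5, 10])},
              2: {'precision': pd.DataFrame({'SVD': [0.12, 0.09]}, index=[5, 10])}}
    data = pd.concat([scores[j]['precision'] for j in folds], keys=folds)
    data.index.names = ['fold', 'top-n']
    print(data)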
6 changes: 3 additions & 3 deletions polara/evaluation/plotting.py
@@ -1,7 +1,7 @@
 import matplotlib.pyplot as plt


-def _plot_pair(scores, keys, titles=None, errors=None, err_alpha = 0.2, figsize=(16, 5), ax=None):
+def _plot_pair(scores, keys, titles=None, errors=None, err_alpha=0.2, figsize=(16, 5), ax=None):
     if not ax:
         fig, ax = plt.subplots(1, 2, figsize=figsize)
         show_legend = True
@@ -15,7 +15,7 @@ def _plot_pair(scores, keys, titles=None, errors=None, err_alpha = 0.2, figsize=
     scores[right].plot(ax=ax[1], legend=False)

     if show_legend:
-        plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
+        plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

     if errors is not None:
         errG = errors[left]
@@ -54,7 +54,7 @@ def show_ranking(all_scores, **kwargs):
     _plot_pair(scores, keys, **kwargs)


-def _cross_plot(scores, keys, titles=None, errors=None, err_alpha = 0.2, ROC_middle=False, figsize=(8, 5), limit=None, ax=None):
+def _cross_plot(scores, keys, titles=None, errors=None, err_alpha=0.2, ROC_middle=False, figsize=(8, 5), limit=None, ax=None):
     if not ax:
         fig = plt.figure(figsize=figsize)
         ax = fig.gca()
8 changes: 5 additions & 3 deletions polara/lib/hosvd.py
@@ -8,6 +8,7 @@
 import numpy as np
 from numba import jit

+
 @jit(nopython=True, nogil=True)
 def double_tensordot(idx, val, U, V, new_shape1, new_shape2, ten_mode0, ten_mode1, ten_mode2, res):
     I = idx.shape[0]
@@ -26,23 +27,24 @@ def tensordot2(idx, val, shape, U, V, modes):
     ten_mode1, mat_mode1 = modes[0]
     ten_mode2, mat_mode2 = modes[1]

-    ten_mode0, = [x for x in (0,1,2) if x not in (ten_mode1, ten_mode2)]
+    ten_mode0, = [x for x in (0, 1, 2) if x not in (ten_mode1, ten_mode2)]
     new_shape = (shape[ten_mode0], U.shape[1-mat_mode1], V.shape[1-mat_mode2])
     res = np.zeros(new_shape)

-    if mat_mode1==1:
+    if mat_mode1 == 1:
         vU = U.T
     else:
         vU = U

-    if mat_mode2==1:
+    if mat_mode2 == 1:
         vV = V.T
     else:
         vV = V

     double_tensordot(idx, val, vU, vV, new_shape[1], new_shape[2], ten_mode0, ten_mode1, ten_mode2, res)
     return res


+
 def tucker_als(idx, val, shape, core_shape, iters=25, growth_tol=0.01, batch_run=False):
     '''
     The function computes Tucker ALS decomposition of sparse tensor
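Note: tensordot2 contracts a sparse third-order tensor, stored as COO-style (idx, val) pairs, with two factor matrices along the modes given in modes; the mat_mode checks above only decide whether each factor needs transposing first. A dense numpy cross-check of that contraction on a hypothetical toy tensor (not part of this commit):

    import numpy as np

    idx = np.array([[0, 1, 2], [1, 0, 3]])  # nonzero coordinates (mode-0, mode-1, mode-2)
    val = np.array([3.5, -1.0])             # their values
    shape = (2, 2, 4)
    U = np.random.rand(2, 2)                # factor contracted over mode 1
    V = np.random.rand(4, 3)                # factor contracted over mode 2

    # accumulate res[i0, r, s] += v * U[i1, r] * V[i2, s], as double_tensordot does
    res = np.zeros((shape[0], U.shape[1], V.shape[1]))
    for (i0, i1, i2), v in zip(idx, val):
        res[i0] += v * np.outer(U[i1], V[i2])

    dense = np.zeros(shape)
    dense[tuple(idx.T)] = val
    assert np.allclose(res, np.einsum('abc,br,cs->ars', dense, U, V))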
40 changes: 22 additions & 18 deletions polara/lib/similarity.py
@@ -1,18 +1,22 @@
 # python 2/3 interoperability
 from __future__ import division

 try:
     range = xrange
 except NameError:
     pass

+try:
+    long
+except NameError:
+    long = int

 import math
 import types
 from collections import defaultdict, OrderedDict
 import numpy as np
 import pandas as pd
 from numba import jit
 import scipy as sp
 import scipy.sparse
 from scipy.sparse import csc_matrix, csr_matrix, coo_matrix, SparseEfficiencyWarning
 import warnings

@@ -36,7 +40,7 @@ def _fix_empty_features(feature_mat):


 def set_diagonal_values(mat, val=1):
-    #disable warning when setting diagonal elements of sparse similarity matrix
+    # disable warning when setting diagonal elements of sparse similarity matrix
     with warnings.catch_warnings():
         warnings.simplefilter('ignore', category=SparseEfficiencyWarning)
         mat.setdiag(val)
@@ -53,7 +57,7 @@ def normalize_binary_features(feature_mat):
     if feature_mat.format == 'csc':
         ind = feature_mat.indices.copy()
         ptr = feature_mat.indptr.copy()
-        norm_data = 1 / np.sqrt(np.take(sqsum, ind))
+        norm_data = 1 / np.sqrt(np.take(sqsum, ind))
         normed = csc_matrix((norm_data, ind, ptr), shape=feature_mat.shape)
     else:
         norm_data = safe_inverse_root(sqsum.astype(np.float64))
@@ -144,10 +148,10 @@ def cosine_tfidf_similarity(F, fill_diagonal=True):
 @jit(nopython=True)
 def _jaccard_similarity_weighted_tri(dat, ind, ptr, shift):
     z = dat[0] #np.float64
-    #need to initialize lists with certain dtype
-    data = [z,]
-    cols = [z,]
-    rows = [z,]
+    # need to initialize lists with certain dtype
+    data = [z]
+    cols = [z]
+    rows = [z]

     nrows = len(ptr) - 1
     for i in range(nrows):
@@ -183,7 +187,7 @@ def _jaccard_similarity_weighted_tri(dat, ind, ptr, shift):
                     max_sum += dat[s]

             if min_sum:
-                wjac = min_sum/max_sum
+                wjac = min_sum / max_sum
                 data.append(wjac)
                 cols.append(i)
                 rows.append(j)
@@ -203,7 +207,7 @@ def jaccard_similarity_weighted(F, fill_diagonal=True):
     shift = 1 if fill_diagonal else 0
     data, rows, cols = _jaccard_similarity_weighted_tri(dat, ind, ptr, shift)

-    S = sp.sparse.coo_matrix((data, (rows, cols)), shape=(F.shape[0],)*2).tocsc()
+    S = coo_matrix((data, (rows, cols)), shape=(F.shape[0],)*2).tocsc()
     S += S.T # doubles diagonal values if fill_diagonal is False

     if fill_diagonal:
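
Note: the min_sum/max_sum ratio built in the loop above is the weighted Jaccard similarity, J(x, y) = sum_k min(x_k, y_k) / sum_k max(x_k, y_k), computed over the lower triangle and then symmetrized by S += S.T. A two-vector sketch with toy values (illustration only):

    import numpy as np

    def weighted_jaccard(x, y):
        # elementwise min over elementwise max, summed
        return np.minimum(x, y).sum() / np.maximum(x, y).sum()

    x = np.array([1., 2., 0.])
    y = np.array([1., 1., 3.])
    print(weighted_jaccard(x, y))  # (1 + 1 + 0) / (1 + 2 + 3) = 2/6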
@@ -292,10 +296,10 @@ def get_features_data(meta_data, ranking=None, deduplicate=True):
         ranking = 'linear'

     if isinstance(ranking, str):
-        ranking = [ranking,] * len(features)
+        ranking = [ranking] * len(features)

     if not isinstance(ranking, dict):
-        ranking = {k:v for k, v in zip(features, ranking)}
+        ranking = {k: v for k, v in zip(features, ranking)}

     for feature in features:
         feature_data = meta_data[feature]
@@ -325,10 +329,10 @@ def get_similarity_data(meta_data, similarity_type='jaccard'):
     features = meta_data.columns

     if isinstance(similarity_type, str):
-        similarity_type = [similarity_type,] * len(features)
+        similarity_type = [similarity_type] * len(features)

     if not isinstance(similarity_type, dict):
-        similarity_type = {k:v for k, v in zip(features, similarity_type)}
+        similarity_type = {k: v for k, v in zip(features, similarity_type)}

     similarity_mats = {}
     for feature in features:
@@ -356,16 +360,16 @@ def combine_similarity_data(meta_data, similarity_type='jaccard', weights=None):
     num_feat = len(features)

     if isinstance(similarity_type, str):
-        similarity_type = [similarity_type,] * num_feat
+        similarity_type = [similarity_type] * num_feat

     if not isinstance(similarity_type, dict):
-        similarity_type = {k:v for k, v in zip(features, similarity_type)}
+        similarity_type = {k: v for k, v in zip(features, similarity_type)}

     if weights is None:
-        weights = [1.0/num_feat,] * num_feat
+        weights = [1.0/num_feat] * num_feat

     if not isinstance(weights, dict):
-        weights = {k:v for k, v in zip(features, weights)}
+        weights = {k: v for k, v in zip(features, weights)}

     similarity = csc_matrix((meta_data.shape[0],)*2)
     for feature in features:
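Note: the repeated isinstance blocks above all implement one normalization idiom: a single string fans out to every feature, a positional list is zipped against the feature names, and a dict passes through unchanged. The idiom in isolation, with placeholder feature names:

    features = ['author', 'publisher']
    similarity_type = 'jaccard'
    if isinstance(similarity_type, str):
        similarity_type = [similarity_type] * len(features)
    if not isinstance(similarity_type, dict):
        similarity_type = {k: v for k, v in zip(features, similarity_type)}
    print(similarity_type)  # {'author': 'jaccard', 'publisher': 'jaccard'}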
12 changes: 8 additions & 4 deletions polara/lib/sparse.py
@@ -5,13 +5,13 @@
     pass

 import numpy as np
-import scipy as sp
-from scipy import sparse
+from scipy.sparse import csr_matrix
 from numba import jit

 # matvec implementation is based on
 # http://stackoverflow.com/questions/18595981/improving-performance-of-multiplication-of-scipy-sparse-matrices

+
 @jit(nopython=True, nogil=True)
 def matvec2dense(m_ptr, m_ind, m_val, v_nnz, v_val, out):
     l = len(v_nnz)
@@ -59,11 +59,12 @@ def csc_matvec(mat_csc, vec, dense_output=True, dtype=None):
         indices = np.empty((n,), dtype=np.intp)
         indptr = np.array([0, n], dtype=np.intp)
         matvec2sparse(m_ptr, m_ind, m_val, v_nnz, v_val, sizes, indices, data)
-        res = sp.sparse.csr_matrix((data, indices, indptr), shape=(1, mat_csc.shape[0]), dtype=res_dtype)
+        res = csr_matrix((data, indices, indptr), shape=(1, mat_csc.shape[0]), dtype=res_dtype)
         res.sum_duplicates() # expensive operation
     return res

-jit(nopython=True)
+
+@jit(nopython=True)
 def _blockify(ind, ptr, major_dim):
     # convenient function to compute only diagonal
     # elements of the product of 2 matrices;
@@ -77,17 +78,20 @@ def _blockify(ind, ptr, major_dim):
         shift_ind = i * major_dim
         ind[j] += shift_ind

+
 def row_unblockify(mat, block_size):
     # only for CSR matrices
     factor = (mat.indices // block_size) * block_size
     mat.indices -= factor
     mat._shape = (mat.shape[0], block_size)

+
 def row_blockify(mat, block_size):
     # only for CSR matrices
     _blockify(mat.indices, mat.indptr, block_size)
     mat._shape = (mat.shape[0], block_size*mat.shape[0])

+
 def inverse_permutation(p):
     s = np.empty(p.size, p.dtype)
     s[p] = np.arange(p.size)
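Note: the _blockify change above fixes a silent bug rather than style. Without the '@', jit(nopython=True) is just an expression that builds a decorator object and throws it away, so _blockify was never compiled by numba. An illustration with hypothetical functions (not from this commit):

    from numba import jit

    jit(nopython=True)       # no-op: the decorator object is discarded
    def f_interpreted(x):
        return x + 1

    @jit(nopython=True)      # decorator actually applied: compiled on first call
    def f_compiled(x):
        return x + 1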
4 changes: 1 addition & 3 deletions polara/recommender/coldstart/data.py
@@ -1,6 +1,5 @@
 from collections import namedtuple, defaultdict
 import numpy as np
-import pandas as pd
 from polara.recommender.data import RecommenderData
 from polara.lib.similarity import build_indicator_matrix

@@ -42,7 +41,6 @@ def prepare(self):


     def _split_test_index(self):
-        userid = self.fields.userid
         itemid = self.fields.itemid

         item_idx = np.arange(len(self._unique_items))
@@ -99,7 +97,7 @@ def _reindex_cold_items(self):
         item_index = self.index.itemid

         new_item_index = (namedtuple('ItemIndex', 'training cold_start')
-                          ._make([item_index, cold_item_index]))
+                          ._make([item_index, cold_item_index]))
         self.index = self.index._replace(itemid=new_item_index)
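
Note: _reindex_cold_items relies on standard namedtuple plumbing: _make builds an instance from an iterable and _replace swaps one field while keeping the rest. A standalone sketch; the field values are placeholders, not real index objects:

    from collections import namedtuple

    ItemIndex = namedtuple('ItemIndex', 'training cold_start')
    new_item_index = ItemIndex._make(['training_index', 'cold_start_index'])

    Index = namedtuple('Index', 'userid itemid')
    index = Index(userid='user_index', itemid='old_item_index')
    index = index._replace(itemid=new_item_index)
    print(index.itemid.cold_start)  # 'cold_start_index'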


(The remaining changed files were not loaded in this view.)