diff --git a/.gitignore b/.gitignore
index 4af44bf..a67e6ec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 *.pyc
 *.so
 *.egg-info
+*.swp
diff --git a/asgd/__init__.py b/asgd/__init__.py
index eabbd9e..4501d95 100644
--- a/asgd/__init__.py
+++ b/asgd/__init__.py
@@ -1,3 +1,8 @@
-from naive_asgd import NaiveBinaryASGD, NaiveOVAASGD
+from naive_asgd import NaiveBinaryASGD
+from naive_asgd import NaiveOVAASGD
+from naive_asgd import NaiveRankASGD
+from naive_asgd import SparseUpdateRankASGD
 from experimental_asgd import ExperimentalBinaryASGD
+
+from linsvm import LinearSVM
 
diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py
index 3045520..15a0dd9 100644
--- a/asgd/auto_step_size.py
+++ b/asgd/auto_step_size.py
@@ -1,17 +1,19 @@
 import copy
+import logging
+import time
 import numpy as np
-from scipy import optimize
+import scipy.optimize
 
-DEFAULT_INITIAL_RANGE = 0.25, 0.5
-DEFAULT_MAX_EXAMPLES = 1000
-DEFAULT_TOLERANCE = 0.5
-DEFAULT_BRENT_OUTPUT = False
+logger = logging.getLogger(__name__)
+
+DEFAULT_MAX_EXAMPLES = 1000  # estimate stepsize from this many examples
+DEFAULT_TOLERANCE = 1.0  # in log-2 units of the learning rate
+DEFAULT_SGD_STEP_SIZE_FLOOR = 1e-7  # -- for huge feature vectors, reduce this.
 
 
 def find_sgd_step_size0(
-        model, X, y,
-        initial_range=DEFAULT_INITIAL_RANGE,
-        tolerance=DEFAULT_TOLERANCE, brent_output=DEFAULT_BRENT_OUTPUT):
+        model, partial_fit_args,
+        tolerance=DEFAULT_TOLERANCE):
     """Use a Brent line search to find the best step size
 
     Parameters
@@ -19,98 +21,121 @@ def find_sgd_step_size0(
     model: BinaryASGD
         Instance of a BinaryASGD
 
-    X: array, shape = [n_samples, n_features]
-        Array of features
-
-    y: array, shape = [n_samples]
-        Array of labels in (-1, 1)
-
-    initial_range: tuple of float
-        Initial range for the sgd_step_size0 search (low, high)
+    partial_fit_args - tuple of arguments for model.partial_fit.
+        This tuple must start with X, y, ...
 
-    max_iterations:
-        Maximum number of interations
+    tolerance: in logarithmic step size units
 
     Returns
     -------
-
     best_sgd_step_size0: float
        Optimal sgd_step_size0 given `X` and `y`.
     """
-    # -- stupid scipy calls some sizes twice!?
+    # -- stupid solver calls some sizes twice!?
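For reference, the call convention above replaces the old (model, X, y, initial_range) signature with a tuple of partial_fit arguments, and the return value is now the step size itself rather than its log2. A minimal usage sketch; the synthetic data and parameter values are illustrative and mirror the updated tests further down in this patch:

import numpy as np
from asgd import NaiveBinaryASGD
from asgd.auto_step_size import find_sgd_step_size0

# illustrative synthetic problem: 200 points, 20 features, labels in {-1, +1}
rstate = np.random.RandomState(0)
X = rstate.randn(200, 20).astype(np.float32)
y = 2 * (rstate.randn(200) > 0) - 1
X[y == 1] += 0.1

model = NaiveBinaryASGD(20,
                        sgd_step_size0=1e3,        # deliberately far too large
                        l2_regularization=1e-3,
                        rstate=np.random.RandomState(1))

# the search probes deep copies of `model`, so `model` itself is not modified;
# the result is a usable step size, not its log2
best = find_sgd_step_size0(model, (X, y))
print(best)
print(model.sgd_step_size0)                        # still 1e3
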
_cache = {} def eval_size0(log2_size0): try: - return _cache[log2_size0] + return _cache[float(log2_size0)] except KeyError: pass other = copy.deepcopy(model) current_step_size = 2 ** log2_size0 other.sgd_step_size0 = current_step_size other.sgd_step_size = current_step_size - other.partial_fit(X, y) + other.partial_fit(*partial_fit_args) # Hack: asgd is lower variance than sgd, but it's tuned to work # well asymptotically, not after just a few examples - weights = .5 * (other.asgd_weights + other.sgd_weights) - bias = .5 * (other.asgd_bias + other.sgd_bias) - - margin = y * (np.dot(X, weights) + bias) - l2_cost = other.l2_regularization * (weights ** 2).sum() - rval = np.maximum(0, 1 - margin).mean() + l2_cost - _cache[log2_size0] = rval + other.asgd_weights = .5 * (other.asgd_weights + other.sgd_weights) + other.asgd_bias = .5 * (other.asgd_bias + other.sgd_bias) + + X, y = partial_fit_args[:2] + rval = other.cost(X, y) + if np.isnan(rval): + rval = float('inf') + logger.info('find step %e: %e' % (current_step_size, rval)) + _cache[float(log2_size0)] = rval return rval - best_sgd_step_size0 = optimize.brent( - eval_size0, brack=np.log2(initial_range), tol=tolerance) - - return best_sgd_step_size0 - - + if tolerance < 0.5: + raise NotImplementedError( + 'tolerance too small, need adaptive stepsize') + + step = -tolerance + x0 = np.log2(model.sgd_step_size0) + x1 = np.log2(model.sgd_step_size0) + step + y0 = eval_size0(x0) + y1 = eval_size0(x1) + if y1 > y0: + step *= -1 + y0, y1 = y1, y0 + + while y1 < y0: + x0, y0 = x1, y1 + x1 = x0 + step + y1 = eval_size0(x1) + + # I tried using sp.optimize.fmin, but this function is bumpy and we only + # want a really coarse estimate of the optimmum, so fmin and fmin_powell + # end up being relatively inefficient even compared to this really stupid + # search. + # + # TODO: increase the stepsize every time it still goes down, and then + # backtrack when we over-step + + rval = 2.0 ** x0 + return rval + + +# XXX: use different name, function is not specific to binary classification def binary_fit( - model, X, y, + model, fit_args, max_examples=DEFAULT_MAX_EXAMPLES, + step_size_floor=DEFAULT_SGD_STEP_SIZE_FLOOR, **find_sgd_step_size0_kwargs): """Returns a model with automatically-selected sgd_step_size0 Parameters ---------- - model: BinaryASGD + model: BaseASGD instance Instance of the model to be fitted. - X: array, shape = [n_samples, n_features] - Array of features - - y: array, shape = [n_samples] - Array of labels in (-1, 1) + fit_args - tuple of args to model.fit + This method assumes they are all length-of-dataset ndarrays. max_examples: int Maximum number of examples to use from `X` and `y` to find an - estimate of the best sgd_step_size0 + estimate of the best sgd_step_size0. N.B. That the entirety of X and y + is used for the final fit() call after the best step size has been found. Returns ------- - model: BinaryASGD - Instances of the model, fitted with an estimate of the best - sgd_step_size0 + model: model, fitted with an estimate of the best sgd_step_size0 """ - assert X.ndim == 2 - assert len(X) == len(y) assert max_examples > 0 + logger.info('binary_fit: design matrix shape %s' % str(fit_args[0].shape)) # randomly choose up to max_examples uniformly without replacement from # across the whole set of training data. 
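As a usage aside for the binary_fit signature documented just above: it mutates and returns the model it is given, estimating sgd_step_size0 on a subsample before running the full fit. A short sketch under assumed synthetic data (the array sizes and seeds are illustrative; the (X, y) tuple convention matches the updated tests):

import numpy as np
from asgd import NaiveBinaryASGD
from asgd.auto_step_size import binary_fit

rstate = np.random.RandomState(42)
X = rstate.randn(1500, 20).astype(np.float32)      # illustrative data
y = 2 * (rstate.randn(1500) > 0) - 1
X[y == 1] += 0.1

clf = NaiveBinaryASGD(20, l2_regularization=1e-3,
                      rstate=np.random.RandomState(7))

# the step size is estimated on at most max_examples points, halved, floored
# at step_size_floor, and then fit() runs on the full data set
clf = binary_fit(clf, (X, y))
print(clf.sgd_step_size0)
print((clf.predict(X) == y).mean())
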
- idxs = model.rstate.permutation(len(X))[:max_examples] + all_idxs = model.rstate.permutation(len(fit_args[0])) + idxs = all_idxs[:max_examples] # Find the best learning rate for that subset + t0 = time.time() best = find_sgd_step_size0( - model, X[idxs], y[idxs], **find_sgd_step_size0_kwargs) + model, [a[idxs] for a in fit_args], **find_sgd_step_size0_kwargs) + logger.info('found best stepsize %e in %f seconds' % ( + best, time.time() - t0)) # Heuristic: take the best stepsize according to the first max_examples, # and go half that fast for the full run. - best_estimate = 2. ** (best - 1.0) - model.sgd_step_size0 = best_estimate - model.sgd_step_size = best_estimate - model.fit(X, y) + step_size0 = max(best / 2.0, step_size_floor) + + logger.info('setting sgd_step_size: %e' % step_size0) + model.sgd_step_size0 = float(step_size0) + model.sgd_step_size = float(step_size0) + t0 = time.time() + model.fit(*fit_args) + logger.info('full fit took %f seconds' % (time.time() - t0)) return model diff --git a/asgd/linsvm.py b/asgd/linsvm.py new file mode 100644 index 0000000..f06c15a --- /dev/null +++ b/asgd/linsvm.py @@ -0,0 +1,153 @@ +""" +Automatic heuristic solver selection: LinearSVM + +""" + +import numpy as np + +from .auto_step_size import binary_fit +from .naive_asgd import NaiveBinaryASGD +from .naive_asgd import NaiveRankASGD +from .naive_asgd import SparseUpdateRankASGD + +try: + import sklearn.svm +except ImportError: + pass + + +class LinearSVM(object): + """ + SVM-fitting object that implements a heuristic for choosing + the right solver among several that may be installed in sklearn, and asgd. + + """ + + def __init__(self, l2_regularization, solver='auto', label_dct=None, + label_weights=None): + self.l2_regularization = l2_regularization + self.solver = solver + self.label_dct = label_dct + self.label_weights = label_weights + + def fit(self, X, y, weights=None, bias=None): + solver = self.solver + label_dct = self.label_dct + l2_regularization = self.l2_regularization + + if weights or bias: + raise NotImplementedError( + 'Currently only train_set = (X, y) is supported') + del weights, bias + if self.label_weights: + raise NotImplementedError() + + n_train, n_feats = X.shape + if self.label_dct is None: + label_dct = dict([(v, i) for (i, v) in enumerate(sorted(set(y)))]) + else: + label_dct = self.label_dct + n_classes = len(label_dct) + + if n_classes < 2: + raise ValueError('must be at least 2 labels') + + elif n_classes == 2: + if set(y) != set([-1, 1]): + # TODO: use the label_dct to automatically adjust + raise NotImplementedError() + + if solver == 'auto': + if n_feats > n_train: + solver = ('sklearn.svm.SVC', {'kernel': 'precomputed'}) + else: + solver = ('asgd.NaiveBinaryASGD', {}) + + method, method_kwargs = solver + + if method == 'asgd.NaiveBinaryASGD': + method_kwargs = dict(method_kwargs) + method_kwargs.setdefault('rstate', np.random.RandomState(123)) + svm = NaiveBinaryASGD( + l2_regularization=l2_regularization, + **method_kwargs) + svm = binary_fit(svm, (X, y)) + + elif method == 'sklearn.svm.SVC': + C = 1.0 / (l2_regularization * len(X)) + svm = sklearn.svm.SVC(C=C, scale_C=False, **method_kwargs) + raise NotImplementedError( + 'save ktrn, multiply Xtst by X in predict()') + ktrn = linear_kernel(X, X) + svm.fit(ktrn, y) + + else: + raise ValueError('unrecognized method', method) + + else: # n_classes > 2 + if set(y) != set(range(len(set(y)))): + # TODO: use the label_dct to automatically adjust + raise NotImplementedError('labels need adapting', + set(y)) + if 
solver == 'auto': + solver = ('asgd.SparseUpdateRankASGD', { + 'sgd_step_size0': 10.0 / X.shape[1], + }) + + method, method_kwargs = solver + + if method == 'asgd.NaiveRankASGD': + method_kwargs = dict(method_kwargs) + method_kwargs.setdefault('rstate', np.random.RandomState(123)) + svm = NaiveRankASGD(n_classes, n_feats, + l2_regularization=l2_regularization, + **method_kwargs) + svm = binary_fit(svm, (X, y)) + + elif method == 'asgd.SparseUpdateRankASGD': + method_kwargs = dict(method_kwargs) + method_kwargs.setdefault('rstate', np.random.RandomState(123)) + svm = SparseUpdateRankASGD(n_classes, n_feats, + l2_regularization=l2_regularization, + **method_kwargs) + svm = binary_fit(svm, (X, y)) + + elif method == 'asgd.NaiveOVAASGD': + # -- one vs. all + method_kwargs = dict(method_kwargs) + method_kwargs.setdefault('rstate', np.random.RandomState(123)) + svm = NaiveOVAASGD(n_classes, n_feats, + l2_regularization=l2_regularization, + **method_kwargs) + svm = binary_fit(svm, (X, y)) + + elif method == 'sklearn.svm.SVC': + # -- one vs. one + raise NotImplementedError(method) + + elif method == 'sklearn.svm.NuSVC': + # -- one vs. one + raise NotImplementedError(method) + + elif method == 'sklearn.svm.LinearSVC': + # -- Crammer & Singer multi-class + C = 1.0 / (l2_regularization * len(X)) + svm = sklearn.svm.LinearSVC( + C=C, + scale_C=False, + multi_class=True, + **method_kwargs) + svm.fit(X, y) + + else: + raise ValueError('unrecognized method', method) + + self.svm = svm + + def predict(self, *args, **kwargs): + return self.svm.predict(*args, **kwargs) + + def decision_function(self, *args, **kwargs): + return self.svm.decision_function(*args, **kwargs) + + diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index 0396344..6fd0156 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -3,16 +3,21 @@ naive, non-optimized implementation """ +import sys import numpy as np from numpy import dot from itertools import izip DEFAULT_SGD_STEP_SIZE0 = 1e-2 DEFAULT_L2_REGULARIZATION = 1e-3 -DEFAULT_N_ITERATIONS = 10 +DEFAULT_MIN_OBSERVATIONS = 1000 +DEFAULT_MAX_OBSERVATIONS = sys.maxint DEFAULT_FEEDBACK = False DEFAULT_RSTATE = None DEFAULT_DTYPE = np.float32 +DEFAULT_N_PARTIAL = 5000 +DEFAULT_FIT_ABS_TOLERANCE = 1e-4 +DEFAULT_FIT_REL_TOLERANCE = 1e-2 class BaseASGD(object): @@ -20,15 +25,28 @@ class BaseASGD(object): def __init__(self, n_features, sgd_step_size0=DEFAULT_SGD_STEP_SIZE0, l2_regularization=DEFAULT_L2_REGULARIZATION, - n_iterations=DEFAULT_N_ITERATIONS, feedback=DEFAULT_FEEDBACK, - rstate=DEFAULT_RSTATE, dtype=DEFAULT_DTYPE): + min_observations=DEFAULT_MIN_OBSERVATIONS, + max_observations=DEFAULT_MAX_OBSERVATIONS, + fit_n_partial=DEFAULT_N_PARTIAL, + fit_abs_tolerance=DEFAULT_FIT_ABS_TOLERANCE, + fit_rel_tolerance=DEFAULT_FIT_REL_TOLERANCE, + feedback=DEFAULT_FEEDBACK, + rstate=DEFAULT_RSTATE, + dtype=DEFAULT_DTYPE): # -- assert n_features > 1 self.n_features = n_features - assert n_iterations > 0 - self.n_iterations = n_iterations + if not 0 <= min_observations <= max_observations: + raise ValueError('min_observations > max_observations', + (min_observations, max_observations)) + self.min_observations = min_observations + self.max_observations = max_observations + + self.fit_n_partial = fit_n_partial + self.fit_abs_tolerance = fit_abs_tolerance + self.fit_rel_tolerance = fit_rel_tolerance if feedback: raise NotImplementedError("FIXME: feedback support is buggy") @@ -51,35 +69,113 @@ def __init__(self, n_features, self.asgd_step_size0 = 1 self.asgd_step_size = self.asgd_step_size0 
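Stepping back to the new LinearSVM wrapper defined above: with solver='auto' the multiclass path dispatches to SparseUpdateRankASGD and tunes its step size via binary_fit. A minimal sketch; the three-class synthetic data and the per-class coordinate shift are illustrative, and multiclass labels must be exactly 0..n_classes-1:

import numpy as np
from asgd import LinearSVM

rstate = np.random.RandomState(0)
n_points, n_features, n_classes = 600, 20, 3
X = rstate.randn(n_points, n_features).astype(np.float32)
y = rstate.randint(n_classes, size=n_points)
for k in range(n_classes):
    X[y == k, k] += 1.0                    # shift one coordinate per class

svm = LinearSVM(l2_regularization=1e-3)    # solver='auto' by default
svm.fit(X, y)                              # note: fit() does not return self
print((svm.predict(X) == y).mean())
print(svm.decision_function(X).shape)      # (600, 3)
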
+ # -- self.n_observations = 0 + self.train_means = [] + self.recent_train_costs = [] + def fit_converged(self): + """ + There are two convergence tests here. Training is considered to be + over if it has -class NaiveBinaryASGD(BaseASGD): + * stalled: latest train_means is allclose to a previous one (*) - def __init__(self, n_features, sgd_step_size0=DEFAULT_SGD_STEP_SIZE0, - l2_regularization=DEFAULT_L2_REGULARIZATION, - n_iterations=DEFAULT_N_ITERATIONS, feedback=DEFAULT_FEEDBACK, - rstate=DEFAULT_RSTATE, dtype=DEFAULT_DTYPE): - - super(NaiveBinaryASGD, self).__init__( - n_features, - sgd_step_size0=sgd_step_size0, - l2_regularization=l2_regularization, - n_iterations=n_iterations, - feedback=feedback, - rstate=rstate, - dtype=dtype, - ) + * terminated: latest train_means is allclose to 0 - # -- - self.sgd_weights = np.zeros((n_features), dtype=dtype) - self.sgd_bias = np.zeros((1), dtype=dtype) + """ + train_means = self.train_means + rtol = self.fit_rel_tolerance + atol = self.fit_abs_tolerance + + # -- check for perfect fit + if len(train_means) > 1: + assert np.min(train_means) >= 0 + if np.allclose(train_means[-1], 0, atol=atol, rtol=rtol): + return True + + # -- check for stall condition + if len(train_means) > 2: + old_pt = max( + len(train_means) // 2, + len(train_means) - 4) + thresh = (1 - rtol) * train_means[old_pt] - atol + if train_means[-1] > thresh: + return True + + return False + + def fit(self, X, y): + + assert X.ndim == 2 + assert y.ndim == 1 + + n_points, n_features = X.shape + assert n_features == self.n_features + assert n_points == y.size + + n_points_remaining = self.max_observations - self.n_observations + + while n_points_remaining > 0: - self.asgd_weights = np.zeros((n_features), dtype=dtype) - self.asgd_bias = np.zeros((1), dtype=dtype) + # -- every iteration will train from n_partial observations and + # then check for convergence + fit_n_partial = min(n_points_remaining, self.fit_n_partial) + + idx = self.rstate.permutation(n_points) + Xb = X[idx[:fit_n_partial]] + yb = y[idx[:fit_n_partial]] + self.partial_fit(Xb, yb) + + if self.feedback: + raise NotImplementedError( + 'partial_fit logic requires memory to be distinct') + self.sgd_weights = self.asgd_weights + self.sgd_bias = self.asgd_bias + + if (self.n_observations >= self.min_observations + and self.fit_converged()): + break + + n_points_remaining -= len(Xb) + + return self + + +class BaseBinaryASGD(BaseASGD): + + def __init__(self, *args, **kwargs): + BaseASGD.__init__(self, *args, **kwargs) + + self.sgd_weights = np.zeros((self.n_features,), dtype=self.dtype) + self.sgd_bias = np.asarray(0, dtype=self.dtype) + + self.asgd_weights = np.zeros((self.n_features,), dtype=self.dtype) + self.asgd_bias = np.asarray(0, dtype=self.dtype) + + def decision_function(self, X): + X = np.asarray(X) + return dot(self.asgd_weights, X.T) + self.asgd_bias + + def predict(self, X): + return np.sign(self.decision_function(X)) + + def cost(self, X, y): + weights = self.asgd_weights + bias = self.asgd_bias + margin = y * (np.dot(X, weights) + bias) + l2_cost = self.l2_regularization * (weights ** 2).sum() + hinge_loss = np.maximum(0, 1 - margin) + rval = hinge_loss.mean() + l2_cost + return rval + + +class NaiveBinaryASGD(BaseBinaryASGD): def partial_fit(self, X, y): + assert np.all(y**2 == 1) + sgd_step_size0 = self.sgd_step_size0 sgd_step_size = self.sgd_step_size sgd_step_size_scheduling_exponent = \ @@ -96,6 +192,8 @@ def partial_fit(self, X, y): l2_regularization = self.l2_regularization n_observations = 
self.n_observations + train_means = self.train_means + recent_train_costs = self.recent_train_costs for obs, label in izip(X, y): @@ -110,6 +208,9 @@ def partial_fit(self, X, y): sgd_weights += sgd_step_size * label * obs sgd_bias += sgd_step_size * label + recent_train_costs.append(1 - float(margin)) + else: + recent_train_costs.append(0) # -- update asgd asgd_weights = (1 - asgd_step_size) * asgd_weights \ @@ -126,6 +227,12 @@ def partial_fit(self, X, y): sgd_step_size_scheduling_exponent) asgd_step_size = 1. / n_observations + if len(recent_train_costs) == self.fit_n_partial: + train_means.append(np.mean(recent_train_costs) + + l2_regularization * np.dot( + self.asgd_weights, self.asgd_weights)) + self.recent_train_costs = recent_train_costs = [] + # -- self.sgd_weights = sgd_weights self.sgd_bias = sgd_bias @@ -139,69 +246,126 @@ def partial_fit(self, X, y): return self - def fit(self, X, y): - assert X.ndim == 2 - assert y.ndim == 1 +class BaseMultiASGD(BaseASGD): + """ + Allocate weight vectors for one-vs-all multiclass SVM + """ + def __init__(self, n_classes, n_features, *args, **kwargs): + BaseASGD.__init__(self, n_features, *args, **kwargs) - n_points, n_features = X.shape - assert n_features == self.n_features - assert n_points == y.size + # -- + self.n_classes = n_classes - n_iterations = self.n_iterations + dtype = self.dtype + # -- + self.sgd_weights = np.zeros((n_features, n_classes), dtype=dtype) + self.sgd_bias = np.zeros(n_classes, dtype=dtype) + self.asgd_weights = np.zeros((n_features, n_classes), dtype=dtype) + self.asgd_bias = np.zeros(n_classes, dtype=dtype) - for i in xrange(n_iterations): + def decision_function(self, X): + X = np.asarray(X) + return dot(X, self.asgd_weights) + self.asgd_bias - idx = self.rstate.permutation(n_points) - Xb = X[idx] - yb = y[idx] - self.partial_fit(Xb, yb) + def predict(self, X): + return self.decision_function(X).argmax(axis=1) - if self.feedback: - self.sgd_weights = self.asgd_weights - self.sgd_bias = self.asgd_bias - return self +class NaiveOVAASGD(BaseMultiASGD): - def decision_function(self, X): - return dot(self.asgd_weights, X.T) + self.asgd_bias + def partial_fit(self, X, y): - def predict(self, X): - return np.sign(self.decision_function(X)) + if set(y) > set(range(self.n_classes)): + raise ValueError("Invalid 'y'") + + sgd_step_size0 = self.sgd_step_size0 + sgd_step_size = self.sgd_step_size + sgd_step_size_scheduling_exponent = \ + self.sgd_step_size_scheduling_exponent + sgd_step_size_scheduling_multiplier = \ + self.sgd_step_size_scheduling_multiplier + sgd_weights = self.sgd_weights + sgd_bias = self.sgd_bias + asgd_weights = self.asgd_weights + asgd_bias = self.asgd_bias + asgd_step_size = self.asgd_step_size -class NaiveOVAASGD(BaseASGD): + l2_regularization = self.l2_regularization - def __init__(self, n_classes, n_features, - sgd_step_size0=DEFAULT_SGD_STEP_SIZE0, - l2_regularization=DEFAULT_L2_REGULARIZATION, - n_iterations=DEFAULT_N_ITERATIONS, - feedback=DEFAULT_FEEDBACK, - rstate=DEFAULT_RSTATE, - dtype=DEFAULT_DTYPE): + n_observations = self.n_observations + n_classes = self.n_classes - super(NaiveOVAASGD, self).__init__( - n_features, - sgd_step_size0=sgd_step_size0, - l2_regularization=l2_regularization, - n_iterations=n_iterations, - feedback=feedback, - rstate=rstate, - dtype=dtype, - ) + train_means = self.train_means + recent_train_costs = self.recent_train_costs - # -- - assert n_classes > 1 - self.n_classes = n_classes + for obs, label in izip(X, y): + label = 2 * (np.arange(n_classes) == 
label).astype(int) - 1 + + # -- compute margin + margin = label * (dot(obs, sgd_weights) + sgd_bias) + + # -- update sgd + if l2_regularization: + sgd_weights *= (1 - l2_regularization * sgd_step_size) + + violations = margin < 1 + + if np.any(violations): + label_violated = label[violations] + sgd_weights[:, violations] += ( + sgd_step_size + * label_violated[np.newaxis, :] + * obs[:, np.newaxis] + ) + sgd_bias[violations] += sgd_step_size * label_violated + recent_train_costs.append( + n_classes - margin[violations].sum()) + else: + recent_train_costs.append(0.0) + + # -- update asgd + asgd_weights = (1 - asgd_step_size) * asgd_weights \ + + asgd_step_size * sgd_weights + asgd_bias = (1 - asgd_step_size) * asgd_bias \ + + asgd_step_size * sgd_bias + + # -- update step_sizes + n_observations += 1 + sgd_step_size_scheduling = (1 + sgd_step_size0 * n_observations * + sgd_step_size_scheduling_multiplier) + sgd_step_size = sgd_step_size0 / \ + (sgd_step_size_scheduling ** \ + sgd_step_size_scheduling_exponent) + asgd_step_size = 1. / n_observations + + if len(recent_train_costs) == self.fit_n_partial: + train_means.append(np.mean(recent_train_costs) + + l2_regularization * np.dot( + self.asgd_weights, self.asgd_weights)) + self.recent_train_costs = recent_train_costs = [] # -- - self.sgd_weights = np.zeros((n_features, n_classes), dtype=dtype) - self.sgd_bias = np.zeros((n_classes,), dtype=dtype) - self.asgd_weights = np.zeros((n_features, n_classes), dtype=dtype) - self.asgd_bias = np.zeros((n_classes), dtype=dtype) + self.sgd_weights = sgd_weights + self.sgd_bias = sgd_bias + self.sgd_step_size = sgd_step_size - def partial_fit(self, X, y): + self.asgd_weights = asgd_weights + self.asgd_bias = asgd_bias + self.asgd_step_size = asgd_step_size + self.n_observations = n_observations + + return self + + +class NaiveRankASGD(BaseMultiASGD): + """ + Implements rank-based multiclass SVM. + """ + + def partial_fit(self, X, y): if set(y) > set(range(self.n_classes)): raise ValueError("Invalid 'y'") @@ -223,24 +387,31 @@ def partial_fit(self, X, y): n_observations = self.n_observations n_classes = self.n_classes + train_means = self.train_means + recent_train_costs = self.recent_train_costs + for obs, label in izip(X, y): - label = 2 * (np.arange(n_classes) == label).astype(int) - 1 # -- compute margin - margin = label * (dot(obs, sgd_weights) + sgd_bias) + decision = dot(obs, sgd_weights) + sgd_bias + + dsrt = np.argsort(decision) + distractor = dsrt[-2] if dsrt[-1] == label else dsrt[-1] + margin = decision[label] - decision[distractor] # -- update sgd if l2_regularization: - sgd_weights *= (1 - l2_regularization * sgd_step_size) + sgd_weights *= 1 - l2_regularization * sgd_step_size - violations = margin < 1 - label_violated = label[violations] - sgd_weights[:, violations] += ( - sgd_step_size - * label_violated[np.newaxis, :] - * obs[:, np.newaxis] - ) - sgd_bias[violations] += sgd_step_size * label_violated + if margin < 1: + winc = sgd_step_size * obs + sgd_weights[:, distractor] -= winc + sgd_weights[:, label] += winc + sgd_bias[distractor] -= sgd_step_size + sgd_bias[label] += sgd_step_size + recent_train_costs.append(1 - float(margin)) + else: + recent_train_costs.append(0.0) # -- update asgd asgd_weights = (1 - asgd_step_size) * asgd_weights \ @@ -257,6 +428,13 @@ def partial_fit(self, X, y): sgd_step_size_scheduling_exponent) asgd_step_size = 1. 
/ n_observations + if len(recent_train_costs) == self.fit_n_partial: + flat_weights = asgd_weights.flatten() + l2_term = np.dot(flat_weights, flat_weights) + train_means.append(np.mean(recent_train_costs) + + l2_regularization * l2_term) + self.recent_train_costs = recent_train_costs = [] + # -- self.sgd_weights = sgd_weights self.sgd_bias = sgd_bias @@ -270,32 +448,187 @@ def partial_fit(self, X, y): return self - def fit(self, X, y): + def cost(self, X, y): + weights = self.asgd_weights + bias = self.asgd_bias + decisions = dot(np.asarray(X), weights) + bias + margins = [] + for decision, label in zip(decisions, y): + dsrt = np.argsort(decision) + distractor = dsrt[-2] if dsrt[-1] == label else dsrt[-1] + margin = decision[label] - decision[distractor] + margins.append(margin) - assert X.ndim == 2 - assert y.ndim == 1 + flat_weights = weights.flatten() + l2_term = np.dot(flat_weights, flat_weights) + l2_cost = self.l2_regularization * l2_term + rval = np.maximum(0, 1 - np.asarray(margins)).mean() + l2_cost + return rval - n_points, n_features = X.shape - assert n_features == self.n_features - assert n_points == y.size - n_iterations = self.n_iterations +#import line_profiler +#profile = line_profiler.LineProfiler() +#import time - for i in xrange(n_iterations): - idx = self.rstate.permutation(n_points) - Xb = X[idx] - yb = y[idx] - self.partial_fit(Xb, yb) +class SparseUpdateRankASGD(BaseMultiASGD): + """ + Implements rank-based multiclass SVM. + """ - if self.feedback: - self.sgd_weights = self.asgd_weights - self.sgd_bias = self.asgd_bias + #XXX + #XXX + # TODO: implement l2-SVM here by updating + # sgd rows in proportion to hinge loss + + #@profile + def partial_fit(self, X, y): + if set(y) > set(range(self.n_classes)): + raise ValueError("Invalid 'y'") + #ttt = time.time() + sgd_step_size0 = self.sgd_step_size0 + sgd_step_size = self.sgd_step_size + sgd_step_size_scheduling_exponent = \ + self.sgd_step_size_scheduling_exponent + sgd_step_size_scheduling_multiplier = \ + self.sgd_step_size_scheduling_multiplier + sgd_weights = np.asarray(self.sgd_weights, order='F') + sgd_bias = self.sgd_bias + + asgd_weights = np.asarray(self.asgd_weights, order='F') + asgd_bias = self.asgd_bias + asgd_step_size = self.asgd_step_size + + l2_regularization = self.l2_regularization + + n_observations = self.n_observations + n_classes = self.n_classes + + train_means = self.train_means + recent_train_costs = self.recent_train_costs + + # -- the logical sgd_weight matrix is sgd_weights_scale * sgd_weights + sgd_weights_scale = 1.0 + + use_asgd_scale = True + + if use_asgd_scale: + # -- the logical asgd_weights is stored as a row-wise linear + # -- combination of asgd_weights and sgd_weights + asgd_scale = np.ones((n_classes, 2), dtype=asgd_weights.dtype) + asgd_scale[:, 1] = 0 # -- originally there is no sgd contribution + + for obs, label in izip(X, y): + + # -- compute margin + decision = sgd_weights_scale * dot(obs, sgd_weights) + sgd_bias + + dsrt = np.argsort(decision) + distractor = dsrt[-2] if dsrt[-1] == label else dsrt[-1] + margin = decision[label] - decision[distractor] + + # -- update sgd + sgd_weights_scale *= 1 - l2_regularization * sgd_step_size + + if margin < 1: + # -- perform the delayed updates to the rows of asgd + + if use_asgd_scale: + # -- XXX: this should be a single BLAS call to axpy + asgd_weights[:, distractor] *= asgd_scale[distractor, 0] + asgd_weights[:, distractor] += \ + (asgd_scale[distractor, 1] * sgd_weights_scale) \ + * sgd_weights[:, distractor] + 
asgd_scale[distractor] = [1, 0] + + # -- XXX: this should be a single BLAS call to axpy + asgd_weights[:, label] *= asgd_scale[label, 0] + asgd_weights[:, label] += \ + (asgd_scale[label, 1] * sgd_weights_scale)\ + * sgd_weights[:, label] + asgd_scale[label] = [1, 0] + + winc = (sgd_step_size / sgd_weights_scale) * obs + sgd_weights[:, distractor] -= winc + sgd_weights[:, label] += winc + sgd_bias[distractor] -= sgd_step_size + sgd_bias[label] += sgd_step_size + recent_train_costs.append(1 - float(margin)) + else: + recent_train_costs.append(0.0) + + if use_asgd_scale: + # -- update asgd via scale variables + asgd_scale *= (1 - asgd_step_size) + asgd_scale[:, 1] += asgd_step_size + else: + asgd_weights = (1 - asgd_step_size) * asgd_weights \ + + asgd_step_size * sgd_weights + + asgd_bias = (1 - asgd_step_size) * asgd_bias \ + + asgd_step_size * sgd_bias + + # -- update step_sizes + n_observations += 1 + sgd_step_size_scheduling = (1 + sgd_step_size0 * n_observations * + sgd_step_size_scheduling_multiplier) + sgd_step_size = sgd_step_size0 / \ + (sgd_step_size_scheduling ** \ + sgd_step_size_scheduling_exponent) + if n_observations <= self.fit_n_partial: + asgd_step_size = 1. + else: + asgd_step_size = 1. / (n_observations - self.fit_n_partial) + + if len(recent_train_costs) == self.fit_n_partial: + if use_asgd_scale: + asgd_weights[:] *= asgd_scale[:, 0] + asgd_weights[:] += (sgd_weights_scale * asgd_scale[:, 1]) * sgd_weights + asgd_scale[:, 0] = 1 + asgd_scale[:, 1] = 0 + + flat_weights = asgd_weights.flatten() + l2_term = np.dot(flat_weights, flat_weights) + train_means.append(np.mean(recent_train_costs) + + l2_regularization * l2_term) + self.recent_train_costs = recent_train_costs = [] + + # -- + self.sgd_weights = sgd_weights * sgd_weights_scale + self.sgd_bias = sgd_bias + self.sgd_step_size = sgd_step_size + + if use_asgd_scale: + asgd_weights[:] *= asgd_scale[:, 0] + asgd_weights[:] += (sgd_weights_scale * asgd_scale[:, 1]) * sgd_weights + asgd_scale[:, 0] = 1 + asgd_scale[:, 1] = 0 + + self.asgd_weights = asgd_weights + self.asgd_bias = asgd_bias + self.asgd_step_size = asgd_step_size + + self.n_observations = n_observations + + #profile.print_stats() + #print 'n_obs', n_observations, 'time/%i' % len(X), (time.time() - ttt) return self - def decision_function(self, X): - return dot(X, self.asgd_weights) + self.asgd_bias + def cost(self, X, y): + weights = self.asgd_weights + bias = self.asgd_bias + decisions = dot(np.asarray(X), weights) + bias + margins = [] + for decision, label in zip(decisions, y): + dsrt = np.argsort(decision) + distractor = dsrt[-2] if dsrt[-1] == label else dsrt[-1] + margin = decision[label] - decision[distractor] + margins.append(margin) + + flat_weights = weights.flatten() + l2_term = np.dot(flat_weights, flat_weights) + l2_cost = self.l2_regularization * l2_term + rval = np.maximum(0, 1 - np.asarray(margins)).mean() + l2_cost + return rval - def predict(self, X): - return self.decision_function(X).argmax(1) diff --git a/asgd/tests/test_auto_step_size.py b/asgd/tests/test_auto_step_size.py index 89799e1..1f3c672 100644 --- a/asgd/tests/test_auto_step_size.py +++ b/asgd/tests/test_auto_step_size.py @@ -1,4 +1,5 @@ from nose.tools import assert_equal +import numpy as np from numpy.testing import assert_almost_equal from numpy.testing import assert_array_equal @@ -11,11 +12,13 @@ from test_naive_asgd import get_fake_data -def get_new_model(n_features, rstate): +def get_new_model(n_features, rstate, n_points): return BinaryASGD(n_features, rstate=rstate, - 
sgd_step_size0=1e3, + sgd_step_size0=1e3, # intentionally large, binary_fit will fix l2_regularization=1e-3, - n_iterations=5) + max_observations=5 * n_points, + min_observations=5 * n_points, + ) def test_binary_sgd_step_size0(): @@ -24,13 +27,10 @@ def test_binary_sgd_step_size0(): X, y = get_fake_data(100, n_features, rstate) - clf = get_new_model(n_features, rstate) - best = find_sgd_step_size0(clf, X, y, (.25, .5)) - assert_almost_equal(best, -4.9927, decimal=4) - - # start a little lower, still works - best = find_sgd_step_size0(clf, X, y, (.125, .25)) - assert_almost_equal(best, -4.6180, decimal=4) + clf = get_new_model(n_features, rstate, 100) + best0 = find_sgd_step_size0(clf, (X, y)) + print best0 + assert np.allclose(best0, 0.04, atol=.1, rtol=.5) # find_sgd_step_size0 does not change clf assert clf.sgd_step_size0 == 1000.0 @@ -40,25 +40,15 @@ def test_binary_fit(): rstate = RandomState(42) n_features = 20 - clf100 = get_new_model(n_features, rstate) - X, y = get_fake_data(100, n_features, rstate) - _clf100 = binary_fit(clf100, X, y) - assert _clf100 is clf100 - assert_almost_equal(clf100.sgd_step_size0, 0.04812, decimal=4) - - # smoke test - clf1000 = get_new_model(n_features, rstate) - X, y = get_fake_data(DEFAULT_MAX_EXAMPLES, n_features, rstate) - _clf1000 = binary_fit(clf1000, X, y) - assert _clf1000 is clf1000 - assert_almost_equal(clf1000.sgd_step_size0, 0.0047, decimal=4) + for L in [100, DEFAULT_MAX_EXAMPLES, int(DEFAULT_MAX_EXAMPLES * 1.5), + int(DEFAULT_MAX_EXAMPLES * 3)]: - # smoke test that at least it runs - clf2000 = get_new_model(n_features, rstate) - X, y = get_fake_data(2000, n_features, rstate) - _clf2000 = binary_fit(clf2000, X, y) - assert _clf2000 == clf2000 - assert_almost_equal(clf2000.sgd_step_size0, 0.0067, decimal=4) + clf = get_new_model(n_features, rstate, L) + X, y = get_fake_data(L, n_features, rstate, separation=0.1) + best = find_sgd_step_size0(clf, (X, y)) + _clf = binary_fit(clf, (X, y)) + assert _clf is clf + assert 0 < clf.sgd_step_size0 <= best def test_fit_replicable(): @@ -67,11 +57,11 @@ def test_fit_replicable(): X, y = get_fake_data(100, n_features, RandomState(4)) - m0 = get_new_model(n_features, RandomState(45)) - m0 = binary_fit(m0, X, y) + m0 = get_new_model(n_features, RandomState(45), 100) + m0 = binary_fit(m0, (X, y)) - m1 = get_new_model(n_features, RandomState(45)) - m1 = binary_fit(m1, X, y) + m1 = get_new_model(n_features, RandomState(45), 100) + m1 = binary_fit(m1, (X, y)) assert_array_equal(m0.sgd_weights, m1.sgd_weights) assert_array_equal(m0.sgd_bias, m1.sgd_bias) diff --git a/asgd/tests/test_naive_asgd.py b/asgd/tests/test_naive_asgd.py index 0900f87..6826373 100644 --- a/asgd/tests/test_naive_asgd.py +++ b/asgd/tests/test_naive_asgd.py @@ -1,3 +1,4 @@ +import sys from nose.tools import assert_equal, raises from nose.plugins.skip import SkipTest from numpy.testing import assert_allclose @@ -16,14 +17,15 @@ DEFAULT_ARGS = (N_FEATURES,) DEFAULT_KWARGS = dict(sgd_step_size0=1e-3, l2_regularization=1e-6, - n_iterations=4, + min_observations=4 * N_POINTS, + max_observations=4 * N_POINTS, dtype=np.float32) -def get_fake_data(n_points, n_features, rstate): +def get_fake_data(n_points, n_features, rstate, separation=1e-1): X = rstate.randn(n_points, n_features).astype(np.float32) y = 2 * (rstate.randn(n_points) > 0) - 1 - X[y == 1] += 1e-1 + X[y == 1] += separation return X, y @@ -41,7 +43,6 @@ def get_fake_multiclass_data(n_points, n_features, n_classes, rstate): def test_naive_asgd(): - rstate = RandomState(42) X, y = 
get_fake_data(N_POINTS, N_FEATURES, rstate) @@ -50,6 +51,7 @@ def test_naive_asgd(): clf = BinaryASGD(*DEFAULT_ARGS, rstate=rstate, **DEFAULT_KWARGS) clf.fit(X, y) + assert clf.n_observations == clf.min_observations == clf.max_observations ytrn_preds = clf.predict(X) ytst_preds = clf.predict(Xtst) ytrn_acc = (ytrn_preds == y).mean() @@ -78,7 +80,6 @@ def test_naive_asgd_with_feedback(): def test_naive_asgd_multi_labels(): - rstate = RandomState(44) X, y = get_fake_binary_data_multi_labels(N_POINTS, N_FEATURES, rstate) @@ -87,7 +88,10 @@ def test_naive_asgd_multi_labels(): # n_classes is 2 since it is actually a binary case clf = OVAASGD(2, N_FEATURES, sgd_step_size0=1e-3, l2_regularization=1e-6, - n_iterations=4, dtype=np.float32, rstate=rstate) + min_observations=4 * N_POINTS, + max_observations=4 * N_POINTS, + dtype=np.float32, + rstate=rstate) clf.fit(X, y) ytrn_preds = clf.predict(X) ytst_preds = clf.predict(Xtst) @@ -98,7 +102,6 @@ def test_naive_asgd_multi_labels(): def test_naive_multiclass_ova_asgd(): - rstate = RandomState(45) n_classes = 10 @@ -108,7 +111,10 @@ def test_naive_multiclass_ova_asgd(): rstate) clf = OVAASGD(n_classes, N_FEATURES, sgd_step_size0=1e-3, - l2_regularization=1e-6, n_iterations=4, dtype=np.float32, + l2_regularization=1e-6, + min_observations=4 * N_POINTS, + max_observations=4 * N_POINTS, + dtype=np.float32, rstate=rstate) clf.fit(X, y) @@ -123,7 +129,6 @@ def test_naive_multiclass_ova_asgd(): def test_naive_multiclass_ova_vs_binary_asgd(): - rstate = RandomState(42) n_classes = 3 @@ -162,7 +167,6 @@ def test_naive_multiclass_ova_vs_binary_asgd(): @raises(ValueError) def test_naive_ova_asgd_wrong_labels(): - rstate = RandomState(42) n_classes = 10 @@ -174,3 +178,34 @@ def test_naive_ova_asgd_wrong_labels(): rstate=RandomState(999), **DEFAULT_KWARGS) ytrn_bad = rstate.randint(n_classes + 42, size=len(ytrn)) clf.partial_fit(Xtrn, ytrn_bad) + + +def test_early_stopping(): + rstate = RandomState(42) + + for N_POINTS in (1000, 2000): + + X, y = get_fake_data(N_POINTS, N_FEATURES, rstate) + Xtst, ytst = get_fake_data(N_POINTS, N_FEATURES, rstate) + + kwargs = dict(DEFAULT_KWARGS) + kwargs['fit_n_partial'] = 1500 + del kwargs['max_observations'] + print kwargs + print 'N_POINTS', N_POINTS + + clf = BinaryASGD(*DEFAULT_ARGS, rstate=rstate, **kwargs) + + clf.fit(X, y) + print clf.fit_n_partial + print len(clf.train_means) + print clf.n_observations + assert clf.fit_n_partial == kwargs['fit_n_partial'] + assert len(clf.train_means) == clf.n_observations / clf.fit_n_partial + assert clf.min_observations == kwargs['min_observations'] + assert clf.max_observations == sys.maxint + assert clf.n_observations >= kwargs['min_observations'] + assert clf.fit_converged() + ytrn_preds = clf.predict(X) + ytrn_acc = (ytrn_preds == y).mean() + assert ytrn_acc > 0.7
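
Beyond the tests, the same early-stopping machinery can be configured directly on a model; a small sketch following test_early_stopping above, with illustrative data and parameter choices:

import numpy as np
from asgd import NaiveBinaryASGD

rstate = np.random.RandomState(42)
X = rstate.randn(2000, 20).astype(np.float32)
y = 2 * (rstate.randn(2000) > 0) - 1
X[y == 1] += 0.1

clf = NaiveBinaryASGD(20,
                      sgd_step_size0=1e-3,
                      l2_regularization=1e-6,
                      fit_n_partial=1500,          # observations between convergence checks
                      min_observations=4 * 2000,   # never stop before this many
                      rstate=np.random.RandomState(1))

# max_observations defaults to sys.maxint, so fit() keeps drawing random
# batches of fit_n_partial examples until the running hinge + L2 estimate
# recorded in train_means either stalls or reaches ~0 (see fit_converged)
clf.fit(X, y)
print(clf.n_observations)
print(len(clf.train_means))
print(clf.fit_converged())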