Early stopping #21

Open · wants to merge 18 commits into master
Changes from all commits
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
 *.pyc
 *.so
 *.egg-info
+*.swp
7 changes: 6 additions & 1 deletion asgd/__init__.py
@@ -1,3 +1,8 @@
-from naive_asgd import NaiveBinaryASGD, NaiveOVAASGD
+from naive_asgd import NaiveBinaryASGD
+from naive_asgd import NaiveOVAASGD
+from naive_asgd import NaiveRankASGD
+from naive_asgd import SparseUpdateRankASGD
 
 from experimental_asgd import ExperimentalBinaryASGD
+
+from linsvm import LinearSVM
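
These new exports make the rank-based estimators and the LinearSVM wrapper importable from the package root. As a quick check, a minimal import smoke test (assuming the package is installed as `asgd`):

    from asgd import NaiveRankASGD, SparseUpdateRankASGD, LinearSVM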
133 changes: 79 additions & 54 deletions asgd/auto_step_size.py
@@ -1,116 +1,141 @@
 import copy
+import logging
+import time
 import numpy as np
-from scipy import optimize
+import scipy.optimize
 
-DEFAULT_INITIAL_RANGE = 0.25, 0.5
-DEFAULT_MAX_EXAMPLES = 1000
-DEFAULT_TOLERANCE = 0.5
-DEFAULT_BRENT_OUTPUT = False
+logger = logging.getLogger(__name__)
+
+DEFAULT_MAX_EXAMPLES = 1000  # estimate stepsize from this many examples
+DEFAULT_TOLERANCE = 1.0  # in log-2 units of the learning rate
+DEFAULT_SGD_STEP_SIZE_FLOOR = 1e-7  # -- for huge feature vectors, reduce this.
 
 
 def find_sgd_step_size0(
-        model, X, y,
-        initial_range=DEFAULT_INITIAL_RANGE,
-        tolerance=DEFAULT_TOLERANCE, brent_output=DEFAULT_BRENT_OUTPUT):
+        model, partial_fit_args,
+        tolerance=DEFAULT_TOLERANCE):
     """Use a Brent line search to find the best step size
 
     Parameters
     ----------
     model: BinaryASGD
         Instance of a BinaryASGD
 
-    X: array, shape = [n_samples, n_features]
-        Array of features
-
-    y: array, shape = [n_samples]
-        Array of labels in (-1, 1)
-
-    initial_range: tuple of float
-        Initial range for the sgd_step_size0 search (low, high)
+    partial_fit_args - tuple of arguments for model.partial_fit.
+        This tuple must start with X, y, ...
 
-    max_iterations:
-        Maximum number of interations
+    tolerance: in logarithmic step size units
 
     Returns
     -------
     best_sgd_step_size0: float
         Optimal sgd_step_size0 given `X` and `y`.
     """
-    # -- stupid scipy calls some sizes twice!?
+    # -- stupid solver calls some sizes twice!?
     _cache = {}
 
     def eval_size0(log2_size0):
         try:
-            return _cache[log2_size0]
+            return _cache[float(log2_size0)]
         except KeyError:
             pass
         other = copy.deepcopy(model)
         current_step_size = 2 ** log2_size0
         other.sgd_step_size0 = current_step_size
         other.sgd_step_size = current_step_size
-        other.partial_fit(X, y)
+        other.partial_fit(*partial_fit_args)
         # Hack: asgd is lower variance than sgd, but it's tuned to work
         # well asymptotically, not after just a few examples
-        weights = .5 * (other.asgd_weights + other.sgd_weights)
-        bias = .5 * (other.asgd_bias + other.sgd_bias)
+        other.asgd_weights = .5 * (other.asgd_weights + other.sgd_weights)
+        other.asgd_bias = .5 * (other.asgd_bias + other.sgd_bias)
 
-        margin = y * (np.dot(X, weights) + bias)
-        l2_cost = other.l2_regularization * (weights ** 2).sum()
-        rval = np.maximum(0, 1 - margin).mean() + l2_cost
-        _cache[log2_size0] = rval
+        X, y = partial_fit_args[:2]
+        rval = other.cost(X, y)
+        if np.isnan(rval):
+            rval = float('inf')
+        logger.info('find step %e: %e' % (current_step_size, rval))
+        _cache[float(log2_size0)] = rval
         return rval

-    best_sgd_step_size0 = optimize.brent(
-        eval_size0, brack=np.log2(initial_range), tol=tolerance)
-
-    return best_sgd_step_size0
+    if tolerance < 0.5:
+        raise NotImplementedError(
+            'tolerance too small, need adaptive stepsize')
+
+    step = -tolerance
+    x0 = np.log2(model.sgd_step_size0)
+    x1 = np.log2(model.sgd_step_size0) + step
+    y0 = eval_size0(x0)
+    y1 = eval_size0(x1)
+    if y1 > y0:
+        # -- the first move went uphill: flip direction, and swap both the
+        #    x and y values so (x1, y1) stays a consistent point
+        step *= -1
+        x0, x1 = x1, x0
+        y0, y1 = y1, y0
+
+    while y1 < y0:
+        x0, y0 = x1, y1
+        x1 = x0 + step
+        y1 = eval_size0(x1)
+
+    # I tried using sp.optimize.fmin, but this function is bumpy and we only
+    # want a really coarse estimate of the optimum, so fmin and fmin_powell
+    # end up being relatively inefficient even compared to this really stupid
+    # search.
+    #
+    # TODO: increase the stepsize every time it still goes down, and then
+    # backtrack when we over-step
+
+    rval = 2.0 ** x0
+    return rval


+# XXX: use different name, function is not specific to binary classification
 def binary_fit(
-        model, X, y,
+        model, fit_args,
         max_examples=DEFAULT_MAX_EXAMPLES,
+        step_size_floor=DEFAULT_SGD_STEP_SIZE_FLOOR,
         **find_sgd_step_size0_kwargs):
     """Returns a model with automatically-selected sgd_step_size0
 
     Parameters
     ----------
-    model: BinaryASGD
+    model: BaseASGD instance
         Instance of the model to be fitted.
 
-    X: array, shape = [n_samples, n_features]
-        Array of features
-
-    y: array, shape = [n_samples]
-        Array of labels in (-1, 1)
+    fit_args - tuple of args to model.fit
+        This method assumes they are all length-of-dataset ndarrays.
 
     max_examples: int
         Maximum number of examples to use from `X` and `y` to find an
-        estimate of the best sgd_step_size0
+        estimate of the best sgd_step_size0. N.B. that the entirety of X and
+        y is used for the final fit() call after the best step size has been
+        found.
 
     Returns
     -------
-    model: BinaryASGD
-        Instances of the model, fitted with an estimate of the best
-        sgd_step_size0
+    model: model, fitted with an estimate of the best sgd_step_size0
     """
 
-    assert X.ndim == 2
-    assert len(X) == len(y)
-    assert max_examples > 0
+    logger.info('binary_fit: design matrix shape %s' % str(fit_args[0].shape))
 
     # randomly choose up to max_examples uniformly without replacement from
     # across the whole set of training data.
-    idxs = model.rstate.permutation(len(X))[:max_examples]
+    all_idxs = model.rstate.permutation(len(fit_args[0]))
+    idxs = all_idxs[:max_examples]
 
     # Find the best learning rate for that subset
+    t0 = time.time()
     best = find_sgd_step_size0(
-        model, X[idxs], y[idxs], **find_sgd_step_size0_kwargs)
+        model, [a[idxs] for a in fit_args], **find_sgd_step_size0_kwargs)
+    logger.info('found best stepsize %e in %f seconds' % (
+        best, time.time() - t0))
 
     # Heuristic: take the best stepsize according to the first max_examples,
     # and go half that fast for the full run.
-    best_estimate = 2. ** (best - 1.0)
-    model.sgd_step_size0 = best_estimate
-    model.sgd_step_size = best_estimate
-    model.fit(X, y)
+    step_size0 = max(best / 2.0, step_size_floor)
+
+    logger.info('setting sgd_step_size: %e' % step_size0)
+    model.sgd_step_size0 = float(step_size0)
+    model.sgd_step_size = float(step_size0)
+    t0 = time.time()
+    model.fit(*fit_args)
+    logger.info('full fit took %f seconds' % (time.time() - t0))
 
     return model
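
The replacement for scipy's Brent solver above is a coarse geometric search in log2(step size): evaluate the current step size and one `tolerance`-sized move, flip direction if that first move increases the cost, then keep walking until the cost stops improving. Here is a standalone sketch of just that search logic, with a toy quadratic cost standing in for the model's cost (the function names below are illustrative, not part of the package):

    import numpy as np

    def coarse_log2_search(f, x_start, tol=1.0):
        # walk through log2(step size) in units of `tol` until f stops improving
        step = -tol
        x0, x1 = x_start, x_start + step
        y0, y1 = f(x0), f(x1)
        if y1 > y0:
            # first move went uphill: flip direction and swap the two points
            step *= -1
            x0, x1 = x1, x0
            y0, y1 = y1, y0
        while y1 < y0:
            x0, y0 = x1, y1
            x1 = x0 + step
            y1 = f(x1)
        return 2.0 ** x0  # best step size, coarse to within `tol` in log2 units

    # toy cost with a minimum at log2(step) = -4, i.e. step size 0.0625
    print(coarse_log2_search(lambda x: (x + 4.0) ** 2, x_start=0.0))

As in the diff, the returned value is the last point that improved the cost, so the estimate is only accurate to about one `tolerance` step in log2 units.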
153 changes: 153 additions & 0 deletions asgd/linsvm.py
@@ -0,0 +1,153 @@
"""
Automatic heuristic solver selection: LinearSVM

"""

import numpy as np

from .auto_step_size import binary_fit
from .naive_asgd import NaiveBinaryASGD
from .naive_asgd import NaiveRankASGD
from .naive_asgd import SparseUpdateRankASGD

try:
import sklearn.svm
except ImportError:
pass


class LinearSVM(object):
"""
SVM-fitting object that implements a heuristic for choosing
the right solver among several that may be installed in sklearn, and asgd.

"""

def __init__(self, l2_regularization, solver='auto', label_dct=None,
label_weights=None):
self.l2_regularization = l2_regularization
self.solver = solver
self.label_dct = label_dct
self.label_weights = label_weights

def fit(self, X, y, weights=None, bias=None):
solver = self.solver
label_dct = self.label_dct
l2_regularization = self.l2_regularization

if weights or bias:
raise NotImplementedError(
'Currently only train_set = (X, y) is supported')
del weights, bias
if self.label_weights:
raise NotImplementedError()

n_train, n_feats = X.shape
if self.label_dct is None:
label_dct = dict([(v, i) for (i, v) in enumerate(sorted(set(y)))])
else:
label_dct = self.label_dct
n_classes = len(label_dct)

if n_classes < 2:
raise ValueError('must be at least 2 labels')

elif n_classes == 2:
if set(y) != set([-1, 1]):
# TODO: use the label_dct to automatically adjust
raise NotImplementedError()

if solver == 'auto':
if n_feats > n_train:
solver = ('sklearn.svm.SVC', {'kernel': 'precomputed'})
else:
solver = ('asgd.NaiveBinaryASGD', {})

method, method_kwargs = solver

if method == 'asgd.NaiveBinaryASGD':
method_kwargs = dict(method_kwargs)
method_kwargs.setdefault('rstate', np.random.RandomState(123))
svm = NaiveBinaryASGD(
l2_regularization=l2_regularization,
**method_kwargs)
svm = binary_fit(svm, (X, y))

elif method == 'sklearn.svm.SVC':
C = 1.0 / (l2_regularization * len(X))
svm = sklearn.svm.SVC(C=C, scale_C=False, **method_kwargs)
raise NotImplementedError(
'save ktrn, multiply Xtst by X in predict()')
ktrn = linear_kernel(X, X)
svm.fit(ktrn, y)

else:
raise ValueError('unrecognized method', method)

else: # n_classes > 2
if set(y) != set(range(len(set(y)))):
# TODO: use the label_dct to automatically adjust
raise NotImplementedError('labels need adapting',
set(y))
if solver == 'auto':
solver = ('asgd.SparseUpdateRankASGD', {
'sgd_step_size0': 10.0 / X.shape[1],
})

method, method_kwargs = solver

if method == 'asgd.NaiveRankASGD':
method_kwargs = dict(method_kwargs)
method_kwargs.setdefault('rstate', np.random.RandomState(123))
svm = NaiveRankASGD(n_classes, n_feats,
l2_regularization=l2_regularization,
**method_kwargs)
svm = binary_fit(svm, (X, y))

elif method == 'asgd.SparseUpdateRankASGD':
method_kwargs = dict(method_kwargs)
method_kwargs.setdefault('rstate', np.random.RandomState(123))
svm = SparseUpdateRankASGD(n_classes, n_feats,
l2_regularization=l2_regularization,
**method_kwargs)
svm = binary_fit(svm, (X, y))

elif method == 'asgd.NaiveOVAASGD':
# -- one vs. all
method_kwargs = dict(method_kwargs)
method_kwargs.setdefault('rstate', np.random.RandomState(123))
svm = NaiveOVAASGD(n_classes, n_feats,
l2_regularization=l2_regularization,
**method_kwargs)
svm = binary_fit(svm, (X, y))

elif method == 'sklearn.svm.SVC':
# -- one vs. one
raise NotImplementedError(method)

elif method == 'sklearn.svm.NuSVC':
# -- one vs. one
raise NotImplementedError(method)

elif method == 'sklearn.svm.LinearSVC':
# -- Crammer & Singer multi-class
C = 1.0 / (l2_regularization * len(X))
svm = sklearn.svm.LinearSVC(
C=C,
scale_C=False,
multi_class=True,
**method_kwargs)
svm.fit(X, y)

else:
raise ValueError('unrecognized method', method)

self.svm = svm

def predict(self, *args, **kwargs):
return self.svm.predict(*args, **kwargs)

def decision_function(self, *args, **kwargs):
return self.svm.decision_function(*args, **kwargs)
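
For orientation, a hedged usage sketch of the wrapper's intended call pattern; the toy data below is made up, and binary labels must be in {-1, 1} as the fit() path requires:

    import numpy as np
    from asgd import LinearSVM

    rng = np.random.RandomState(0)
    X = rng.randn(200, 10)
    y = np.where(rng.randn(200) > 0, 1, -1)  # binary labels in {-1, 1}

    svm = LinearSVM(l2_regularization=1e-3)  # solver='auto'
    svm.fit(X, y)   # n_train > n_feats, so 'auto' picks NaiveBinaryASGD here
    print((svm.predict(X) == y).mean())      # training accuracy

Internally this path goes through binary_fit, so the SGD step size is chosen automatically on a subsample before the full fit.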

