diff --git a/.gitignore b/.gitignore
index 4af44bf..a67e6ec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 *.pyc
 *.so
 *.egg-info
+*.swp
diff --git a/asgd/__init__.py b/asgd/__init__.py
index eabbd9e..4501d95 100644
--- a/asgd/__init__.py
+++ b/asgd/__init__.py
@@ -1,3 +1,8 @@
-from naive_asgd import NaiveBinaryASGD, NaiveOVAASGD
+from naive_asgd import NaiveBinaryASGD
+from naive_asgd import NaiveOVAASGD
+from naive_asgd import NaiveRankASGD
+from naive_asgd import SparseUpdateRankASGD
 from experimental_asgd import ExperimentalBinaryASGD
+
+from linsvm import LinearSVM
 
diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py
index 3045520..15a0dd9 100644
--- a/asgd/auto_step_size.py
+++ b/asgd/auto_step_size.py
@@ -1,17 +1,19 @@
 import copy
+import logging
+import time
 import numpy as np
-from scipy import optimize
+import scipy.optimize
 
-DEFAULT_INITIAL_RANGE = 0.25, 0.5
-DEFAULT_MAX_EXAMPLES = 1000
-DEFAULT_TOLERANCE = 0.5
-DEFAULT_BRENT_OUTPUT = False
+logger = logging.getLogger(__name__)
+
+DEFAULT_MAX_EXAMPLES = 1000  # estimate stepsize from this many examples
+DEFAULT_TOLERANCE = 1.0  # in log-2 units of the learning rate
+DEFAULT_SGD_STEP_SIZE_FLOOR = 1e-7  # -- for huge feature vectors, reduce this.
 
 
 def find_sgd_step_size0(
-        model, X, y,
-        initial_range=DEFAULT_INITIAL_RANGE,
-        tolerance=DEFAULT_TOLERANCE, brent_output=DEFAULT_BRENT_OUTPUT):
+        model, partial_fit_args,
+        tolerance=DEFAULT_TOLERANCE):
     """Use a Brent line search to find the best step size
 
     Parameters
@@ -19,98 +21,121 @@ def find_sgd_step_size0(
     model: BinaryASGD
         Instance of a BinaryASGD
 
-    X: array, shape = [n_samples, n_features]
-        Array of features
-
-    y: array, shape = [n_samples]
-        Array of labels in (-1, 1)
-
-    initial_range: tuple of float
-        Initial range for the sgd_step_size0 search (low, high)
+    partial_fit_args - tuple of arguments for model.partial_fit.
+        This tuple must start with X, y, ...
 
-    max_iterations:
-        Maximum number of interations
+    tolerance: in logarithmic step size units
 
     Returns
     -------
-
     best_sgd_step_size0: float
        Optimal sgd_step_size0 given `X` and `y`.
     """
-    # -- stupid scipy calls some sizes twice!?
+    # -- stupid solver calls some sizes twice!?
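For reference, the call convention above replaces the old (model, X, y, initial_range) signature with a tuple of partial_fit arguments, and the return value is now the step size itself rather than its log2. A minimal usage sketch; the synthetic data and parameter values are illustrative and mirror the updated tests further down in this patch:

import numpy as np
from asgd import NaiveBinaryASGD
from asgd.auto_step_size import find_sgd_step_size0

# illustrative synthetic problem: 200 points, 20 features, labels in {-1, +1}
rstate = np.random.RandomState(0)
X = rstate.randn(200, 20).astype(np.float32)
y = 2 * (rstate.randn(200) > 0) - 1
X[y == 1] += 0.1

model = NaiveBinaryASGD(20,
                        sgd_step_size0=1e3,        # deliberately far too large
                        l2_regularization=1e-3,
                        rstate=np.random.RandomState(1))

# the search probes deep copies of `model`, so `model` itself is not modified;
# the result is a usable step size, not its log2
best = find_sgd_step_size0(model, (X, y))
print(best)
print(model.sgd_step_size0)                        # still 1e3
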
_cache = {} def eval_size0(log2_size0): try: - return _cache[log2_size0] + return _cache[float(log2_size0)] except KeyError: pass other = copy.deepcopy(model) current_step_size = 2 ** log2_size0 other.sgd_step_size0 = current_step_size other.sgd_step_size = current_step_size - other.partial_fit(X, y) + other.partial_fit(*partial_fit_args) # Hack: asgd is lower variance than sgd, but it's tuned to work # well asymptotically, not after just a few examples - weights = .5 * (other.asgd_weights + other.sgd_weights) - bias = .5 * (other.asgd_bias + other.sgd_bias) - - margin = y * (np.dot(X, weights) + bias) - l2_cost = other.l2_regularization * (weights ** 2).sum() - rval = np.maximum(0, 1 - margin).mean() + l2_cost - _cache[log2_size0] = rval + other.asgd_weights = .5 * (other.asgd_weights + other.sgd_weights) + other.asgd_bias = .5 * (other.asgd_bias + other.sgd_bias) + + X, y = partial_fit_args[:2] + rval = other.cost(X, y) + if np.isnan(rval): + rval = float('inf') + logger.info('find step %e: %e' % (current_step_size, rval)) + _cache[float(log2_size0)] = rval return rval - best_sgd_step_size0 = optimize.brent( - eval_size0, brack=np.log2(initial_range), tol=tolerance) - - return best_sgd_step_size0 - - + if tolerance < 0.5: + raise NotImplementedError( + 'tolerance too small, need adaptive stepsize') + + step = -tolerance + x0 = np.log2(model.sgd_step_size0) + x1 = np.log2(model.sgd_step_size0) + step + y0 = eval_size0(x0) + y1 = eval_size0(x1) + if y1 > y0: + step *= -1 + y0, y1 = y1, y0 + + while y1 < y0: + x0, y0 = x1, y1 + x1 = x0 + step + y1 = eval_size0(x1) + + # I tried using sp.optimize.fmin, but this function is bumpy and we only + # want a really coarse estimate of the optimmum, so fmin and fmin_powell + # end up being relatively inefficient even compared to this really stupid + # search. + # + # TODO: increase the stepsize every time it still goes down, and then + # backtrack when we over-step + + rval = 2.0 ** x0 + return rval + + +# XXX: use different name, function is not specific to binary classification def binary_fit( - model, X, y, + model, fit_args, max_examples=DEFAULT_MAX_EXAMPLES, + step_size_floor=DEFAULT_SGD_STEP_SIZE_FLOOR, **find_sgd_step_size0_kwargs): """Returns a model with automatically-selected sgd_step_size0 Parameters ---------- - model: BinaryASGD + model: BaseASGD instance Instance of the model to be fitted. - X: array, shape = [n_samples, n_features] - Array of features - - y: array, shape = [n_samples] - Array of labels in (-1, 1) + fit_args - tuple of args to model.fit + This method assumes they are all length-of-dataset ndarrays. max_examples: int Maximum number of examples to use from `X` and `y` to find an - estimate of the best sgd_step_size0 + estimate of the best sgd_step_size0. N.B. That the entirety of X and y + is used for the final fit() call after the best step size has been found. Returns ------- - model: BinaryASGD - Instances of the model, fitted with an estimate of the best - sgd_step_size0 + model: model, fitted with an estimate of the best sgd_step_size0 """ - assert X.ndim == 2 - assert len(X) == len(y) assert max_examples > 0 + logger.info('binary_fit: design matrix shape %s' % str(fit_args[0].shape)) # randomly choose up to max_examples uniformly without replacement from # across the whole set of training data. 
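As a usage aside for the binary_fit signature documented just above: it mutates and returns the model it is given, estimating sgd_step_size0 on a subsample before running the full fit. A short sketch under assumed synthetic data (the array sizes and seeds are illustrative; the (X, y) tuple convention matches the updated tests):

import numpy as np
from asgd import NaiveBinaryASGD
from asgd.auto_step_size import binary_fit

rstate = np.random.RandomState(42)
X = rstate.randn(1500, 20).astype(np.float32)      # illustrative data
y = 2 * (rstate.randn(1500) > 0) - 1
X[y == 1] += 0.1

clf = NaiveBinaryASGD(20, l2_regularization=1e-3,
                      rstate=np.random.RandomState(7))

# the step size is estimated on at most max_examples points, halved, floored
# at step_size_floor, and then fit() runs on the full data set
clf = binary_fit(clf, (X, y))
print(clf.sgd_step_size0)
print((clf.predict(X) == y).mean())
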
- idxs = model.rstate.permutation(len(X))[:max_examples] + all_idxs = model.rstate.permutation(len(fit_args[0])) + idxs = all_idxs[:max_examples] # Find the best learning rate for that subset + t0 = time.time() best = find_sgd_step_size0( - model, X[idxs], y[idxs], **find_sgd_step_size0_kwargs) + model, [a[idxs] for a in fit_args], **find_sgd_step_size0_kwargs) + logger.info('found best stepsize %e in %f seconds' % ( + best, time.time() - t0)) # Heuristic: take the best stepsize according to the first max_examples, # and go half that fast for the full run. - best_estimate = 2. ** (best - 1.0) - model.sgd_step_size0 = best_estimate - model.sgd_step_size = best_estimate - model.fit(X, y) + step_size0 = max(best / 2.0, step_size_floor) + + logger.info('setting sgd_step_size: %e' % step_size0) + model.sgd_step_size0 = float(step_size0) + model.sgd_step_size = float(step_size0) + t0 = time.time() + model.fit(*fit_args) + logger.info('full fit took %f seconds' % (time.time() - t0)) return model diff --git a/asgd/linsvm.py b/asgd/linsvm.py new file mode 100644 index 0000000..f06c15a --- /dev/null +++ b/asgd/linsvm.py @@ -0,0 +1,153 @@ +""" +Automatic heuristic solver selection: LinearSVM + +""" + +import numpy as np + +from .auto_step_size import binary_fit +from .naive_asgd import NaiveBinaryASGD +from .naive_asgd import NaiveRankASGD +from .naive_asgd import SparseUpdateRankASGD + +try: + import sklearn.svm +except ImportError: + pass + + +class LinearSVM(object): + """ + SVM-fitting object that implements a heuristic for choosing + the right solver among several that may be installed in sklearn, and asgd. + + """ + + def __init__(self, l2_regularization, solver='auto', label_dct=None, + label_weights=None): + self.l2_regularization = l2_regularization + self.solver = solver + self.label_dct = label_dct + self.label_weights = label_weights + + def fit(self, X, y, weights=None, bias=None): + solver = self.solver + label_dct = self.label_dct + l2_regularization = self.l2_regularization + + if weights or bias: + raise NotImplementedError( + 'Currently only train_set = (X, y) is supported') + del weights, bias + if self.label_weights: + raise NotImplementedError() + + n_train, n_feats = X.shape + if self.label_dct is None: + label_dct = dict([(v, i) for (i, v) in enumerate(sorted(set(y)))]) + else: + label_dct = self.label_dct + n_classes = len(label_dct) + + if n_classes < 2: + raise ValueError('must be at least 2 labels') + + elif n_classes == 2: + if set(y) != set([-1, 1]): + # TODO: use the label_dct to automatically adjust + raise NotImplementedError() + + if solver == 'auto': + if n_feats > n_train: + solver = ('sklearn.svm.SVC', {'kernel': 'precomputed'}) + else: + solver = ('asgd.NaiveBinaryASGD', {}) + + method, method_kwargs = solver + + if method == 'asgd.NaiveBinaryASGD': + method_kwargs = dict(method_kwargs) + method_kwargs.setdefault('rstate', np.random.RandomState(123)) + svm = NaiveBinaryASGD( + l2_regularization=l2_regularization, + **method_kwargs) + svm = binary_fit(svm, (X, y)) + + elif method == 'sklearn.svm.SVC': + C = 1.0 / (l2_regularization * len(X)) + svm = sklearn.svm.SVC(C=C, scale_C=False, **method_kwargs) + raise NotImplementedError( + 'save ktrn, multiply Xtst by X in predict()') + ktrn = linear_kernel(X, X) + svm.fit(ktrn, y) + + else: + raise ValueError('unrecognized method', method) + + else: # n_classes > 2 + if set(y) != set(range(len(set(y)))): + # TODO: use the label_dct to automatically adjust + raise NotImplementedError('labels need adapting', + set(y)) + if 
solver == 'auto': + solver = ('asgd.SparseUpdateRankASGD', { + 'sgd_step_size0': 10.0 / X.shape[1], + }) + + method, method_kwargs = solver + + if method == 'asgd.NaiveRankASGD': + method_kwargs = dict(method_kwargs) + method_kwargs.setdefault('rstate', np.random.RandomState(123)) + svm = NaiveRankASGD(n_classes, n_feats, + l2_regularization=l2_regularization, + **method_kwargs) + svm = binary_fit(svm, (X, y)) + + elif method == 'asgd.SparseUpdateRankASGD': + method_kwargs = dict(method_kwargs) + method_kwargs.setdefault('rstate', np.random.RandomState(123)) + svm = SparseUpdateRankASGD(n_classes, n_feats, + l2_regularization=l2_regularization, + **method_kwargs) + svm = binary_fit(svm, (X, y)) + + elif method == 'asgd.NaiveOVAASGD': + # -- one vs. all + method_kwargs = dict(method_kwargs) + method_kwargs.setdefault('rstate', np.random.RandomState(123)) + svm = NaiveOVAASGD(n_classes, n_feats, + l2_regularization=l2_regularization, + **method_kwargs) + svm = binary_fit(svm, (X, y)) + + elif method == 'sklearn.svm.SVC': + # -- one vs. one + raise NotImplementedError(method) + + elif method == 'sklearn.svm.NuSVC': + # -- one vs. one + raise NotImplementedError(method) + + elif method == 'sklearn.svm.LinearSVC': + # -- Crammer & Singer multi-class + C = 1.0 / (l2_regularization * len(X)) + svm = sklearn.svm.LinearSVC( + C=C, + scale_C=False, + multi_class=True, + **method_kwargs) + svm.fit(X, y) + + else: + raise ValueError('unrecognized method', method) + + self.svm = svm + + def predict(self, *args, **kwargs): + return self.svm.predict(*args, **kwargs) + + def decision_function(self, *args, **kwargs): + return self.svm.decision_function(*args, **kwargs) + + diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index 0396344..6fd0156 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -3,16 +3,21 @@ naive, non-optimized implementation """ +import sys import numpy as np from numpy import dot from itertools import izip DEFAULT_SGD_STEP_SIZE0 = 1e-2 DEFAULT_L2_REGULARIZATION = 1e-3 -DEFAULT_N_ITERATIONS = 10 +DEFAULT_MIN_OBSERVATIONS = 1000 +DEFAULT_MAX_OBSERVATIONS = sys.maxint DEFAULT_FEEDBACK = False DEFAULT_RSTATE = None DEFAULT_DTYPE = np.float32 +DEFAULT_N_PARTIAL = 5000 +DEFAULT_FIT_ABS_TOLERANCE = 1e-4 +DEFAULT_FIT_REL_TOLERANCE = 1e-2 class BaseASGD(object): @@ -20,15 +25,28 @@ class BaseASGD(object): def __init__(self, n_features, sgd_step_size0=DEFAULT_SGD_STEP_SIZE0, l2_regularization=DEFAULT_L2_REGULARIZATION, - n_iterations=DEFAULT_N_ITERATIONS, feedback=DEFAULT_FEEDBACK, - rstate=DEFAULT_RSTATE, dtype=DEFAULT_DTYPE): + min_observations=DEFAULT_MIN_OBSERVATIONS, + max_observations=DEFAULT_MAX_OBSERVATIONS, + fit_n_partial=DEFAULT_N_PARTIAL, + fit_abs_tolerance=DEFAULT_FIT_ABS_TOLERANCE, + fit_rel_tolerance=DEFAULT_FIT_REL_TOLERANCE, + feedback=DEFAULT_FEEDBACK, + rstate=DEFAULT_RSTATE, + dtype=DEFAULT_DTYPE): # -- assert n_features > 1 self.n_features = n_features - assert n_iterations > 0 - self.n_iterations = n_iterations + if not 0 <= min_observations <= max_observations: + raise ValueError('min_observations > max_observations', + (min_observations, max_observations)) + self.min_observations = min_observations + self.max_observations = max_observations + + self.fit_n_partial = fit_n_partial + self.fit_abs_tolerance = fit_abs_tolerance + self.fit_rel_tolerance = fit_rel_tolerance if feedback: raise NotImplementedError("FIXME: feedback support is buggy") @@ -51,35 +69,113 @@ def __init__(self, n_features, self.asgd_step_size0 = 1 self.asgd_step_size = self.asgd_step_size0 
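Stepping back to the new LinearSVM wrapper defined above: with solver='auto' the multiclass path dispatches to SparseUpdateRankASGD and tunes its step size via binary_fit. A minimal sketch; the three-class synthetic data and the per-class coordinate shift are illustrative, and multiclass labels must be exactly 0..n_classes-1:

import numpy as np
from asgd import LinearSVM

rstate = np.random.RandomState(0)
n_points, n_features, n_classes = 600, 20, 3
X = rstate.randn(n_points, n_features).astype(np.float32)
y = rstate.randint(n_classes, size=n_points)
for k in range(n_classes):
    X[y == k, k] += 1.0                    # shift one coordinate per class

svm = LinearSVM(l2_regularization=1e-3)    # solver='auto' by default
svm.fit(X, y)                              # note: fit() does not return self
print((svm.predict(X) == y).mean())
print(svm.decision_function(X).shape)      # (600, 3)
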
+ # -- self.n_observations = 0 + self.train_means = [] + self.recent_train_costs = [] + def fit_converged(self): + """ + There are two convergence tests here. Training is considered to be + over if it has -class NaiveBinaryASGD(BaseASGD): + * stalled: latest train_means is allclose to a previous one (*) - def __init__(self, n_features, sgd_step_size0=DEFAULT_SGD_STEP_SIZE0, - l2_regularization=DEFAULT_L2_REGULARIZATION, - n_iterations=DEFAULT_N_ITERATIONS, feedback=DEFAULT_FEEDBACK, - rstate=DEFAULT_RSTATE, dtype=DEFAULT_DTYPE): - - super(NaiveBinaryASGD, self).__init__( - n_features, - sgd_step_size0=sgd_step_size0, - l2_regularization=l2_regularization, - n_iterations=n_iterations, - feedback=feedback, - rstate=rstate, - dtype=dtype, - ) + * terminated: latest train_means is allclose to 0 - # -- - self.sgd_weights = np.zeros((n_features), dtype=dtype) - self.sgd_bias = np.zeros((1), dtype=dtype) + """ + train_means = self.train_means + rtol = self.fit_rel_tolerance + atol = self.fit_abs_tolerance + + # -- check for perfect fit + if len(train_means) > 1: + assert np.min(train_means) >= 0 + if np.allclose(train_means[-1], 0, atol=atol, rtol=rtol): + return True + + # -- check for stall condition + if len(train_means) > 2: + old_pt = max( + len(train_means) // 2, + len(train_means) - 4) + thresh = (1 - rtol) * train_means[old_pt] - atol + if train_means[-1] > thresh: + return True + + return False + + def fit(self, X, y): + + assert X.ndim == 2 + assert y.ndim == 1 + + n_points, n_features = X.shape + assert n_features == self.n_features + assert n_points == y.size + + n_points_remaining = self.max_observations - self.n_observations + + while n_points_remaining > 0: - self.asgd_weights = np.zeros((n_features), dtype=dtype) - self.asgd_bias = np.zeros((1), dtype=dtype) + # -- every iteration will train from n_partial observations and + # then check for convergence + fit_n_partial = min(n_points_remaining, self.fit_n_partial) + + idx = self.rstate.permutation(n_points) + Xb = X[idx[:fit_n_partial]] + yb = y[idx[:fit_n_partial]] + self.partial_fit(Xb, yb) + + if self.feedback: + raise NotImplementedError( + 'partial_fit logic requires memory to be distinct') + self.sgd_weights = self.asgd_weights + self.sgd_bias = self.asgd_bias + + if (self.n_observations >= self.min_observations + and self.fit_converged()): + break + + n_points_remaining -= len(Xb) + + return self + + +class BaseBinaryASGD(BaseASGD): + + def __init__(self, *args, **kwargs): + BaseASGD.__init__(self, *args, **kwargs) + + self.sgd_weights = np.zeros((self.n_features,), dtype=self.dtype) + self.sgd_bias = np.asarray(0, dtype=self.dtype) + + self.asgd_weights = np.zeros((self.n_features,), dtype=self.dtype) + self.asgd_bias = np.asarray(0, dtype=self.dtype) + + def decision_function(self, X): + X = np.asarray(X) + return dot(self.asgd_weights, X.T) + self.asgd_bias + + def predict(self, X): + return np.sign(self.decision_function(X)) + + def cost(self, X, y): + weights = self.asgd_weights + bias = self.asgd_bias + margin = y * (np.dot(X, weights) + bias) + l2_cost = self.l2_regularization * (weights ** 2).sum() + hinge_loss = np.maximum(0, 1 - margin) + rval = hinge_loss.mean() + l2_cost + return rval + + +class NaiveBinaryASGD(BaseBinaryASGD): def partial_fit(self, X, y): + assert np.all(y**2 == 1) + sgd_step_size0 = self.sgd_step_size0 sgd_step_size = self.sgd_step_size sgd_step_size_scheduling_exponent = \ @@ -96,6 +192,8 @@ def partial_fit(self, X, y): l2_regularization = self.l2_regularization n_observations = 
self.n_observations + train_means = self.train_means + recent_train_costs = self.recent_train_costs for obs, label in izip(X, y): @@ -110,6 +208,9 @@ def partial_fit(self, X, y): sgd_weights += sgd_step_size * label * obs sgd_bias += sgd_step_size * label + recent_train_costs.append(1 - float(margin)) + else: + recent_train_costs.append(0) # -- update asgd asgd_weights = (1 - asgd_step_size) * asgd_weights \ @@ -126,6 +227,12 @@ def partial_fit(self, X, y): sgd_step_size_scheduling_exponent) asgd_step_size = 1. / n_observations + if len(recent_train_costs) == self.fit_n_partial: + train_means.append(np.mean(recent_train_costs) + + l2_regularization * np.dot( + self.asgd_weights, self.asgd_weights)) + self.recent_train_costs = recent_train_costs = [] + # -- self.sgd_weights = sgd_weights self.sgd_bias = sgd_bias @@ -139,69 +246,126 @@ def partial_fit(self, X, y): return self - def fit(self, X, y): - assert X.ndim == 2 - assert y.ndim == 1 +class BaseMultiASGD(BaseASGD): + """ + Allocate weight vectors for one-vs-all multiclass SVM + """ + def __init__(self, n_classes, n_features, *args, **kwargs): + BaseASGD.__init__(self, n_features, *args, **kwargs) - n_points, n_features = X.shape - assert n_features == self.n_features - assert n_points == y.size + # -- + self.n_classes = n_classes - n_iterations = self.n_iterations + dtype = self.dtype + # -- + self.sgd_weights = np.zeros((n_features, n_classes), dtype=dtype) + self.sgd_bias = np.zeros(n_classes, dtype=dtype) + self.asgd_weights = np.zeros((n_features, n_classes), dtype=dtype) + self.asgd_bias = np.zeros(n_classes, dtype=dtype) - for i in xrange(n_iterations): + def decision_function(self, X): + X = np.asarray(X) + return dot(X, self.asgd_weights) + self.asgd_bias - idx = self.rstate.permutation(n_points) - Xb = X[idx] - yb = y[idx] - self.partial_fit(Xb, yb) + def predict(self, X): + return self.decision_function(X).argmax(axis=1) - if self.feedback: - self.sgd_weights = self.asgd_weights - self.sgd_bias = self.asgd_bias - return self +class NaiveOVAASGD(BaseMultiASGD): - def decision_function(self, X): - return dot(self.asgd_weights, X.T) + self.asgd_bias + def partial_fit(self, X, y): - def predict(self, X): - return np.sign(self.decision_function(X)) + if set(y) > set(range(self.n_classes)): + raise ValueError("Invalid 'y'") + + sgd_step_size0 = self.sgd_step_size0 + sgd_step_size = self.sgd_step_size + sgd_step_size_scheduling_exponent = \ + self.sgd_step_size_scheduling_exponent + sgd_step_size_scheduling_multiplier = \ + self.sgd_step_size_scheduling_multiplier + sgd_weights = self.sgd_weights + sgd_bias = self.sgd_bias + asgd_weights = self.asgd_weights + asgd_bias = self.asgd_bias + asgd_step_size = self.asgd_step_size -class NaiveOVAASGD(BaseASGD): + l2_regularization = self.l2_regularization - def __init__(self, n_classes, n_features, - sgd_step_size0=DEFAULT_SGD_STEP_SIZE0, - l2_regularization=DEFAULT_L2_REGULARIZATION, - n_iterations=DEFAULT_N_ITERATIONS, - feedback=DEFAULT_FEEDBACK, - rstate=DEFAULT_RSTATE, - dtype=DEFAULT_DTYPE): + n_observations = self.n_observations + n_classes = self.n_classes - super(NaiveOVAASGD, self).__init__( - n_features, - sgd_step_size0=sgd_step_size0, - l2_regularization=l2_regularization, - n_iterations=n_iterations, - feedback=feedback, - rstate=rstate, - dtype=dtype, - ) + train_means = self.train_means + recent_train_costs = self.recent_train_costs - # -- - assert n_classes > 1 - self.n_classes = n_classes + for obs, label in izip(X, y): + label = 2 * (np.arange(n_classes) == 
label).astype(int) - 1 + + # -- compute margin + margin = label * (dot(obs, sgd_weights) + sgd_bias) + + # -- update sgd + if l2_regularization: + sgd_weights *= (1 - l2_regularization * sgd_step_size) + + violations = margin < 1 + + if np.any(violations): + label_violated = label[violations] + sgd_weights[:, violations] += ( + sgd_step_size + * label_violated[np.newaxis, :] + * obs[:, np.newaxis] + ) + sgd_bias[violations] += sgd_step_size * label_violated + recent_train_costs.append( + n_classes - margin[violations].sum()) + else: + recent_train_costs.append(0.0) + + # -- update asgd + asgd_weights = (1 - asgd_step_size) * asgd_weights \ + + asgd_step_size * sgd_weights + asgd_bias = (1 - asgd_step_size) * asgd_bias \ + + asgd_step_size * sgd_bias + + # -- update step_sizes + n_observations += 1 + sgd_step_size_scheduling = (1 + sgd_step_size0 * n_observations * + sgd_step_size_scheduling_multiplier) + sgd_step_size = sgd_step_size0 / \ + (sgd_step_size_scheduling ** \ + sgd_step_size_scheduling_exponent) + asgd_step_size = 1. / n_observations + + if len(recent_train_costs) == self.fit_n_partial: + train_means.append(np.mean(recent_train_costs) + + l2_regularization * np.dot( + self.asgd_weights, self.asgd_weights)) + self.recent_train_costs = recent_train_costs = [] # -- - self.sgd_weights = np.zeros((n_features, n_classes), dtype=dtype) - self.sgd_bias = np.zeros((n_classes,), dtype=dtype) - self.asgd_weights = np.zeros((n_features, n_classes), dtype=dtype) - self.asgd_bias = np.zeros((n_classes), dtype=dtype) + self.sgd_weights = sgd_weights + self.sgd_bias = sgd_bias + self.sgd_step_size = sgd_step_size - def partial_fit(self, X, y): + self.asgd_weights = asgd_weights + self.asgd_bias = asgd_bias + self.asgd_step_size = asgd_step_size + self.n_observations = n_observations + + return self + + +class NaiveRankASGD(BaseMultiASGD): + """ + Implements rank-based multiclass SVM. + """ + + def partial_fit(self, X, y): if set(y) > set(range(self.n_classes)): raise ValueError("Invalid 'y'") @@ -223,24 +387,31 @@ def partial_fit(self, X, y): n_observations = self.n_observations n_classes = self.n_classes + train_means = self.train_means + recent_train_costs = self.recent_train_costs + for obs, label in izip(X, y): - label = 2 * (np.arange(n_classes) == label).astype(int) - 1 # -- compute margin - margin = label * (dot(obs, sgd_weights) + sgd_bias) + decision = dot(obs, sgd_weights) + sgd_bias + + dsrt = np.argsort(decision) + distractor = dsrt[-2] if dsrt[-1] == label else dsrt[-1] + margin = decision[label] - decision[distractor] # -- update sgd if l2_regularization: - sgd_weights *= (1 - l2_regularization * sgd_step_size) + sgd_weights *= 1 - l2_regularization * sgd_step_size - violations = margin < 1 - label_violated = label[violations] - sgd_weights[:, violations] += ( - sgd_step_size - * label_violated[np.newaxis, :] - * obs[:, np.newaxis] - ) - sgd_bias[violations] += sgd_step_size * label_violated + if margin < 1: + winc = sgd_step_size * obs + sgd_weights[:, distractor] -= winc + sgd_weights[:, label] += winc + sgd_bias[distractor] -= sgd_step_size + sgd_bias[label] += sgd_step_size + recent_train_costs.append(1 - float(margin)) + else: + recent_train_costs.append(0.0) # -- update asgd asgd_weights = (1 - asgd_step_size) * asgd_weights \ @@ -257,6 +428,13 @@ def partial_fit(self, X, y): sgd_step_size_scheduling_exponent) asgd_step_size = 1. 
/ n_observations + if len(recent_train_costs) == self.fit_n_partial: + flat_weights = asgd_weights.flatten() + l2_term = np.dot(flat_weights, flat_weights) + train_means.append(np.mean(recent_train_costs) + + l2_regularization * l2_term) + self.recent_train_costs = recent_train_costs = [] + # -- self.sgd_weights = sgd_weights self.sgd_bias = sgd_bias @@ -270,32 +448,187 @@ def partial_fit(self, X, y): return self - def fit(self, X, y): + def cost(self, X, y): + weights = self.asgd_weights + bias = self.asgd_bias + decisions = dot(np.asarray(X), weights) + bias + margins = [] + for decision, label in zip(decisions, y): + dsrt = np.argsort(decision) + distractor = dsrt[-2] if dsrt[-1] == label else dsrt[-1] + margin = decision[label] - decision[distractor] + margins.append(margin) - assert X.ndim == 2 - assert y.ndim == 1 + flat_weights = weights.flatten() + l2_term = np.dot(flat_weights, flat_weights) + l2_cost = self.l2_regularization * l2_term + rval = np.maximum(0, 1 - np.asarray(margins)).mean() + l2_cost + return rval - n_points, n_features = X.shape - assert n_features == self.n_features - assert n_points == y.size - n_iterations = self.n_iterations +#import line_profiler +#profile = line_profiler.LineProfiler() +#import time - for i in xrange(n_iterations): - idx = self.rstate.permutation(n_points) - Xb = X[idx] - yb = y[idx] - self.partial_fit(Xb, yb) +class SparseUpdateRankASGD(BaseMultiASGD): + """ + Implements rank-based multiclass SVM. + """ - if self.feedback: - self.sgd_weights = self.asgd_weights - self.sgd_bias = self.asgd_bias + #XXX + #XXX + # TODO: implement l2-SVM here by updating + # sgd rows in proportion to hinge loss + + #@profile + def partial_fit(self, X, y): + if set(y) > set(range(self.n_classes)): + raise ValueError("Invalid 'y'") + #ttt = time.time() + sgd_step_size0 = self.sgd_step_size0 + sgd_step_size = self.sgd_step_size + sgd_step_size_scheduling_exponent = \ + self.sgd_step_size_scheduling_exponent + sgd_step_size_scheduling_multiplier = \ + self.sgd_step_size_scheduling_multiplier + sgd_weights = np.asarray(self.sgd_weights, order='F') + sgd_bias = self.sgd_bias + + asgd_weights = np.asarray(self.asgd_weights, order='F') + asgd_bias = self.asgd_bias + asgd_step_size = self.asgd_step_size + + l2_regularization = self.l2_regularization + + n_observations = self.n_observations + n_classes = self.n_classes + + train_means = self.train_means + recent_train_costs = self.recent_train_costs + + # -- the logical sgd_weight matrix is sgd_weights_scale * sgd_weights + sgd_weights_scale = 1.0 + + use_asgd_scale = True + + if use_asgd_scale: + # -- the logical asgd_weights is stored as a row-wise linear + # -- combination of asgd_weights and sgd_weights + asgd_scale = np.ones((n_classes, 2), dtype=asgd_weights.dtype) + asgd_scale[:, 1] = 0 # -- originally there is no sgd contribution + + for obs, label in izip(X, y): + + # -- compute margin + decision = sgd_weights_scale * dot(obs, sgd_weights) + sgd_bias + + dsrt = np.argsort(decision) + distractor = dsrt[-2] if dsrt[-1] == label else dsrt[-1] + margin = decision[label] - decision[distractor] + + # -- update sgd + sgd_weights_scale *= 1 - l2_regularization * sgd_step_size + + if margin < 1: + # -- perform the delayed updates to the rows of asgd + + if use_asgd_scale: + # -- XXX: this should be a single BLAS call to axpy + asgd_weights[:, distractor] *= asgd_scale[distractor, 0] + asgd_weights[:, distractor] += \ + (asgd_scale[distractor, 1] * sgd_weights_scale) \ + * sgd_weights[:, distractor] + 
asgd_scale[distractor] = [1, 0] + + # -- XXX: this should be a single BLAS call to axpy + asgd_weights[:, label] *= asgd_scale[label, 0] + asgd_weights[:, label] += \ + (asgd_scale[label, 1] * sgd_weights_scale)\ + * sgd_weights[:, label] + asgd_scale[label] = [1, 0] + + winc = (sgd_step_size / sgd_weights_scale) * obs + sgd_weights[:, distractor] -= winc + sgd_weights[:, label] += winc + sgd_bias[distractor] -= sgd_step_size + sgd_bias[label] += sgd_step_size + recent_train_costs.append(1 - float(margin)) + else: + recent_train_costs.append(0.0) + + if use_asgd_scale: + # -- update asgd via scale variables + asgd_scale *= (1 - asgd_step_size) + asgd_scale[:, 1] += asgd_step_size + else: + asgd_weights = (1 - asgd_step_size) * asgd_weights \ + + asgd_step_size * sgd_weights + + asgd_bias = (1 - asgd_step_size) * asgd_bias \ + + asgd_step_size * sgd_bias + + # -- update step_sizes + n_observations += 1 + sgd_step_size_scheduling = (1 + sgd_step_size0 * n_observations * + sgd_step_size_scheduling_multiplier) + sgd_step_size = sgd_step_size0 / \ + (sgd_step_size_scheduling ** \ + sgd_step_size_scheduling_exponent) + if n_observations <= self.fit_n_partial: + asgd_step_size = 1. + else: + asgd_step_size = 1. / (n_observations - self.fit_n_partial) + + if len(recent_train_costs) == self.fit_n_partial: + if use_asgd_scale: + asgd_weights[:] *= asgd_scale[:, 0] + asgd_weights[:] += (sgd_weights_scale * asgd_scale[:, 1]) * sgd_weights + asgd_scale[:, 0] = 1 + asgd_scale[:, 1] = 0 + + flat_weights = asgd_weights.flatten() + l2_term = np.dot(flat_weights, flat_weights) + train_means.append(np.mean(recent_train_costs) + + l2_regularization * l2_term) + self.recent_train_costs = recent_train_costs = [] + + # -- + self.sgd_weights = sgd_weights * sgd_weights_scale + self.sgd_bias = sgd_bias + self.sgd_step_size = sgd_step_size + + if use_asgd_scale: + asgd_weights[:] *= asgd_scale[:, 0] + asgd_weights[:] += (sgd_weights_scale * asgd_scale[:, 1]) * sgd_weights + asgd_scale[:, 0] = 1 + asgd_scale[:, 1] = 0 + + self.asgd_weights = asgd_weights + self.asgd_bias = asgd_bias + self.asgd_step_size = asgd_step_size + + self.n_observations = n_observations + + #profile.print_stats() + #print 'n_obs', n_observations, 'time/%i' % len(X), (time.time() - ttt) return self - def decision_function(self, X): - return dot(X, self.asgd_weights) + self.asgd_bias + def cost(self, X, y): + weights = self.asgd_weights + bias = self.asgd_bias + decisions = dot(np.asarray(X), weights) + bias + margins = [] + for decision, label in zip(decisions, y): + dsrt = np.argsort(decision) + distractor = dsrt[-2] if dsrt[-1] == label else dsrt[-1] + margin = decision[label] - decision[distractor] + margins.append(margin) + + flat_weights = weights.flatten() + l2_term = np.dot(flat_weights, flat_weights) + l2_cost = self.l2_regularization * l2_term + rval = np.maximum(0, 1 - np.asarray(margins)).mean() + l2_cost + return rval - def predict(self, X): - return self.decision_function(X).argmax(1) diff --git a/asgd/tests/test_auto_step_size.py b/asgd/tests/test_auto_step_size.py index 89799e1..1f3c672 100644 --- a/asgd/tests/test_auto_step_size.py +++ b/asgd/tests/test_auto_step_size.py @@ -1,4 +1,5 @@ from nose.tools import assert_equal +import numpy as np from numpy.testing import assert_almost_equal from numpy.testing import assert_array_equal @@ -11,11 +12,13 @@ from test_naive_asgd import get_fake_data -def get_new_model(n_features, rstate): +def get_new_model(n_features, rstate, n_points): return BinaryASGD(n_features, rstate=rstate, - 
sgd_step_size0=1e3, + sgd_step_size0=1e3, # intentionally large, binary_fit will fix l2_regularization=1e-3, - n_iterations=5) + max_observations=5 * n_points, + min_observations=5 * n_points, + ) def test_binary_sgd_step_size0(): @@ -24,13 +27,10 @@ def test_binary_sgd_step_size0(): X, y = get_fake_data(100, n_features, rstate) - clf = get_new_model(n_features, rstate) - best = find_sgd_step_size0(clf, X, y, (.25, .5)) - assert_almost_equal(best, -4.9927, decimal=4) - - # start a little lower, still works - best = find_sgd_step_size0(clf, X, y, (.125, .25)) - assert_almost_equal(best, -4.6180, decimal=4) + clf = get_new_model(n_features, rstate, 100) + best0 = find_sgd_step_size0(clf, (X, y)) + print best0 + assert np.allclose(best0, 0.04, atol=.1, rtol=.5) # find_sgd_step_size0 does not change clf assert clf.sgd_step_size0 == 1000.0 @@ -40,25 +40,15 @@ def test_binary_fit(): rstate = RandomState(42) n_features = 20 - clf100 = get_new_model(n_features, rstate) - X, y = get_fake_data(100, n_features, rstate) - _clf100 = binary_fit(clf100, X, y) - assert _clf100 is clf100 - assert_almost_equal(clf100.sgd_step_size0, 0.04812, decimal=4) - - # smoke test - clf1000 = get_new_model(n_features, rstate) - X, y = get_fake_data(DEFAULT_MAX_EXAMPLES, n_features, rstate) - _clf1000 = binary_fit(clf1000, X, y) - assert _clf1000 is clf1000 - assert_almost_equal(clf1000.sgd_step_size0, 0.0047, decimal=4) + for L in [100, DEFAULT_MAX_EXAMPLES, int(DEFAULT_MAX_EXAMPLES * 1.5), + int(DEFAULT_MAX_EXAMPLES * 3)]: - # smoke test that at least it runs - clf2000 = get_new_model(n_features, rstate) - X, y = get_fake_data(2000, n_features, rstate) - _clf2000 = binary_fit(clf2000, X, y) - assert _clf2000 == clf2000 - assert_almost_equal(clf2000.sgd_step_size0, 0.0067, decimal=4) + clf = get_new_model(n_features, rstate, L) + X, y = get_fake_data(L, n_features, rstate, separation=0.1) + best = find_sgd_step_size0(clf, (X, y)) + _clf = binary_fit(clf, (X, y)) + assert _clf is clf + assert 0 < clf.sgd_step_size0 <= best def test_fit_replicable(): @@ -67,11 +57,11 @@ def test_fit_replicable(): X, y = get_fake_data(100, n_features, RandomState(4)) - m0 = get_new_model(n_features, RandomState(45)) - m0 = binary_fit(m0, X, y) + m0 = get_new_model(n_features, RandomState(45), 100) + m0 = binary_fit(m0, (X, y)) - m1 = get_new_model(n_features, RandomState(45)) - m1 = binary_fit(m1, X, y) + m1 = get_new_model(n_features, RandomState(45), 100) + m1 = binary_fit(m1, (X, y)) assert_array_equal(m0.sgd_weights, m1.sgd_weights) assert_array_equal(m0.sgd_bias, m1.sgd_bias) diff --git a/asgd/tests/test_naive_asgd.py b/asgd/tests/test_naive_asgd.py index 0900f87..6826373 100644 --- a/asgd/tests/test_naive_asgd.py +++ b/asgd/tests/test_naive_asgd.py @@ -1,3 +1,4 @@ +import sys from nose.tools import assert_equal, raises from nose.plugins.skip import SkipTest from numpy.testing import assert_allclose @@ -16,14 +17,15 @@ DEFAULT_ARGS = (N_FEATURES,) DEFAULT_KWARGS = dict(sgd_step_size0=1e-3, l2_regularization=1e-6, - n_iterations=4, + min_observations=4 * N_POINTS, + max_observations=4 * N_POINTS, dtype=np.float32) -def get_fake_data(n_points, n_features, rstate): +def get_fake_data(n_points, n_features, rstate, separation=1e-1): X = rstate.randn(n_points, n_features).astype(np.float32) y = 2 * (rstate.randn(n_points) > 0) - 1 - X[y == 1] += 1e-1 + X[y == 1] += separation return X, y @@ -41,7 +43,6 @@ def get_fake_multiclass_data(n_points, n_features, n_classes, rstate): def test_naive_asgd(): - rstate = RandomState(42) X, y = 
get_fake_data(N_POINTS, N_FEATURES, rstate) @@ -50,6 +51,7 @@ def test_naive_asgd(): clf = BinaryASGD(*DEFAULT_ARGS, rstate=rstate, **DEFAULT_KWARGS) clf.fit(X, y) + assert clf.n_observations == clf.min_observations == clf.max_observations ytrn_preds = clf.predict(X) ytst_preds = clf.predict(Xtst) ytrn_acc = (ytrn_preds == y).mean() @@ -78,7 +80,6 @@ def test_naive_asgd_with_feedback(): def test_naive_asgd_multi_labels(): - rstate = RandomState(44) X, y = get_fake_binary_data_multi_labels(N_POINTS, N_FEATURES, rstate) @@ -87,7 +88,10 @@ def test_naive_asgd_multi_labels(): # n_classes is 2 since it is actually a binary case clf = OVAASGD(2, N_FEATURES, sgd_step_size0=1e-3, l2_regularization=1e-6, - n_iterations=4, dtype=np.float32, rstate=rstate) + min_observations=4 * N_POINTS, + max_observations=4 * N_POINTS, + dtype=np.float32, + rstate=rstate) clf.fit(X, y) ytrn_preds = clf.predict(X) ytst_preds = clf.predict(Xtst) @@ -98,7 +102,6 @@ def test_naive_asgd_multi_labels(): def test_naive_multiclass_ova_asgd(): - rstate = RandomState(45) n_classes = 10 @@ -108,7 +111,10 @@ def test_naive_multiclass_ova_asgd(): rstate) clf = OVAASGD(n_classes, N_FEATURES, sgd_step_size0=1e-3, - l2_regularization=1e-6, n_iterations=4, dtype=np.float32, + l2_regularization=1e-6, + min_observations=4 * N_POINTS, + max_observations=4 * N_POINTS, + dtype=np.float32, rstate=rstate) clf.fit(X, y) @@ -123,7 +129,6 @@ def test_naive_multiclass_ova_asgd(): def test_naive_multiclass_ova_vs_binary_asgd(): - rstate = RandomState(42) n_classes = 3 @@ -162,7 +167,6 @@ def test_naive_multiclass_ova_vs_binary_asgd(): @raises(ValueError) def test_naive_ova_asgd_wrong_labels(): - rstate = RandomState(42) n_classes = 10 @@ -174,3 +178,34 @@ def test_naive_ova_asgd_wrong_labels(): rstate=RandomState(999), **DEFAULT_KWARGS) ytrn_bad = rstate.randint(n_classes + 42, size=len(ytrn)) clf.partial_fit(Xtrn, ytrn_bad) + + +def test_early_stopping(): + rstate = RandomState(42) + + for N_POINTS in (1000, 2000): + + X, y = get_fake_data(N_POINTS, N_FEATURES, rstate) + Xtst, ytst = get_fake_data(N_POINTS, N_FEATURES, rstate) + + kwargs = dict(DEFAULT_KWARGS) + kwargs['fit_n_partial'] = 1500 + del kwargs['max_observations'] + print kwargs + print 'N_POINTS', N_POINTS + + clf = BinaryASGD(*DEFAULT_ARGS, rstate=rstate, **kwargs) + + clf.fit(X, y) + print clf.fit_n_partial + print len(clf.train_means) + print clf.n_observations + assert clf.fit_n_partial == kwargs['fit_n_partial'] + assert len(clf.train_means) == clf.n_observations / clf.fit_n_partial + assert clf.min_observations == kwargs['min_observations'] + assert clf.max_observations == sys.maxint + assert clf.n_observations >= kwargs['min_observations'] + assert clf.fit_converged() + ytrn_preds = clf.predict(X) + ytrn_acc = (ytrn_preds == y).mean() + assert ytrn_acc > 0.7
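
Beyond the tests, the same early-stopping machinery can be configured directly on a model; a small sketch following test_early_stopping above, with illustrative data and parameter choices:

import numpy as np
from asgd import NaiveBinaryASGD

rstate = np.random.RandomState(42)
X = rstate.randn(2000, 20).astype(np.float32)
y = 2 * (rstate.randn(2000) > 0) - 1
X[y == 1] += 0.1

clf = NaiveBinaryASGD(20,
                      sgd_step_size0=1e-3,
                      l2_regularization=1e-6,
                      fit_n_partial=1500,          # observations between convergence checks
                      min_observations=4 * 2000,   # never stop before this many
                      rstate=np.random.RandomState(1))

# max_observations defaults to sys.maxint, so fit() keeps drawing random
# batches of fit_n_partial examples until the running hinge + L2 estimate
# recorded in train_means either stalls or reaches ~0 (see fit_converged)
clf.fit(X, y)
print(clf.n_observations)
print(len(clf.train_means))
print(clf.fit_converged())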