Early stopping #21

Open · wants to merge 18 commits into master
Changes from all commits
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
 *.pyc
 *.so
 *.egg-info
+*.swp
7 changes: 6 additions & 1 deletion asgd/__init__.py
@@ -1,3 +1,8 @@
-from naive_asgd import NaiveBinaryASGD, NaiveOVAASGD
+from naive_asgd import NaiveBinaryASGD
+from naive_asgd import NaiveOVAASGD
+from naive_asgd import NaiveRankASGD
+from naive_asgd import SparseUpdateRankASGD
 
 from experimental_asgd import ExperimentalBinaryASGD
+
+from linsvm import LinearSVM
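
These new exports make the rank-based estimators and the LinearSVM wrapper importable from the package root. As a quick check, a minimal import smoke test (assuming the package is installed as `asgd`):

    from asgd import NaiveRankASGD, SparseUpdateRankASGD, LinearSVM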
133 changes: 79 additions & 54 deletions asgd/auto_step_size.py
@@ -1,116 +1,141 @@
 import copy
+import logging
+import time
 import numpy as np
-from scipy import optimize
+import scipy.optimize
 
-DEFAULT_INITIAL_RANGE = 0.25, 0.5
-DEFAULT_MAX_EXAMPLES = 1000
-DEFAULT_TOLERANCE = 0.5
-DEFAULT_BRENT_OUTPUT = False
+logger = logging.getLogger(__name__)
+
+DEFAULT_MAX_EXAMPLES = 1000  # estimate stepsize from this many examples
+DEFAULT_TOLERANCE = 1.0  # in log-2 units of the learning rate
+DEFAULT_SGD_STEP_SIZE_FLOOR = 1e-7  # -- for huge feature vectors, reduce this.
 
 
 def find_sgd_step_size0(
-        model, X, y,
-        initial_range=DEFAULT_INITIAL_RANGE,
-        tolerance=DEFAULT_TOLERANCE, brent_output=DEFAULT_BRENT_OUTPUT):
+        model, partial_fit_args,
+        tolerance=DEFAULT_TOLERANCE):
     """Use a Brent line search to find the best step size
 
     Parameters
     ----------
     model: BinaryASGD
         Instance of a BinaryASGD
 
-    X: array, shape = [n_samples, n_features]
-        Array of features
-
-    y: array, shape = [n_samples]
-        Array of labels in (-1, 1)
-
-    initial_range: tuple of float
-        Initial range for the sgd_step_size0 search (low, high)
+    partial_fit_args - tuple of arguments for model.partial_fit.
+        This tuple must start with X, y, ...
 
-    max_iterations:
-        Maximum number of interations
+    tolerance: in logarithmic step size units
 
     Returns
     -------
     best_sgd_step_size0: float
         Optimal sgd_step_size0 given `X` and `y`.
     """
-    # -- stupid scipy calls some sizes twice!?
+    # -- stupid solver calls some sizes twice!?
     _cache = {}
 
     def eval_size0(log2_size0):
         try:
-            return _cache[log2_size0]
+            return _cache[float(log2_size0)]
         except KeyError:
             pass
         other = copy.deepcopy(model)
         current_step_size = 2 ** log2_size0
         other.sgd_step_size0 = current_step_size
         other.sgd_step_size = current_step_size
-        other.partial_fit(X, y)
+        other.partial_fit(*partial_fit_args)
         # Hack: asgd is lower variance than sgd, but it's tuned to work
         # well asymptotically, not after just a few examples
-        weights = .5 * (other.asgd_weights + other.sgd_weights)
-        bias = .5 * (other.asgd_bias + other.sgd_bias)
+        other.asgd_weights = .5 * (other.asgd_weights + other.sgd_weights)
+        other.asgd_bias = .5 * (other.asgd_bias + other.sgd_bias)
 
-        margin = y * (np.dot(X, weights) + bias)
-        l2_cost = other.l2_regularization * (weights ** 2).sum()
-        rval = np.maximum(0, 1 - margin).mean() + l2_cost
-        _cache[log2_size0] = rval
+        X, y = partial_fit_args[:2]
+        rval = other.cost(X, y)
+        if np.isnan(rval):
+            rval = float('inf')
+        logger.info('find step %e: %e' % (current_step_size, rval))
+        _cache[float(log2_size0)] = rval
         return rval

-    best_sgd_step_size0 = optimize.brent(
-        eval_size0, brack=np.log2(initial_range), tol=tolerance)
-
-    return best_sgd_step_size0
+    if tolerance < 0.5:
+        raise NotImplementedError(
+            'tolerance too small, need adaptive stepsize')
+
+    step = -tolerance
+    x0 = np.log2(model.sgd_step_size0)
+    x1 = np.log2(model.sgd_step_size0) + step
+    y0 = eval_size0(x0)
+    y1 = eval_size0(x1)
+    if y1 > y0:
+        # -- the first move went uphill: flip direction, and swap both the
+        #    x and y values so (x1, y1) stays a consistent point
+        step *= -1
+        x0, x1 = x1, x0
+        y0, y1 = y1, y0
+
+    while y1 < y0:
+        x0, y0 = x1, y1
+        x1 = x0 + step
+        y1 = eval_size0(x1)
+
+    # I tried using sp.optimize.fmin, but this function is bumpy and we only
+    # want a really coarse estimate of the optimum, so fmin and fmin_powell
+    # end up being relatively inefficient even compared to this really stupid
+    # search.
+    #
+    # TODO: increase the stepsize every time it still goes down, and then
+    # backtrack when we over-step
+
+    rval = 2.0 ** x0
+    return rval


+# XXX: use different name, function is not specific to binary classification
 def binary_fit(
-        model, X, y,
+        model, fit_args,
         max_examples=DEFAULT_MAX_EXAMPLES,
+        step_size_floor=DEFAULT_SGD_STEP_SIZE_FLOOR,
         **find_sgd_step_size0_kwargs):
     """Returns a model with automatically-selected sgd_step_size0
 
     Parameters
     ----------
-    model: BinaryASGD
+    model: BaseASGD instance
         Instance of the model to be fitted.
 
-    X: array, shape = [n_samples, n_features]
-        Array of features
-
-    y: array, shape = [n_samples]
-        Array of labels in (-1, 1)
+    fit_args - tuple of args to model.fit
+        This method assumes they are all length-of-dataset ndarrays.
 
     max_examples: int
         Maximum number of examples to use from `X` and `y` to find an
-        estimate of the best sgd_step_size0
+        estimate of the best sgd_step_size0. N.B. that the entirety of X and
+        y is used for the final fit() call after the best step size has been
+        found.
 
     Returns
     -------
-    model: BinaryASGD
-        Instances of the model, fitted with an estimate of the best
-        sgd_step_size0
+    model: model, fitted with an estimate of the best sgd_step_size0
     """
 
-    assert X.ndim == 2
-    assert len(X) == len(y)
-    assert max_examples > 0
+    logger.info('binary_fit: design matrix shape %s' % str(fit_args[0].shape))
 
     # randomly choose up to max_examples uniformly without replacement from
     # across the whole set of training data.
-    idxs = model.rstate.permutation(len(X))[:max_examples]
+    all_idxs = model.rstate.permutation(len(fit_args[0]))
+    idxs = all_idxs[:max_examples]
 
     # Find the best learning rate for that subset
+    t0 = time.time()
     best = find_sgd_step_size0(
-        model, X[idxs], y[idxs], **find_sgd_step_size0_kwargs)
+        model, [a[idxs] for a in fit_args], **find_sgd_step_size0_kwargs)
+    logger.info('found best stepsize %e in %f seconds' % (
+        best, time.time() - t0))
 
     # Heuristic: take the best stepsize according to the first max_examples,
     # and go half that fast for the full run.
-    best_estimate = 2. ** (best - 1.0)
-    model.sgd_step_size0 = best_estimate
-    model.sgd_step_size = best_estimate
-    model.fit(X, y)
+    step_size0 = max(best / 2.0, step_size_floor)
+
+    logger.info('setting sgd_step_size: %e' % step_size0)
+    model.sgd_step_size0 = float(step_size0)
+    model.sgd_step_size = float(step_size0)
+    t0 = time.time()
+    model.fit(*fit_args)
+    logger.info('full fit took %f seconds' % (time.time() - t0))
 
     return model
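
The replacement for scipy's Brent solver above is a coarse geometric search in log2(step size): evaluate the current step size and one `tolerance`-sized move, flip direction if that first move increases the cost, then keep walking until the cost stops improving. Here is a standalone sketch of just that search logic, with a toy quadratic cost standing in for the model's cost (the function names below are illustrative, not part of the package):

    import numpy as np

    def coarse_log2_search(f, x_start, tol=1.0):
        # walk through log2(step size) in units of `tol` until f stops improving
        step = -tol
        x0, x1 = x_start, x_start + step
        y0, y1 = f(x0), f(x1)
        if y1 > y0:
            # first move went uphill: flip direction and swap the two points
            step *= -1
            x0, x1 = x1, x0
            y0, y1 = y1, y0
        while y1 < y0:
            x0, y0 = x1, y1
            x1 = x0 + step
            y1 = f(x1)
        return 2.0 ** x0  # best step size, coarse to within `tol` in log2 units

    # toy cost with a minimum at log2(step) = -4, i.e. step size 0.0625
    print(coarse_log2_search(lambda x: (x + 4.0) ** 2, x_start=0.0))

As in the diff, the returned value is the last point that improved the cost, so the estimate is only accurate to about one `tolerance` step in log2 units.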
153 changes: 153 additions & 0 deletions asgd/linsvm.py
@@ -0,0 +1,153 @@
"""
Automatic heuristic solver selection: LinearSVM

"""

import numpy as np

from .auto_step_size import binary_fit
from .naive_asgd import NaiveBinaryASGD
from .naive_asgd import NaiveRankASGD
from .naive_asgd import SparseUpdateRankASGD

try:
import sklearn.svm
except ImportError:
pass


class LinearSVM(object):
"""
SVM-fitting object that implements a heuristic for choosing
the right solver among several that may be installed in sklearn, and asgd.

"""

def __init__(self, l2_regularization, solver='auto', label_dct=None,
label_weights=None):
self.l2_regularization = l2_regularization
self.solver = solver
self.label_dct = label_dct
self.label_weights = label_weights

def fit(self, X, y, weights=None, bias=None):
solver = self.solver
label_dct = self.label_dct
l2_regularization = self.l2_regularization

if weights or bias:
raise NotImplementedError(
'Currently only train_set = (X, y) is supported')
del weights, bias
if self.label_weights:
raise NotImplementedError()

n_train, n_feats = X.shape
if self.label_dct is None:
label_dct = dict([(v, i) for (i, v) in enumerate(sorted(set(y)))])
else:
label_dct = self.label_dct
n_classes = len(label_dct)

if n_classes < 2:
raise ValueError('must be at least 2 labels')

elif n_classes == 2:
if set(y) != set([-1, 1]):
# TODO: use the label_dct to automatically adjust
raise NotImplementedError()

if solver == 'auto':
if n_feats > n_train:
solver = ('sklearn.svm.SVC', {'kernel': 'precomputed'})
else:
solver = ('asgd.NaiveBinaryASGD', {})

method, method_kwargs = solver

if method == 'asgd.NaiveBinaryASGD':
method_kwargs = dict(method_kwargs)
method_kwargs.setdefault('rstate', np.random.RandomState(123))
svm = NaiveBinaryASGD(
l2_regularization=l2_regularization,
**method_kwargs)
svm = binary_fit(svm, (X, y))

elif method == 'sklearn.svm.SVC':
C = 1.0 / (l2_regularization * len(X))
svm = sklearn.svm.SVC(C=C, scale_C=False, **method_kwargs)
raise NotImplementedError(
'save ktrn, multiply Xtst by X in predict()')
ktrn = linear_kernel(X, X)
svm.fit(ktrn, y)

else:
raise ValueError('unrecognized method', method)

else: # n_classes > 2
if set(y) != set(range(len(set(y)))):
# TODO: use the label_dct to automatically adjust
raise NotImplementedError('labels need adapting',
set(y))
if solver == 'auto':
solver = ('asgd.SparseUpdateRankASGD', {
'sgd_step_size0': 10.0 / X.shape[1],
})

method, method_kwargs = solver

if method == 'asgd.NaiveRankASGD':
method_kwargs = dict(method_kwargs)
method_kwargs.setdefault('rstate', np.random.RandomState(123))
svm = NaiveRankASGD(n_classes, n_feats,
l2_regularization=l2_regularization,
**method_kwargs)
svm = binary_fit(svm, (X, y))

elif method == 'asgd.SparseUpdateRankASGD':
method_kwargs = dict(method_kwargs)
method_kwargs.setdefault('rstate', np.random.RandomState(123))
svm = SparseUpdateRankASGD(n_classes, n_feats,
l2_regularization=l2_regularization,
**method_kwargs)
svm = binary_fit(svm, (X, y))

elif method == 'asgd.NaiveOVAASGD':
# -- one vs. all
method_kwargs = dict(method_kwargs)
method_kwargs.setdefault('rstate', np.random.RandomState(123))
svm = NaiveOVAASGD(n_classes, n_feats,
l2_regularization=l2_regularization,
**method_kwargs)
svm = binary_fit(svm, (X, y))

elif method == 'sklearn.svm.SVC':
# -- one vs. one
raise NotImplementedError(method)

elif method == 'sklearn.svm.NuSVC':
# -- one vs. one
raise NotImplementedError(method)

elif method == 'sklearn.svm.LinearSVC':
# -- Crammer & Singer multi-class
C = 1.0 / (l2_regularization * len(X))
svm = sklearn.svm.LinearSVC(
C=C,
scale_C=False,
multi_class=True,
**method_kwargs)
svm.fit(X, y)

else:
raise ValueError('unrecognized method', method)

self.svm = svm

def predict(self, *args, **kwargs):
return self.svm.predict(*args, **kwargs)

def decision_function(self, *args, **kwargs):
return self.svm.decision_function(*args, **kwargs)
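
For orientation, a hedged usage sketch of the wrapper's intended call pattern; the toy data below is made up, and binary labels must be in {-1, 1} as the fit() path requires:

    import numpy as np
    from asgd import LinearSVM

    rng = np.random.RandomState(0)
    X = rng.randn(200, 10)
    y = np.where(rng.randn(200) > 0, 1, -1)  # binary labels in {-1, 1}

    svm = LinearSVM(l2_regularization=1e-3)  # solver='auto'
    svm.fit(X, y)   # n_train > n_feats, so 'auto' picks NaiveBinaryASGD here
    print((svm.predict(X) == y).mean())      # training accuracy

Internally this path goes through binary_fit, so the SGD step size is chosen automatically on a subsample before the full fit.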

