Skip to content

Commit

Permalink
Added some safety checks for subsamplers.
Browse files Browse the repository at this point in the history
Also, added `states_` attribute to TI estimator.
  • Loading branch information
dotsdl committed May 18, 2017
1 parent 30f2fed commit 6dd8eb5
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 33 deletions.
6 changes: 6 additions & 0 deletions src/alchemlyb/estimators/ti_.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,14 @@ class TI(BaseEstimator):
delta_f_ : DataFrame
The estimated dimensionless free energy difference between each state.
d_delta_f_ : DataFrame
The estimated statistical uncertainty (one standard deviation) in
dimensionless free energy differences.
states_ : list
Lambda states for which free energy differences were obtained.
"""

def __init__(self, verbose=False):
Expand Down Expand Up @@ -81,4 +85,6 @@ def fit(self, dHdl):
columns=variances.index.values,
index=variances.index.values)

self.states_ = means.index.values.tolist()

return self
62 changes: 43 additions & 19 deletions src/alchemlyb/preprocessing/subsampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,49 @@
from pymbar.timeseries import detectEquilibration


def slicing(df, lower=None, upper=None, step=None):
def _check_multiple_times(df):
return df.sort_index(0).reset_index(0).duplicated('time').any()


def _check_sorted(df):
return df.reset_index(0)['time'].is_monotonic_increasing


def slicing(df, lower=None, upper=None, step=None, force=False):
    """Subsample a DataFrame using simple slicing.

    Parameters
    ----------
    df : DataFrame
        DataFrame to subsample.
    lower : float
        Lower time to slice from.
    upper : float
        Upper time to slice to (inclusive).
    step : int
        Step between rows to slice by.
    force : bool
        Ignore checks that DataFrame is in proper form for expected behavior.

    Returns
    -------
    DataFrame
        `df` subsampled.

    Raises
    ------
    KeyError
        If the rows are not sorted by time, or (unless ``force=True``) if
        duplicate time values are present.

    """
    # label-based slicing only behaves predictably on an index sorted by
    # time; .loc raises KeyError itself when the slice cannot be resolved
    try:
        df = df.loc[lower:upper:step]
    except KeyError:
        raise KeyError("DataFrame rows must be sorted by time, increasing.")

    if not force and _check_multiple_times(df):
        raise KeyError("Duplicate time values found; it's generally advised "
                       "to use slicing on DataFrames with unique time values "
                       "for each row. Use `force=True` to ignore this error.")

    # drop any rows that have missing values
    df = df.dropna()

    return df


Expand Down Expand Up @@ -68,14 +83,17 @@ def statistical_inefficiency(df, series=None, lower=None, upper=None, step=None)
pymbar.timeseries.statisticalInefficiency : detailed background
"""
if series is not None:
series = series.loc[lower:upper]
if _check_multiple_times(df):
raise KeyError("Duplicate time values found; statistical inefficiency "
"only works on a single, contiguous, "
"and sorted timeseries.")

# drop any rows that have missing values
series = series.dropna()
if not _check_sorted(df):
raise KeyError("Statistical inefficiency only works as expected if "
"values are sorted by time, increasing.")

# subsample according to step
series = series.iloc[::step]
if series is not None:
series = slicing(series, lower=lower, upper=upper, step=step)

# calculate statistical inefficiency of series
statinef = statisticalInefficiency(series)
Expand Down Expand Up @@ -123,14 +141,20 @@ def equilibrium_detection(df, series=None, lower=None, upper=None, step=None):
pymbar.timeseries.detectEquilibration : detailed background
"""
if series is not None:
series = series.loc[lower:upper]
if _check_multiple_times(df):
raise KeyError("Duplicate time values found; equilibrium detection "
"is only meaningful for a single, contiguous, "
"and sorted timeseries.")

if not _check_sorted(df):
raise KeyError("Equilibrium detection only works as expected if "
"values are sorted by time, increasing.")

# drop any rows that have missing values
series = series.dropna()
if series is not None:
series = slicing(series, lower=lower, upper=upper, step=step)

# subsample according to step
series = series.iloc[::step]
# calculate statistical inefficiency of series
statinef = statisticalInefficiency(series)

# calculate statistical inefficiency of series, with equilibrium detection
t, statinef, Neff_max = detectEquilibration(series.values)
Expand Down
67 changes: 53 additions & 14 deletions src/alchemlyb/tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

import pandas as pd
import numpy as np

from alchemlyb.parsing import gmx
from alchemlyb.preprocessing import slicing
Expand All @@ -23,6 +24,15 @@ def gmx_benzene_u_nk():
return gmx.extract_u_nk(dataset['data']['Coulomb'][0], T=300)


def gmx_benzene_dHdl_full():
    """Concatenated dHdl frames for every Coulomb leg of the benzene dataset."""
    dataset = alchemtest.gmx.load_benzene()
    frames = [gmx.extract_dHdl(xvg, T=300) for xvg in dataset['data']['Coulomb']]
    return pd.concat(frames)


def gmx_benzene_u_nk_full():
    """Concatenated u_nk frames for every Coulomb leg of the benzene dataset."""
    dataset = alchemtest.gmx.load_benzene()
    frames = [gmx.extract_u_nk(xvg, T=300) for xvg in dataset['data']['Coulomb']]
    return pd.concat(frames)

class TestSlicing:
"""Test slicing functionality.
Expand All @@ -35,30 +45,59 @@ def slicer(self, *args, **kwargs):
def test_basic_slicing(self, data, size):
assert len(self.slicer(data, lower=1000, upper=34000, step=5)) == size

@pytest.mark.parametrize('data', [gmx_benzene_dHdl(),
gmx_benzene_u_nk()])
def test_disordered_exception(self, data):
"""Test that a shuffled DataFrame yields a KeyError.
class TestStatisticalInefficiency(TestSlicing):
"""
indices = np.arange(len(data))
np.random.shuffle(indices)

def slicer(self, *args, **kwargs):
return statistical_inefficiency(*args, **kwargs)
df = data.iloc[indices]

with pytest.raises(KeyError):
self.slicer(df, lower=200)

@pytest.mark.parametrize('data', [gmx_benzene_dHdl_full(),
gmx_benzene_u_nk_full()])
def test_duplicated_exception(self, data):
"""Test that a DataFrame with duplicate times yields a KeyError.
"""
with pytest.raises(KeyError):
self.slicer(data, lower=200)


class CorrelatedPreprocessors:

@pytest.mark.parametrize(('data', 'size'), [(gmx_benzene_dHdl(), 4001),
(gmx_benzene_u_nk(), 4001)])
def test_statinef_subsampling(self, data, size):
def test_subsampling(self, data, size):
"""Basic test for execution; resulting size of dataset sensitive to
machine.
machine and depends on algorithm.
"""
assert len(self.slicer(data, series=data.iloc[:, 0])) <= size

@pytest.mark.parametrize('data', [gmx_benzene_dHdl(),
gmx_benzene_u_nk()])
def test_no_series(self, data):
"""Check that we get the same result as simple slicing with no Series.
class TestEquilibriumDetection(TestSlicing):
"""
df_sub = self.slicer(data, lower=200, upper=5000, step=2)
df_sliced = slicing(data, lower=200, upper=5000, step=2)

assert np.all((df_sub == df_sliced))


class TestStatisticalInefficiency(TestSlicing, CorrelatedPreprocessors):

def slicer(self, *args, **kwargs):
return equilibrium_detection(*args, **kwargs)
return statistical_inefficiency(*args, **kwargs)

@pytest.mark.parametrize(('data', 'size'), [(gmx_benzene_dHdl(), 4001),
(gmx_benzene_u_nk(), 4001)])
def test_equildet_subsampling(self, data, size):
"""Basic test for execution; resulting size of dataset sensitive to
machine.
"""
assert len(self.slicer(data, series=data.iloc[:, 0])) <= size

class TestEquilibriumDetection(TestSlicing, CorrelatedPreprocessors):

def slicer(self, *args, **kwargs):
return equilibrium_detection(*args, **kwargs)

0 comments on commit 6dd8eb5

Please sign in to comment.