Skip to content

Commit

Permalink
Added some safety checks for subsamplers.
Browse files Browse the repository at this point in the history
Also, added `states_` attribute to TI estimator.
  • Loading branch information
dotsdl committed May 18, 2017
1 parent 30f2fed commit 6dd8eb5
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 33 deletions.
6 changes: 6 additions & 0 deletions src/alchemlyb/estimators/ti_.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,14 @@ class TI(BaseEstimator):
delta_f_ : DataFrame
The estimated dimensionless free energy difference between each state.
d_delta_f_ : DataFrame
The estimated statistical uncertainty (one standard deviation) in
dimensionless free energy differences.
states_ : list
Lambda states for which free energy differences were obtained.
"""

def __init__(self, verbose=False):
Expand Down Expand Up @@ -81,4 +85,6 @@ def fit(self, dHdl):
columns=variances.index.values,
index=variances.index.values)

self.states_ = means.index.values.tolist()

return self
62 changes: 43 additions & 19 deletions src/alchemlyb/preprocessing/subsampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,49 @@
from pymbar.timeseries import detectEquilibration


def slicing(df, lower=None, upper=None, step=None):
def _check_multiple_times(df):
return df.sort_index(0).reset_index(0).duplicated('time').any()


def _check_sorted(df):
return df.reset_index(0)['time'].is_monotonic_increasing


def slicing(df, lower=None, upper=None, step=None, force=False):
    """Subsample a DataFrame using simple slicing.

    Parameters
    ----------
    df : DataFrame
        DataFrame to subsample.
    lower : float
        Lower time to slice from.
    upper : float
        Upper time to slice to (inclusive).
    step : int
        Step between rows to slice by.
    force : bool
        Ignore checks that DataFrame is in proper form for expected behavior.

    Returns
    -------
    DataFrame
        `df` subsampled.

    Raises
    ------
    KeyError
        If the rows are not sorted by time, or (unless ``force=True``) if
        duplicate time values are present.

    """
    # label-based slicing only behaves predictably on an index sorted by
    # time; .loc raises KeyError itself when the slice cannot be resolved
    try:
        df = df.loc[lower:upper:step]
    except KeyError:
        raise KeyError("DataFrame rows must be sorted by time, increasing.")

    if not force and _check_multiple_times(df):
        raise KeyError("Duplicate time values found; it's generally advised "
                       "to use slicing on DataFrames with unique time values "
                       "for each row. Use `force=True` to ignore this error.")

    # drop any rows that have missing values
    df = df.dropna()

    return df


Expand Down Expand Up @@ -68,14 +83,17 @@ def statistical_inefficiency(df, series=None, lower=None, upper=None, step=None)
pymbar.timeseries.statisticalInefficiency : detailed background
"""
if series is not None:
series = series.loc[lower:upper]
if _check_multiple_times(df):
raise KeyError("Duplicate time values found; statistical inefficiency "
"only works on a single, contiguous, "
"and sorted timeseries.")

# drop any rows that have missing values
series = series.dropna()
if not _check_sorted(df):
raise KeyError("Statistical inefficiency only works as expected if "
"values are sorted by time, increasing.")

# subsample according to step
series = series.iloc[::step]
if series is not None:
series = slicing(series, lower=lower, upper=upper, step=step)

# calculate statistical inefficiency of series
statinef = statisticalInefficiency(series)
Expand Down Expand Up @@ -123,14 +141,20 @@ def equilibrium_detection(df, series=None, lower=None, upper=None, step=None):
pymbar.timeseries.detectEquilibration : detailed background
"""
if series is not None:
series = series.loc[lower:upper]
if _check_multiple_times(df):
raise KeyError("Duplicate time values found; equilibrium detection "
"is only meaningful for a single, contiguous, "
"and sorted timeseries.")

if not _check_sorted(df):
raise KeyError("Equilibrium detection only works as expected if "
"values are sorted by time, increasing.")

# drop any rows that have missing values
series = series.dropna()
if series is not None:
series = slicing(series, lower=lower, upper=upper, step=step)

# subsample according to step
series = series.iloc[::step]
# calculate statistical inefficiency of series
statinef = statisticalInefficiency(series)

# calculate statistical inefficiency of series, with equilibrium detection
t, statinef, Neff_max = detectEquilibration(series.values)
Expand Down
67 changes: 53 additions & 14 deletions src/alchemlyb/tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

import pandas as pd
import numpy as np

from alchemlyb.parsing import gmx
from alchemlyb.preprocessing import slicing
Expand All @@ -23,6 +24,15 @@ def gmx_benzene_u_nk():
return gmx.extract_u_nk(dataset['data']['Coulomb'][0], T=300)


def gmx_benzene_dHdl_full():
    """Concatenated dHdl frames for every Coulomb leg of the benzene dataset."""
    dataset = alchemtest.gmx.load_benzene()
    frames = [gmx.extract_dHdl(xvg, T=300) for xvg in dataset['data']['Coulomb']]
    return pd.concat(frames)


def gmx_benzene_u_nk_full():
    """Concatenated u_nk frames for every Coulomb leg of the benzene dataset."""
    dataset = alchemtest.gmx.load_benzene()
    frames = [gmx.extract_u_nk(xvg, T=300) for xvg in dataset['data']['Coulomb']]
    return pd.concat(frames)

class TestSlicing:
"""Test slicing functionality.
Expand All @@ -35,30 +45,59 @@ def slicer(self, *args, **kwargs):
def test_basic_slicing(self, data, size):
assert len(self.slicer(data, lower=1000, upper=34000, step=5)) == size

@pytest.mark.parametrize('data', [gmx_benzene_dHdl(),
gmx_benzene_u_nk()])
def test_disordered_exception(self, data):
"""Test that a shuffled DataFrame yields a KeyError.
class TestStatisticalInefficiency(TestSlicing):
"""
indices = np.arange(len(data))
np.random.shuffle(indices)

def slicer(self, *args, **kwargs):
return statistical_inefficiency(*args, **kwargs)
df = data.iloc[indices]

with pytest.raises(KeyError):
self.slicer(df, lower=200)

@pytest.mark.parametrize('data', [gmx_benzene_dHdl_full(),
gmx_benzene_u_nk_full()])
def test_duplicated_exception(self, data):
"""Test that a DataFrame with duplicate times yields a KeyError.
"""
with pytest.raises(KeyError):
self.slicer(data, lower=200)


class CorrelatedPreprocessors:

@pytest.mark.parametrize(('data', 'size'), [(gmx_benzene_dHdl(), 4001),
(gmx_benzene_u_nk(), 4001)])
def test_statinef_subsampling(self, data, size):
def test_subsampling(self, data, size):
"""Basic test for execution; resulting size of dataset sensitive to
machine.
machine and depends on algorithm.
"""
assert len(self.slicer(data, series=data.iloc[:, 0])) <= size

@pytest.mark.parametrize('data', [gmx_benzene_dHdl(),
gmx_benzene_u_nk()])
def test_no_series(self, data):
"""Check that we get the same result as simple slicing with no Series.
class TestEquilibriumDetection(TestSlicing):
"""
df_sub = self.slicer(data, lower=200, upper=5000, step=2)
df_sliced = slicing(data, lower=200, upper=5000, step=2)

assert np.all((df_sub == df_sliced))


class TestStatisticalInefficiency(TestSlicing, CorrelatedPreprocessors):

def slicer(self, *args, **kwargs):
return equilibrium_detection(*args, **kwargs)
return statistical_inefficiency(*args, **kwargs)

@pytest.mark.parametrize(('data', 'size'), [(gmx_benzene_dHdl(), 4001),
(gmx_benzene_u_nk(), 4001)])
def test_equildet_subsampling(self, data, size):
"""Basic test for execution; resulting size of dataset sensitive to
machine.
"""
assert len(self.slicer(data, series=data.iloc[:, 0])) <= size

class TestEquilibriumDetection(TestSlicing, CorrelatedPreprocessors):

def slicer(self, *args, **kwargs):
return equilibrium_detection(*args, **kwargs)

0 comments on commit 6dd8eb5

Please sign in to comment.