Moving average (#381)

* Add moving_average function for visualization and convergence testing * Update versionadded * Run Black * Bug fix bar_.py states * Update Changelog * Update the docs * Add tests * Formatting to align with Black * Update tests * Refactor moving_average to align with forward_backward_convergence function * Update tests * Update test_convergence and lambda tests in convergence.moving_average * Adjust convergence.py and tests for codecoverage * black * Update moving_average to block_average for more accurate descriptive name * Address reviewer comments * Update test to align with changed handling of dfs of different length in block_average * Remove incorrect popagation of error in BAR * Add tests and error catch for ill constructed BAR input, u_nk * black * Updated version comments --------- Co-authored-by: Oliver Beckstein <orbeckst@gmail.com>
alchemistry · Sep 14, 2024 · b1b6d4f · b1b6d4f
1 parent 4cba9ed
commit b1b6d4f
Show file tree

Hide file tree

Showing 15 changed files with 515 additions and 29 deletions.
diff --git a/CHANGES b/CHANGES
@@ -13,6 +13,16 @@ The rules for this file:
   * release numbers follow "Semantic Versioning" https://semver.org
 
 ------------------------------------------------------------------------------
+
+??/??/2024 jaclark5
+
+  * 2.4.0
+
+Enhancements
+  - Addition of `block_average` function in both `convergence` and 
+    `visualization` (Issue #380, PR #381)
+
+
 08/24/2024 xiki-tempula
 
   * 2.3.2

diff --git a/docs/convergence.rst b/docs/convergence.rst
@@ -75,6 +75,31 @@ is, where 0 fully-unequilibrated and 1.0 is fully-equilibrated. ::
     >>> value = A_c(dhdl_list, tol=2)
     0.7085
 
+Block Average
+--------------
+If one obtains suspicious results from the forward / backward convergence plot,
+it may be useful to view the block averages of the change in free energy using
+:func:`~alchemlyb.convergence.block_average` and
+:func:`~alchemlyb.visualisation.plot_block_average` over the course of each 
+step in lambda individually, the following example is for :math:`\lambda` = 0
+
+    >>> from alchemtest.gmx import load_benzene
+    >>> from alchemlyb.parsing.gmx import extract_u_nk
+    >>> from alchemlyb.visualisation import plot_block_average
+    >>> from alchemlyb.convergence import block_average
+
+    >>> bz = load_benzene().data
+    >>> data_list = [extract_u_nk(xvg, T=300) for xvg in bz['Coulomb']]
+    >>> df = block_average(data_list, 'mbar')
+    >>> ax = plot_block_average(df)
+    >>> ax.figure.savefig('dF_t_block_average.png')
+
+Will give a plot looks like this
+
+.. figure:: images/dF_t_block_average.png
+
+   A plot of the free energy divided into block averages showing consistency/inconsistency 
+   across the trajectory.
 
 Convergence functions
 ---------------------

diff --git a/docs/convergence/alchemlyb.convergence.convergence.rst b/docs/convergence/alchemlyb.convergence.convergence.rst
@@ -15,3 +15,5 @@ All convergence functions are located in this submodule but for convenience they
 .. autofunction:: alchemlyb.convergence.fwdrev_cumavg_Rc
 
 .. autofunction:: alchemlyb.convergence.A_c		  
+
+.. autofunction:: alchemlyb.convergence.block_average
diff --git a/docs/images/dF_t_block_average.png b/docs/images/dF_t_block_average.png
diff --git a/docs/visualisation.rst b/docs/visualisation.rst
@@ -19,6 +19,7 @@ Plotting Functions
     plot_ti_dhdl
     plot_dF_state
     plot_convergence
+    plot_block_average
 
 .. _plot_overlap_matrix:
 

diff --git a/docs/visualisation/alchemlyb.visualisation.plot_block_average.rst b/docs/visualisation/alchemlyb.visualisation.plot_block_average.rst
@@ -0,0 +1,6 @@
+alchemlyb.visualisation.plot\_block\_average
+=============================================
+
+.. currentmodule:: alchemlyb.visualisation
+
+.. autofunction:: plot_block_average
diff --git a/src/alchemlyb/convergence/__init__.py b/src/alchemlyb/convergence/__init__.py
@@ -1 +1 @@
-from .convergence import forward_backward_convergence, fwdrev_cumavg_Rc, A_c
+from .convergence import forward_backward_convergence, fwdrev_cumavg_Rc, A_c, block_average
diff --git a/src/alchemlyb/convergence/convergence.py b/src/alchemlyb/convergence/convergence.py
@@ -30,20 +30,26 @@ def forward_backward_convergence(
     Parameters
     ----------
     df_list : list
-        List of DataFrame of either dHdl or u_nk.
+        List of DataFrame of either dHdl or u_nk, where each represents a
+        different value of lambda.
     estimator : {'MBAR', 'BAR', 'TI'}
         Name of the estimators.
-        See the important note below on the use of "MBAR".
 
         .. deprecated:: 1.0.0
            Lower case input is also accepted until release 2.0.0.
     num : int
-        The number of time points.
+        The number of blocks used to divide *each* DataFrame and progressively add
+        to assess convergence. Note that if the DataFrames are different lengths, 
+        the number of samples contributed with each block will be different.
     error_tol : float
         The maximum error tolerated for analytic error. If the analytic error is
         bigger than the error tolerance, the bootstrap error will be used.
 
         .. versionadded:: 2.3.0
+        .. versionchanged:: 2.4.0
+           Clarified docstring, removed incorrect estimation of std for cumulative
+           result in bar and added check that only a single lambda state is 
+           represented in the indices of each df in df_list.
 
     kwargs : dict
         Keyword arguments to be passed to the estimator.
@@ -94,7 +100,16 @@ def forward_backward_convergence(
         # select estimator class by name
         my_estimator = estimators_dispatch[estimator](**kwargs)
         logger.info(f"Use {estimator} estimator for convergence analysis.")
-
+
+    # Check that each df in the list has only one value of lambda
+    for i, df in enumerate(df_list):
+        lambda_values = list(set([x[1:] for x in df.index.to_numpy()]))
+        if len(lambda_values) > 1:
+            ind  = [j for j in range(len(lambda_values[0])) if len(list(set([x[j] for x in lambda_values]))) > 1][0]
+            raise ValueError(
+                "Provided DataFrame, df_list[{}] has more than one lambda value in df.index[{}]".format(i, ind)
+            )
+
     logger.info("Begin forward analysis")
     forward_list = []
     forward_error_list = []
@@ -170,14 +185,9 @@ def _forward_backward_convergence_estimate(
         my_estimator.initial_f_k = result.delta_f_.iloc[0, :]
     mean = result.delta_f_.iloc[0, -1]
     if estimator.lower() == "bar":
-        error = np.sqrt(
-            sum(
-                [
-                    result.d_delta_f_.iloc[i, i + 1] ** 2
-                    for i in range(len(result.d_delta_f_) - 1)
-                ]
-            )
-        )
+        # See https://github.com/alchemistry/alchemlyb/pull/60#issuecomment-430720742
+        # Error estimate generated by BAR ARE correlated
+        error = np.nan
     else:
         error = result.d_delta_f_.iloc[0, -1]
     if estimator.lower() == "mbar" and error > error_tol:
@@ -262,7 +272,7 @@ def fwdrev_cumavg_Rc(series, precision=0.01, tol=2):
     float
         Convergence time fraction :math:`R_c` [Fan2021]_
     :class:`pandas.DataFrame`
-        The DataFrame with moving average. ::
+        The DataFrame with block average. ::
 
                 Forward  Backward  data_fraction
             0  3.016442  3.065176            0.1
@@ -389,3 +399,106 @@ def A_c(series_list, precision=0.01, tol=2):
             d_R_c = sorted_array[-i] - sorted_array[-i - 1]
             result += d_R_c * sum(R_c_list <= element) / n_R_c
     return result
+
+
+def block_average(df_list, estimator="MBAR", num=10, **kwargs):
+    """Free energy estimate for portions of the trajectory.
+
+    Generate the free energy estimate for a series of blocks in time,
+    with the specified number of equally spaced points.
+    For example, setting `num` to 10 would give the block averages
+    which is the free energy estimate from the first 10% alone, then the
+    next 10% ... of the data.
+
+    Parameters
+    ----------
+    df_list : list
+        List of DataFrame of either dHdl or u_nk, where each represents a
+        different value of lambda.
+    estimator : {'MBAR', 'BAR', 'TI'}
+        Name of the estimators.
+    num : int
+        The number of blocks used to divide *each* DataFrame. Note that
+        if the DataFrames are different lengths, the number of samples
+        contributed to each block will be different.
+    kwargs : dict
+        Keyword arguments to be passed to the estimator.
+
+    Returns
+    -------
+    :class:`pandas.DataFrame`
+        The DataFrame with estimate data. ::
+
+               FE             FE_Error
+            0  3.016442       0.052748
+            1  3.078106       0.037170
+            2  3.072561       0.030186
+            3  3.048325       0.026070
+            4  3.049769       0.023359
+            5  3.034078       0.021260
+            6  3.043274       0.019642
+            7  3.035460       0.018340
+            8  3.042032       0.017319
+            9  3.044149       0.016405
+
+
+    .. versionadded:: 2.4.0
+
+    """
+    logger.info("Start block averaging analysis.")
+    logger.info("Check data availability.")
+    if estimator not in (FEP_ESTIMATORS + TI_ESTIMATORS):
+        msg = f"Estimator {estimator} is not available in {FEP_ESTIMATORS + TI_ESTIMATORS}."
+        logger.error(msg)
+        raise ValueError(msg)
+    else:
+        # select estimator class by name
+        estimator_fit = estimators_dispatch[estimator](**kwargs).fit
+        logger.info(f"Use {estimator} estimator for convergence analysis.")
+
+    # Check that each df in the list has only one value of lambda
+    for i, df in enumerate(df_list):
+        lambda_values = list(set([x[1:] for x in df.index.to_numpy()]))
+        if len(lambda_values) > 1:
+            ind  = [j for j in range(len(lambda_values[0])) if len(list(set([x[j] for x in lambda_values]))) > 1][0]
+            raise ValueError(
+                "Provided DataFrame, df_list[{}] has more than one lambda value in df.index[{}]".format(i, ind)
+            )
+
+    if estimator in ["BAR"] and len(df_list) > 2:
+        raise ValueError(
+            "Restrict to two DataFrames, one with a fep-lambda value and one its forward adjacent state for a "
+            "meaningful result."
+        )
+
+    logger.info("Begin Moving Average Analysis")
+    average_list = []
+    average_error_list = []
+    for i in range(1, num):
+        logger.info("Moving Average Analysis: {:.2f}%".format(100 * i / num))
+        sample = []
+        for data in df_list:
+            ind1, ind2 = len(data) // num * (i - 1), len(data) // num * i
+            sample.append(data[ind1:ind2])
+        sample = concat(sample)
+        result = estimator_fit(sample)
+
+        average_list.append(result.delta_f_.iloc[0, -1])
+        if estimator.lower() == "bar":
+            # See https://github.com/alchemistry/alchemlyb/pull/60#issuecomment-430720742
+            # Error estimate generated by BAR ARE correlated
+            average_error_list.append(np.nan)
+        else:
+            average_error_list.append(result.d_delta_f_.iloc[0, -1])
+        logger.info(
+            "{:.2f} +/- {:.2f} kT".format(average_list[-1], average_error_list[-1])
+        )
+
+    convergence = pd.DataFrame(
+        {
+            "FE": average_list,
+            "FE_Error": average_error_list,
+        }
+    )
+    convergence.attrs = df_list[0].attrs
+    return convergence
diff --git a/src/alchemlyb/estimators/bar_.py b/src/alchemlyb/estimators/bar_.py
@@ -54,6 +54,9 @@ class BAR(BaseEstimator, _EstimatorMixOut):
 
     .. versionchanged:: 1.0.0
        `delta_f_`, `d_delta_f_`, `states_` are view of the original object.
+    .. versionchanged:: 2.4.0
+       Added assessment of lambda states represented in the indices of u_nk
+       to provide meaningful errors to ensure proper use.
 
     """
 
@@ -88,7 +91,7 @@ def fit(self, u_nk):
         # sort by state so that rows from same state are in contiguous blocks
         u_nk = u_nk.sort_index(level=u_nk.index.names[1:])
 
-        # get a list of the lambda states
+        # get a list of the lambda states that are sampled
         self._states_ = u_nk.columns.values.tolist()
 
         # group u_nk by lambda states
@@ -97,11 +100,25 @@ def fit(self, u_nk):
             (len(groups.get_group(i)) if i in groups.groups else 0)
             for i in u_nk.columns
         ]
-
+
+        # Pull lambda states from indices
+        states = list(set( x[1:] for x in u_nk.index))
+        for state in states:
+            if len(state) == 1:
+                state = state[0]
+            if state not in self._states_:
+                raise ValueError(
+                    f"Indexed lambda state, {state}, is not represented in u_nk columns:"
+                    f" {self._states_}"
+                )
+
         # Now get free energy differences and their uncertainties for each step
         deltas = np.array([])
         d_deltas = np.array([])
         for k in range(len(N_k) - 1):
+            if N_k[k] == 0 or N_k[k + 1] == 0:
+                continue
+
             # get us from lambda step k
             uk = groups.get_group(self._states_[k])
             # get w_F
@@ -126,6 +143,13 @@ def fit(self, u_nk):
             deltas = np.append(deltas, df)
             d_deltas = np.append(d_deltas, ddf**2)
 
+        if len(deltas) == 0 and len(states) > 1:
+            raise ValueError(
+                "u_nk does not contain energies computed between any adjacent states.\n"
+                "To compute the free energy with BAR, ensure that values in u_nk exist"
+                f" for the columns:\n{states}."
+            )
+
         # build matrix of deltas between each state
         adelta = np.zeros((len(deltas) + 1, len(deltas) + 1))
         ad_delta = np.zeros_like(adelta)
@@ -150,13 +174,11 @@ def fit(self, u_nk):
             ad_delta += np.diagflat(np.array(dout), k=j + 1)
 
         # yield standard delta_f_ free energies between each state
-        self._delta_f_ = pd.DataFrame(
-            adelta - adelta.T, columns=self._states_, index=self._states_
-        )
+        self._delta_f_ = pd.DataFrame(adelta - adelta.T, columns=states, index=states)
 
         # yield standard deviation d_delta_f_ between each state
         self._d_delta_f_ = pd.DataFrame(
-            np.sqrt(ad_delta + ad_delta.T), columns=self._states_, index=self._states_
+            np.sqrt(ad_delta + ad_delta.T), columns=states, index=states
         )
         self._delta_f_.attrs = u_nk.attrs
         self._d_delta_f_.attrs = u_nk.attrs

diff --git a/src/alchemlyb/estimators/mbar_.py b/src/alchemlyb/estimators/mbar_.py
@@ -33,7 +33,7 @@ class MBAR(BaseEstimator, _EstimatorMixOut):
         .. versionchanged:: 2.3.0
            The new default is now "BAR" as it provides a substantial speedup
            over the previous default `None`.
-           
+
 
     method : str, optional, default="robust"
         The optimization routine to use.  This can be any of the methods
@@ -79,6 +79,11 @@ class MBAR(BaseEstimator, _EstimatorMixOut):
         default value for `method` was changed from "hybr" to "robust"
     .. versionchanged:: 2.1.0
         `n_bootstraps` option added.
+    .. versionchanged:: 2.4.0
+       Handle initial estimate, initial_f_k, from bar in the instance
+       that not all lambda states represented as column headers are 
+       represented in the indices of u_nk.
+    
     """
 
     def __init__(
@@ -135,6 +140,20 @@ def fit(self, u_nk):
             )
             bar.fit(u_nk)
             initial_f_k = bar.delta_f_.iloc[0, :]
+            if len(bar.delta_f_.iloc[0, :]) != len(self._states_):
+                states = [
+                    x
+                    for i, x in enumerate(self._states_[:-1])
+                    if N_k[i] > 0 and N_k[i + 1] > 0
+                ]
+                initial_f_k = pd.Series(
+                    [
+                        initial_f_k.loc(x) if x in states else np.nan
+                        for x in self._states_
+                    ],
+                    index=self._states_,
+                    dtype=float,
+                )
         else:
             initial_f_k = self.initial_f_k
Original file line number	Diff line number	Diff line change
Expand Up		@@ -15,3 +15,5 @@ All convergence functions are located in this submodule but for convenience they
		.. autofunction:: alchemlyb.convergence.fwdrev_cumavg_Rc

		.. autofunction:: alchemlyb.convergence.A_c

		.. autofunction:: alchemlyb.convergence.block_average
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		from .convergence import forward_backward_convergence, fwdrev_cumavg_Rc, A_c
		from .convergence import forward_backward_convergence, fwdrev_cumavg_Rc, A_c, block_average