diff --git a/pypesto/ensemble/ensemble.py b/pypesto/ensemble/ensemble.py index ecd3d7b85..5446e2103 100644 --- a/pypesto/ensemble/ensemble.py +++ b/pypesto/ensemble/ensemble.py @@ -555,6 +555,7 @@ def __init__( def from_sample( result: Result, remove_burn_in: bool = True, + ci_level: float = None, chain_slice: slice = None, x_names: Sequence[str] = None, lower_bound: np.ndarray = None, @@ -571,6 +572,10 @@ def from_sample( remove_burn_in: Exclude parameter vectors from the ensemble if they are in the "burn-in". + ci_level: + A form of relative cutoff. Exclude parameter vectors, for which the + (non-normalized) posterior value is not within the `ci_level` best + values. chain_slice: Subset the chain with a slice. Any "burn-in" removal occurs first. x_names: @@ -594,14 +599,23 @@ def from_sample( lower_bound = result.problem.lb if upper_bound is None: upper_bound = result.problem.ub + burn_in = 0 if remove_burn_in: if result.sample_result.burn_in is None: geweke_test(result) burn_in = result.sample_result.burn_in x_vectors = x_vectors[burn_in:] + + # added cutoff + if ci_level is not None: + x_vectors = calculate_hpd( + result=result, burn_in=burn_in, ci_level=ci_level + ) + if chain_slice is not None: x_vectors = x_vectors[chain_slice] x_vectors = x_vectors.T + return Ensemble( x_vectors=x_vectors, x_names=x_names, @@ -1253,3 +1267,77 @@ def calculate_cutoff( range = chi2.ppf(q=percentile / 100, df=df) return fval_opt + range + + +def calculate_hpd( + result: Result, + burn_in: int = 0, + ci_level: float = 0.95, +): + """ + Calculate Highest Posterior Density (HPD) samples. + + The HPD is calculated for a user-defined credibility level (`ci_level`). The + HPD includes all parameter vectors with a (non-normalized) posterior + probability that is higher than the lowest `1-ci_level` % + posterior probability values. + + Parameters + ---------- + result: + The sampling result from which to create the ensemble. + burn_in: + Burn in index that is cut off before HPD is calculated. + ci_level: + Credibility level of the resulting HPD. 0.95 corresponds to the 95% CI. + Only values between 0 and 1 are allowed. + + Returns + ------- + The HPD parameter vectors. + """ + if not 0 <= ci_level <= 1: + raise ValueError( + f"ci_level={ci_level} is not valid. Choose 0<=ci_level<=1." + ) + # get names of chain parameters + param_names = result.problem.get_reduced_vector(result.problem.x_names) + + # Get converged parameter samples as numpy arrays + chain = np.asarray(result.sample_result.trace_x[0, burn_in:, :]) + neglogpost = result.sample_result.trace_neglogpost[0, burn_in:] + indices = np.arange( + burn_in, len(result.sample_result.trace_neglogpost[0, :]) + ) + + # create df first, as we need to match neglogpost to the according parameter values + pd_params = pd.DataFrame(chain, columns=param_names) + pd_fval = pd.DataFrame(neglogpost, columns=["neglogPosterior"]) + pd_iter = pd.DataFrame(indices, columns=["iteration"]) + + params_df = pd.concat( + [pd_params, pd_fval, pd_iter], axis=1, ignore_index=False + ) + + # get lower neglogpost bound for HPD + # sort neglogpost values of MCMC chain without burn in + neglogpost_sort = np.sort(neglogpost) + + # Get converged chain length + chain_length = len(neglogpost) + + # most negative ci percentage samples of the posterior are kept to get the according HPD + neglogpost_lower_bound = neglogpost_sort[int(chain_length * (ci_level))] + + # cut posterior to hpd + hpd_params_df = params_df[ + params_df["neglogPosterior"] <= neglogpost_lower_bound + ] + + # convert df to ensemble vector + hpd_params_df_vals_only = hpd_params_df.drop( + columns=["iteration", "neglogPosterior"] + ) + hpd_ensemble_vector = hpd_params_df_vals_only.to_numpy() + + return hpd_ensemble_vector diff --git a/test/base/test_ensemble.py b/test/base/test_ensemble.py index 12089af53..9a2383448 100644 --- a/test/base/test_ensemble.py +++ b/test/base/test_ensemble.py @@ -6,6 +6,7 @@ import pypesto import pypesto.optimize as optimize +import pypesto.sample as sample from pypesto.C import AMICI_STATUS, AMICI_T, AMICI_Y, MEAN, WEIGHTED_SIGMA from pypesto.engine import MultiProcessEngine from pypesto.ensemble import ( @@ -224,3 +225,49 @@ def post_processor(amici_outputs, output_type, output_ids): progress_bar=False, ) return ensemble_prediction + + +def test_hpd_calculation(): + """Test the calculation of Highest Posterior Density (HPD).""" + problem = create_petab_problem() + + sampler = sample.AdaptiveMetropolisSampler( + options={"show_progress": False} + ) + + result = optimize.minimize( + problem=problem, + n_starts=3, + progress_bar=False, + ) + + result = sample.sample( + problem=problem, + sampler=sampler, + n_samples=100, + result=result, + ) + + # Manually set up sample (only for testing) + burn_in = 1 + result.sample_result.burn_in = burn_in + result.sample_result.trace_neglogpost[0][1:] = np.random.permutation( + np.arange(len(result.sample_result.trace_neglogpost[0][1:])) + ) + + hpd_ensemble = Ensemble.from_sample( + result=result, remove_burn_in=True, ci_level=0.95 + ) + + expected_length = ( + int((result.sample_result.trace_x[0][burn_in:].shape[0]) * 0.95) + 1 + ) + # Check that the HPD parameters have the expected shape + assert hpd_ensemble.x_vectors.shape == (problem.dim, expected_length) + x_indices = np.where(result.sample_result.trace_neglogpost[0][1:] <= 95)[0] + assert np.all( + [ + np.any(np.all(x[:, None] == hpd_ensemble.x_vectors, axis=0)) + for x in result.sample_result.trace_x[0][burn_in:][x_indices] + ] + )