From 90357b908f3f35338cf73d3fcf45ae06694b25c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Tue, 30 Jan 2024 15:18:34 +0100 Subject: [PATCH 01/52] Revert "[MNT] skip `CyclicBoosting` and QPD tests until #189 failures are resolved" --- skpro/distributions/tests/test_all_distrs.py | 5 ----- skpro/regression/tests/test_all_regressors.py | 5 ----- 2 files changed, 10 deletions(-) diff --git a/skpro/distributions/tests/test_all_distrs.py b/skpro/distributions/tests/test_all_distrs.py index 88071df28..08898663b 100644 --- a/skpro/distributions/tests/test_all_distrs.py +++ b/skpro/distributions/tests/test_all_distrs.py @@ -60,11 +60,6 @@ def _has_capability(distr, method): class TestAllDistributions(PackageConfig, DistributionFixtureGenerator, QuickTester): """Module level tests for all skpro parameter fitters.""" - # TEMPORARY skip for CyclicBoosting and QPD classes - # due to silent failures on main, se #190 - exclude_objects = ["QPD_S", "QPD_B", "QPD_U"] - # remove this when fixing failures to re-enable testing - @pytest.mark.parametrize("shuffled", [False, True]) def test_sample(self, object_instance, shuffled): """Test sample expected return.""" diff --git a/skpro/regression/tests/test_all_regressors.py b/skpro/regression/tests/test_all_regressors.py index 5451109ac..972286491 100644 --- a/skpro/regression/tests/test_all_regressors.py +++ b/skpro/regression/tests/test_all_regressors.py @@ -20,11 +20,6 @@ class TestAllRegressors(PackageConfig, BaseFixtureGenerator, QuickTester): # which object types are generated; None=all, or class (passed to all_objects) object_type_filter = BaseProbaRegressor - # TEMPORARY skip for CyclicBoosting and QPD classes - # due to silent failures on main, se #190 - exclude_objects = ["CyclicBoosting"] - # remove this when fixing failures to re-enable testing - def test_input_output_contract(self, object_instance): """Tests that output of predict methods is as specified.""" import pandas as pd From 17b3e651f0bc4f0c05a44818598470612020c66a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Tue, 30 Jan 2024 15:29:51 +0100 Subject: [PATCH 02/52] Update test_all_regressors.py --- skpro/regression/tests/test_all_regressors.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/skpro/regression/tests/test_all_regressors.py b/skpro/regression/tests/test_all_regressors.py index fe6bc951d..06d991be3 100644 --- a/skpro/regression/tests/test_all_regressors.py +++ b/skpro/regression/tests/test_all_regressors.py @@ -20,11 +20,6 @@ class TestAllRegressors(PackageConfig, BaseFixtureGenerator, QuickTester): # passed to skpro.registry.all_objects as object_type object_type_filter = "regressor_proba" - # TEMPORARY skip for CyclicBoosting and QPD classes - # due to silent failures on main, se #190 - exclude_objects = ["CyclicBoosting"] - # remove this when fixing failures to re-enable testing - def test_input_output_contract(self, object_instance): """Tests that output of predict methods is as specified.""" import pandas as pd From c564906ff772ce8e679ea1c98c7a28319743db9e Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Wed, 3 Apr 2024 11:53:41 +0900 Subject: [PATCH 03/52] update for vectorized QPD and bug fix for qpd test --- pyproject.toml | 3 +- skpro/distributions/__init__.py | 4 +- skpro/distributions/qpd.py | 564 ++++++------------ skpro/distributions/tests/test_qpd.py | 26 +- skpro/regression/cyclic_boosting.py | 31 +- .../regression/tests/test_cyclic_boosting.py | 14 +- 6 files changed, 208 insertions(+), 434 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f7c1105ac..4dc064c01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,8 @@ all_extras = [ "statsmodels>=0.12.1", "tabulate", "uncertainties", - "cyclic-boosting>=1.2.5; python_version < '3.12'" + "cyclic-boosting>=1.4.0; python_version < '3.12'", + "findiff" ] dev = [ diff --git a/skpro/distributions/__init__.py b/skpro/distributions/__init__.py index 7fda17848..6aec89335 100644 --- a/skpro/distributions/__init__.py +++ b/skpro/distributions/__init__.py @@ -1,4 +1,5 @@ """Probability distribution objects.""" + # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) # adapted from sktime @@ -10,12 +11,11 @@ "TDistribution", "QPD_S", "QPD_B", - "QPD_U", ] from skpro.distributions.empirical import Empirical from skpro.distributions.laplace import Laplace from skpro.distributions.mixture import Mixture from skpro.distributions.normal import Normal -from skpro.distributions.qpd import QPD_B, QPD_S, QPD_U +from skpro.distributions.qpd import QPD_B, QPD_S from skpro.distributions.t import TDistribution diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 746206ce2..5e7e0d297 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -1,4 +1,7 @@ """Johnson Quantile-Parameterized Distributions.""" + +from __future__ import annotations + # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) __author__ = [ @@ -6,13 +9,17 @@ "setoguchi-naoki", ] # interface only. Cyclic boosting authors in cyclic_boosting package +import typing import warnings -from typing import Optional +from typing import Optional, Sequence, Union + +if typing.TYPE_CHECKING: + from cyclic_boosting.quantile_matching import J_QPD_S, J_QPD_B + from pandas import DataFrame, Index import numpy as np import pandas as pd -from scipy.integrate import quad -from scipy.misc import derivative +from findiff import FinDiff from scipy.stats import logistic, norm from skpro.distributions.base import BaseDistribution @@ -38,12 +45,9 @@ class QPD_S(BaseDistribution): qv_high : float or array_like[float] quantile function value of quantile ``1 - alpha`` lower : float - lower bound of semi-bounded range (default is 0) + lower bound of semi-bounded range version: str options are ``normal`` (default) or ``logistic`` - dist_shape: str - parameter modifying the logistic base distribution via - sinh/arcsinh-scaling (only active in sinhlogistic version) Example ------- @@ -76,12 +80,11 @@ class QPD_S(BaseDistribution): def __init__( self, alpha: float, - qv_low: float or object, - qv_median: float or object, - qv_high: float or object, - lower: Optional[float] = 0.0, - version: Optional[str] = "normal", - dist_shape: Optional[float] = 0.0, + qv_low: float | Sequence, + qv_median: float | Sequence, + qv_high: float | Sequence, + lower: float, + version: str | None = "normal", index=None, columns=None, ): @@ -92,21 +95,18 @@ def __init__( self.qv_high = qv_high self.lower = lower self.version = version - self.dist_shape = dist_shape self.index = index self.columns = columns super().__init__(index=index, columns=columns) - from cyclic_boosting.quantile_matching import J_QPD_extended_S + from cyclic_boosting.quantile_matching import J_QPD_S params = [alpha, qv_low, qv_median, qv_high] for idx, p in enumerate(params): if isinstance(p, float): params[idx] = np.array([p]) - elif ( - isinstance(p, tuple) or isinstance(p, list) or isinstance(p, np.ndarray) - ): + elif isinstance(p, (tuple, list, np.ndarray)): params[idx] = np.array(p) else: raise ValueError("data type is not float or array_like object") @@ -144,21 +144,16 @@ def __init__( qv_median[idx] = mid qv_high[idx] = high - iter = np.nditer(qv_low, flags=["c_index"]) - for _i in iter: - jqpd = J_QPD_extended_S( - alpha=alpha, - qv_low=qv_low[iter.index], - qv_median=qv_median[iter.index], - qv_high=qv_high[iter.index], - l=self.lower, - version=version, - shape=dist_shape, - ) - self.qpd.append(jqpd) - self.qpd = pd.DataFrame(self.qpd, index=self.index) + self.qpd = J_QPD_S( + alpha=alpha, + qv_low=qv_low, + qv_median=qv_median, + qv_high=qv_high, + l=self.lower, + version=version, + ) - def mean(self, lower=0.0, upper=np.inf): + def mean(self, lower: float = None, upper: float = None): """Return expected value of the distribution. Returns @@ -166,15 +161,14 @@ def mean(self, lower=0.0, upper=np.inf): pd.DataFrame with same rows, columns as `self` expected value of distribution (entry-wise) """ - loc = [] - for idx in self.index: - qpd = self.qpd.loc[idx, :].values[0] - l, _ = quad(exp_func, args=(qpd), a=lower, b=upper) - loc.append(l) - loc_arr = np.array(loc) - return pd.DataFrame(loc_arr, index=self.index, columns=self.columns) - - def var(self, lower=0.0, upper=np.inf): + if not lower: + lower = self.lower + if not upper: + upper = 1e3 + loc = exp_func(lower, upper, self.qpd, self.index.shape[0]) + return pd.DataFrame(loc, index=self.index, columns=self.columns) + + def var(self, lower: float = None, upper: float = None): """Return element/entry-wise variance of the distribution. Returns @@ -182,15 +176,13 @@ def var(self, lower=0.0, upper=np.inf): pd.DataFrame with same rows, columns as `self` variance of distribution (entry-wise) """ - mean = self.mean() - var = [] - for idx in self.index: - mu = mean.loc[idx, :].to_numpy() - qpd = self.qpd.loc[idx, :].values[0] - l, _ = quad(var_func, args=(mu, qpd), a=lower, b=upper) - var.append(l) - var_arr = np.array(var) - return pd.DataFrame(var_arr, index=self.index, columns=self.columns) + if not lower: + lower = self.lower + if not upper: + upper = 1e3 + mean = self.mean(lower, upper).values + var = var_func(mean, lower, upper, self.qpd, self.index.shape[0]) + return pd.DataFrame(var, index=self.index, columns=self.columns) def pdf(self, x: pd.DataFrame): """Probability density function. @@ -198,34 +190,18 @@ def pdf(self, x: pd.DataFrame): this fucntion transform cdf to pdf because j-qpd's pdf calculation is bit complex """ - pdf = [] - for idx in x.index: - qpd = self.qpd.loc[idx, :].values[0] - _x = x.loc[idx, :] - _pdf = derivative(qpd.cdf, _x, dx=1e-6) - pdf.append(_pdf) - pdf_arr = np.array(pdf) - return pd.DataFrame(pdf_arr, index=x.index, columns=x.columns) + pdf = pdf_func(x, self.qpd, self.index) + return pd.DataFrame(pdf, index=x.index, columns=x.columns) def ppf(self, p: pd.DataFrame): """Quantile function = percent point function = inverse cdf.""" - ppf = [] - for idx in p.index: - qpd = self.qpd.loc[idx, :].values[0] - _ppf = qpd.ppf(p.loc[idx, :]) - ppf.append(_ppf) - ppf_arr = np.array(ppf) - return pd.DataFrame(ppf_arr, index=p.index, columns=p.columns) + ppf = ppf_func(p, self.qpd, self.index) + return pd.DataFrame(ppf, index=p.index, columns=p.columns) def cdf(self, x: pd.DataFrame): """Cumulative distribution function.""" - cdf = [] - for idx in x.index: - qpd = self.qpd.loc[idx, :].values[0] - _cdf = qpd.cdf(x.loc[idx, :]) - cdf.append(_cdf) - cdf_arr = np.array(cdf) - return pd.DataFrame(cdf_arr, index=x.index, columns=x.columns) + cdf = cdf_func(x, self.qpd, self.index) + return pd.DataFrame(cdf, index=x.index, columns=x.columns) @classmethod def get_test_params(cls, parameter_set="default"): @@ -233,16 +209,18 @@ def get_test_params(cls, parameter_set="default"): params1 = { "alpha": 0.2, "version": "normal", - "qv_low": 0.2, - "qv_median": 0.5, - "qv_high": 0.8, + "qv_low": -0.3, + "qv_median": 0.0, + "qv_high": 0.3, + "lower": -0.5, } params2 = { "alpha": 0.2, "version": "normal", - "qv_low": [0.2, 0.2, 0.2], - "qv_median": [0.5, 0.5, 0.5], - "qv_high": [0.8, 0.8, 0.8], + "qv_low": [-0.3, -0.3, -0.3], + "qv_median": [0.0, 0.0, 0.0], + "qv_high": [0.3, 0.3, 0.3], + "lower": -0.5, "index": pd.Index([1, 2, 5]), "columns": pd.Index(["a"]), } @@ -274,9 +252,6 @@ class QPD_B(BaseDistribution): upper bound of supported range version: str options are ``normal`` (default) or ``logistic`` - dist_shape: str - parameter modifying the logistic base distribution via - sinh/arcsinh-scaling (only active in sinhlogistic version) Example ------- @@ -310,13 +285,12 @@ class QPD_B(BaseDistribution): def __init__( self, alpha: float, - qv_low: float or object, - qv_median: float or object, - qv_high: float or object, + qv_low: float | Sequence, + qv_median: float | Sequence, + qv_high: float | Sequence, lower: float, upper: float, - version: Optional[str] = "normal", - dist_shape: Optional[float] = 0.0, + version: str | None = "normal", index=None, columns=None, ): @@ -328,21 +302,18 @@ def __init__( self.lower = lower self.upper = upper self.version = version - self.dist_shape = dist_shape self.index = index self.columns = columns super().__init__(index=index, columns=columns) - from cyclic_boosting.quantile_matching import J_QPD_extended_B + from cyclic_boosting.quantile_matching import J_QPD_B params = [alpha, qv_low, qv_median, qv_high] for idx, p in enumerate(params): if isinstance(p, float): params[idx] = np.array([p]) - elif ( - isinstance(p, tuple) or isinstance(p, list) or isinstance(p, np.ndarray) - ): + elif isinstance(p, (tuple, list, np.ndarray)): params[idx] = np.array(p) else: raise ValueError("data type is not float or array_like object") @@ -380,22 +351,17 @@ def __init__( qv_median[idx] = mid qv_high[idx] = high - iter = np.nditer(qv_low, flags=["c_index"]) - for _i in iter: - jqpd = J_QPD_extended_B( - alpha=alpha, - qv_low=qv_low[iter.index], - qv_median=qv_median[iter.index], - qv_high=qv_high[iter.index], - l=lower, - u=upper, - version=version, - shape=dist_shape, - ) - self.qpd.append(jqpd) - self.qpd = pd.DataFrame(self.qpd, index=self.index) - - def mean(self, lower=0.0, upper=np.inf): + self.qpd = J_QPD_B( + alpha=alpha, + qv_low=qv_low, + qv_median=qv_median, + qv_high=qv_high, + l=self.lower, + u=self.upper, + version=version, + ) + + def mean(self, lower: float = None, upper: float = None): """Return expected value of the distribution. Returns @@ -403,15 +369,14 @@ def mean(self, lower=0.0, upper=np.inf): pd.DataFrame with same rows, columns as `self` expected value of distribution (entry-wise) """ - loc = [] - for idx in self.index: - qpd = self.qpd.loc[idx, :].values[0] - l, _ = quad(exp_func, args=(qpd), a=lower, b=upper) - loc.append(l) - loc_arr = np.array(loc) - return pd.DataFrame(loc_arr, index=self.index, columns=self.columns) - - def var(self, lower=0.0, upper=np.inf): + if not lower: + lower = self.lower + if not upper: + upper = self.upper + loc = exp_func(lower, upper, self.qpd, self.index.shape[0]) + return pd.DataFrame(loc, index=self.index, columns=self.columns) + + def var(self, lower: float = None, upper: float = None): """Return element/entry-wise variance of the distribution. Returns @@ -419,15 +384,13 @@ def var(self, lower=0.0, upper=np.inf): pd.DataFrame with same rows, columns as `self` variance of distribution (entry-wise) """ - mean = self.mean() - var = [] - for idx in self.index: - mu = mean.loc[idx, :].to_numpy() - qpd = self.qpd.loc[idx, :].values[0] - l, _ = quad(var_func, args=(mu, qpd), a=lower, b=upper) - var.append(l) - var_arr = np.array(var) - return pd.DataFrame(var_arr, index=self.index, columns=self.columns) + if not lower: + lower = self.lower + if not upper: + upper = self.upper + mean = self.mean(lower, upper).values + var = var_func(mean, lower, upper, self.qpd, self.index.shape[0]) + return pd.DataFrame(var, index=self.index, columns=self.columns) def pdf(self, x: pd.DataFrame): """Probability density function. @@ -435,34 +398,18 @@ def pdf(self, x: pd.DataFrame): this fucntion transform cdf to pdf because j-qpd's pdf calculation is bit complex """ - pdf = [] - for idx in x.index: - qpd = self.qpd.loc[idx, :].values[0] - _x = x.loc[idx, :] - _pdf = derivative(qpd.cdf, _x, dx=1e-6) - pdf.append(_pdf) - pdf_arr = np.array(pdf) - return pd.DataFrame(pdf_arr, index=x.index, columns=x.columns) + pdf = pdf_func(x, self.qpd, self.index) + return pd.DataFrame(pdf, index=x.index, columns=x.columns) def ppf(self, p: pd.DataFrame): """Quantile function = percent point function = inverse cdf.""" - ppf = [] - for idx in p.index: - qpd = self.qpd.loc[idx, :].values[0] - _ppf = qpd.ppf(p.loc[idx, :]) - ppf.append(_ppf) - ppf_arr = np.array(ppf) - return pd.DataFrame(ppf_arr, index=p.index, columns=p.columns) + ppf = ppf_func(p, self.qpd, self.index) + return pd.DataFrame(ppf, index=p.index, columns=p.columns) def cdf(self, x: pd.DataFrame): """Cumulative distribution function.""" - cdf = [] - for idx in x.index: - qpd = self.qpd.loc[idx, :].values[0] - _cdf = qpd.cdf(x.loc[idx, :]) - cdf.append(_cdf) - cdf_arr = np.array(cdf) - return pd.DataFrame(cdf_arr, index=x.index, columns=x.columns) + cdf = cdf_func(x, self.qpd, self.index) + return pd.DataFrame(cdf, index=x.index, columns=x.columns) @classmethod def get_test_params(cls, parameter_set="default"): @@ -470,260 +417,101 @@ def get_test_params(cls, parameter_set="default"): params1 = { "alpha": 0.2, "version": "normal", - "qv_low": 0.2, - "qv_median": 0.5, - "qv_high": 0.8, - "lower": 0.0, - "upper": 1.0, + "qv_low": -0.3, + "qv_median": 0.0, + "qv_high": 0.3, + "lower": -0.5, + "upper": 0.5, } params2 = { "alpha": 0.2, "version": "normal", - "qv_low": [0.2, 0.2, 0.2], - "qv_median": [0.5, 0.5, 0.5], - "qv_high": [0.8, 0.8, 0.8], - "lower": 0.0, - "upper": 1.0, + "qv_low": [-0.3, -0.3, -0.3], + "qv_median": [0.0, 0.0, 0.0], + "qv_high": [0.3, 0.3, 0.3], + "lower": -0.5, + "upper": 0.5, "index": pd.Index([1, 2, 5]), "columns": pd.Index(["a"]), } return [params1, params2] -class QPD_U(BaseDistribution): - """Johnson Quantile-Parameterized Distributions with unbounded mode. - - see https://repositories.lib.utexas.edu/bitstream/handle/2152 - /63037/HADLOCK-DISSERTATION-2017.pdf - (Due to the Python keyword, the parameter lambda from - this reference is named kappa below). - A distribution is parameterized by a symmetric-percentile triplet (SPT). - - Parameters - ---------- - alpha : float - lower quantile of SPT (upper is ``1 - alpha``) - qv_low : float or array_like[float] - quantile function value of ``alpha`` - qv_median : float or array_like[float] - quantile function value of quantile 0.5 - qv_high : float or array_like[float] - quantile function value of quantile ``1 - alpha`` - version: str - options are ``normal`` (default) or ``logistic`` - dist_shape: str - parameter modifying the logistic base distribution via - sinh/arcsinh-scaling (only active in sinhlogistic version) - - Example - ------- - >>> from skpro.distributions.qpd import QPD_U # doctest: +SKIP - - >>> qpd = QPD_U( - ... alpha=0.2, - ... qv_low=[1, 2], - ... qv_median=[3, 4], - ... qv_high=[5, 6], - ... ) # doctest: +SKIP - - >>> qpd.mean() # doctest: +SKIP - """ - - _tags = { - # packaging info - # -------------- - "authors": ["setoguchi-naoki", "felix-wick"], - "maintainers": ["setoguchi-naoki"], - "python_dependencies": "cyclic_boosting>=1.2.5", - # estimator tags - # -------------- - "capabilities:approx": [], - "capabilities:exact": ["mean", "var", "cdf", "ppf"], - "distr:measuretype": "continuous", - } - - def __init__( - self, - alpha: float, - qv_low: float or object, - qv_median: float or object, - qv_high: float or object, - version: Optional[str] = "normal", - dist_shape: Optional[float] = 0.0, - index=None, - columns=None, - ): - self.qpd = [] - self.alpha = alpha - self.qv_low = qv_low - self.qv_median = qv_median - self.qv_high = qv_high - self.version = version - self.dist_shape = dist_shape - self.index = index - self.columns = columns - - super().__init__(index=index, columns=columns) - - from cyclic_boosting.quantile_matching import J_QPD_extended_U - - params = [alpha, qv_low, qv_median, qv_high] - for idx, p in enumerate(params): - if isinstance(p, float): - params[idx] = np.array([p]) - elif ( - isinstance(p, tuple) or isinstance(p, list) or isinstance(p, np.ndarray) - ): - params[idx] = np.array(p) - else: - raise ValueError("data type is not float or array_like object") - - alpha, qv_low, qv_median, qv_high = params[:] - if index is None: - index = pd.RangeIndex(qv_low.shape[0]) - self.index = index - - if columns is None: - columns = pd.RangeIndex(1) - self.columns = columns - - if version == "normal": - self.phi = norm() - elif version == "logistic": - self.phi = logistic() - else: - raise Exception("Invalid version.") - - if (np.any(qv_low > qv_median)) or np.any(qv_high < qv_median): - warnings.warn( - "The SPT values are not monotonically increasing, " - "each SPT is sorted by value", - stacklevel=2, - ) - idx = np.where((qv_low > qv_median), True, False) + np.where( - (qv_high < qv_median), True, False - ) - un_orderd_idx = np.argwhere(idx > 0).tolist() - warnings.warn(f"sorted index {un_orderd_idx}", stacklevel=2) - for idx in un_orderd_idx: - low, mid, high = sorted([qv_low[idx], qv_median[idx], qv_high[idx]]) - qv_low[idx] = low - qv_median[idx] = mid - qv_high[idx] = high - - iter = np.nditer(qv_low, flags=["c_index"]) - for _i in iter: - jqpd = J_QPD_extended_U( - alpha=alpha, - qv_low=qv_low[iter.index], - qv_median=qv_median[iter.index], - qv_high=qv_high[iter.index], - version=version, - shape=dist_shape, - ) - self.qpd.append(jqpd) - self.qpd = pd.DataFrame(self.qpd, index=self.index) - - def mean(self, lower=0.0, upper=np.inf): - """Return expected value of the distribution. - - Returns - ------- - pd.DataFrame with same rows, columns as `self` - expected value of distribution (entry-wise) - """ - loc = [] - for idx in self.index: - qpd = self.qpd.loc[idx, :].values[0] - l, _ = quad(exp_func, args=(qpd), a=lower, b=upper) - loc.append(l) - loc_arr = np.array(loc) - return pd.DataFrame(loc_arr, index=self.index, columns=self.columns) - - def var(self, lower=0.0, upper=np.inf): - """Return element/entry-wise variance of the distribution. - - Returns - ------- - pd.DataFrame with same rows, columns as `self` - variance of distribution (entry-wise) - """ - mean = self.mean() - var = [] - for idx in self.index: - mu = mean.loc[idx, :].to_numpy() - qpd = self.qpd.loc[idx, :].values[0] - l, _ = quad(var_func, args=(mu, qpd), a=lower, b=upper) - var.append(l) - var_arr = np.array(var) - return pd.DataFrame(var_arr, index=self.index, columns=self.columns) - - def pdf(self, x: pd.DataFrame): - """Probability density function. - - this fucntion transform cdf to pdf - because j-qpd's pdf calculation is bit complex - """ - pdf = [] - for idx in x.index: - qpd = self.qpd.loc[idx, :].values[0] - _x = x.loc[idx, :] - _pdf = derivative(qpd.cdf, _x, dx=1e-6) - pdf.append(_pdf) - pdf_arr = np.array(pdf) - return pd.DataFrame(pdf_arr, index=x.index, columns=x.columns) - - def ppf(self, p: pd.DataFrame): - """Quantile function = percent point function = inverse cdf.""" - ppf = [] - for idx in p.index: - qpd = self.qpd.loc[idx, :].values[0] - _ppf = qpd.ppf(p.loc[idx, :]) - ppf.append(_ppf) - ppf_arr = np.array(ppf) - return pd.DataFrame(ppf_arr, index=p.index, columns=p.columns) - - def cdf(self, x: pd.DataFrame): - """Cumulative distribution function.""" - cdf = [] - for idx in x.index: - qpd = self.qpd.loc[idx, :].values[0] - _cdf = qpd.cdf(x.loc[idx, :]) - cdf.append(_cdf) - cdf_arr = np.array(cdf) - return pd.DataFrame(cdf_arr, index=x.index, columns=x.columns) - - @classmethod - def get_test_params(cls, parameter_set="default"): - """Return testing parameter settings for the estimator.""" - params1 = { - "alpha": 0.2, - "version": "normal", - "qv_low": 0.2, - "qv_median": 0.5, - "qv_high": 0.8, - } - params2 = { - "alpha": 0.2, - "version": "normal", - "qv_low": [0.2, 0.2, 0.2], - "qv_median": [0.5, 0.5, 0.5], - "qv_high": [0.8, 0.8, 0.8], - "index": pd.Index([1, 2, 5]), - "columns": pd.Index(["a"]), - } - return [params1, params2] +def calc_pdf(x: np.ndarray, qpd: J_QPD_S | J_QPD_B) -> np.ndarray: + """Return pdf value for all samples.""" + dx = x[1] - x[0] + derivative = FinDiff(1, dx, 1) + cdf = qpd.cdf(x).T + if cdf.ndim < 2: + cdf = cdf[np.newaxis, :] + pdf = np.asarray(derivative(cdf)) + return pdf -def exp_func(x, qpd): +def exp_func(lower: float, upper: float, qpd: J_QPD_S | J_QPD_B, size: int): """Return Expectation.""" - # TODO: scipy.integrate will be removed in scipy 1.12.0 - pdf = derivative(qpd.cdf, x, dx=1e-6) - return x * pdf + x = np.linspace(lower, upper, num=int(1e3)) + pdf_arr = calc_pdf(x, qpd) + x = np.tile(x, (size, 1)) + loc_arr = np.trapz(x * pdf_arr, x, dx=1e-6, axis=1) + return loc_arr -def var_func(x, mu, qpd): +def var_func( + mu: np.ndarray, lower: float, upper: float, qpd: J_QPD_S | J_QPD_B, size: int +): """Return Variance.""" - # TODO: scipy.integrate will be removed in scipy 1.12.0 - pdf = derivative(qpd.cdf, x, dx=1e-6) - return ((x - mu) ** 2) * pdf + x = np.linspace(lower, upper, num=int(1e3)) + pdf_arr = calc_pdf(x, qpd) + x = np.tile(x, (size, 1)) + var_arr = np.trapz(((x - mu) ** 2) * pdf_arr, x, dx=1e-6, axis=1) + return var_arr + + +def pdf_func(x: DataFrame, qpd: J_QPD_S | J_QPD_B, index: Index): + """Return pdf value.""" + x_value = np.unique(x.values) + pdf = np.zeros((x.index.shape[0], len(x.columns))) + for v in x_value: + x0 = np.linspace(v, v + 1e-3, num=3) + pdf_arr = calc_pdf(x0, qpd)[:, 0] + if pdf_arr.ndim < 1: + pdf_arr = pdf_arr[np.newaxis] + rows, cols = np.where(x.values == v) + for r, c in zip(rows, cols): + id = x.index[r] + target = index.get_loc(id) + pdf[r][c] = pdf_arr[target] + return pdf + + +def ppf_func(x: DataFrame, qpd: J_QPD_S | J_QPD_B, index: Index): + """Return ppf value.""" + quantiles = np.unique(x.values) + ppf = np.zeros((x.index.shape[0], len(x.columns))) + for q in quantiles: + ppf_arr = qpd.ppf(q).T + if ppf_arr.ndim < 1: + ppf_arr = ppf_arr[np.newaxis] + rows, cols = np.where(x.values == q) + for r, c in zip(rows, cols): + id = x.index[r] + target = index.get_loc(id) + ppf[r][c] = ppf_arr[target] + return pd.DataFrame(ppf, index=x.index, columns=x.columns) + + +def cdf_func(x: DataFrame, qpd: J_QPD_S | J_QPD_B, index: Index): + """Return cdf value.""" + x_value = np.unique(x.values) + cdf = np.zeros((x.index.shape[0], len(x.columns))) + for v in x_value: + cdf_arr = qpd.cdf(v).T + if cdf_arr.ndim < 1: + cdf_arr = cdf_arr[np.newaxis] + rows, cols = np.where(x.values == v) + for r, c in zip(rows, cols): + id = x.index[r] + target = index.get_loc(id) + cdf[r][c] = cdf_arr[target] + return pd.DataFrame(cdf, index=x.index, columns=x.columns) diff --git a/skpro/distributions/tests/test_qpd.py b/skpro/distributions/tests/test_qpd.py index 049fe5936..0df932515 100644 --- a/skpro/distributions/tests/test_qpd.py +++ b/skpro/distributions/tests/test_qpd.py @@ -2,18 +2,16 @@ import pytest -from skpro.distributions.qpd import QPD_B, QPD_S, QPD_U +from skpro.distributions.qpd import QPD_B, QPD_S from skpro.tests.test_switch import run_test_for_class @pytest.mark.skipif( not run_test_for_class(QPD_B), - reason="run test only if softdeps are present and incrementally (if requested)", + reason="run test only if softdeps are present and incrementally (if requested)", # ) def test_qpd_b_simple_use(): """Test simple use of qpd with bounded mode.""" - from skpro.distributions.qpd import QPD_B - qpd = QPD_B( alpha=0.2, qv_low=[1, 2], @@ -32,8 +30,6 @@ def test_qpd_b_simple_use(): ) def test_qpd_s_simple_use(): """Test simple use of qpd with semi-bounded mode.""" - from skpro.distributions.qpd import QPD_S - qpd = QPD_S( alpha=0.2, qv_low=[1, 2], @@ -43,21 +39,3 @@ def test_qpd_s_simple_use(): ) qpd.mean() - - -@pytest.mark.skipif( - not run_test_for_class(QPD_U), - reason="run test only if softdeps are present and incrementally (if requested)", -) -def test_qpd_u_simple_use(): - """Test simple use of qpd with unbounded mode.""" - from skpro.distributions.qpd import QPD_U - - qpd = QPD_U( - alpha=0.2, - qv_low=[1, 2], - qv_median=[3, 4], - qv_high=[5, 6], - ) - - qpd.mean() diff --git a/skpro/regression/cyclic_boosting.py b/skpro/regression/cyclic_boosting.py index d5dbc055b..00d0e55e8 100644 --- a/skpro/regression/cyclic_boosting.py +++ b/skpro/regression/cyclic_boosting.py @@ -6,6 +6,7 @@ Please read the official document for its detail https://cyclic-boosting.readthedocs.io/en/latest/ """ + # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) __author__ = [ @@ -17,7 +18,7 @@ import numpy as np import pandas as pd -from skpro.distributions.qpd import QPD_B, QPD_S, QPD_U +from skpro.distributions.qpd import QPD_B, QPD_S from skpro.regression.base import BaseProbaRegressor @@ -90,7 +91,7 @@ class CyclicBoosting(BaseProbaRegressor): "authors": ["setoguchi-naoki", "felix-wick"], "maintainers": ["setoguchi-naoki"], "estimator_type": "regressor_proba", - "python_dependencies": "cyclic_boosting>=1.2.5", + "python_dependencies": "cyclic_boosting>=1.4.0", # estimator tags # -------------- "capability:multioutput": False, @@ -105,7 +106,7 @@ def __init__( feature_properties=None, alpha=0.2, mode="multiplicative", - bound="U", + bound="S", lower=0.0, upper=1.0, maximal_iterations=10, @@ -282,9 +283,7 @@ def _predict_proba(self, X): "index": index, "columns": y_cols, } - if self.bound == "U": - qpd = QPD_U(**params) - elif self.bound == "S": + if self.bound == "S": params["lower"] = self.lower qpd = QPD_S(**params) elif self.bound == "B": @@ -292,7 +291,7 @@ def _predict_proba(self, X): params["upper"] = self.upper qpd = QPD_B(**params) else: - raise ValueError("bound need to be 'U' or 'S' or 'B'") + raise ValueError("bound need to be 'S' or 'B'") return qpd @@ -451,5 +450,19 @@ def get_test_params(cls, parameter_set="default"): `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. `create_test_instance` uses the first (or only) dictionary in `params` """ - param1 = {"alpha": 0.3, "mode": "additive", "bound": "S", "lower": 0.0} - return [param1] + param1 = { + "alpha": 0.2, + "mode": "additive", + "bound": "S", + "lower": 0.0, + "maximal_iterations": 5, + } + param2 = { + "alpha": 0.2, + "mode": "additive", + "bound": "B", + "lower": 0.0, + "upper": 1000, + "maximal_iterations": 5, + } + return [param1, param2] diff --git a/skpro/regression/tests/test_cyclic_boosting.py b/skpro/regression/tests/test_cyclic_boosting.py index c9e0e28c0..15e93b616 100644 --- a/skpro/regression/tests/test_cyclic_boosting.py +++ b/skpro/regression/tests/test_cyclic_boosting.py @@ -20,7 +20,7 @@ def test_cyclic_boosting_simple_use(): y = pd.DataFrame(y) X = X.iloc[:200] y = y.iloc[:200] - X_train, X_test, y_train, y_test = train_test_split(X, y) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) reg_proba = CyclicBoosting() reg_proba.fit(X_train, y_train) @@ -43,7 +43,7 @@ def test_cyclic_boosting_with_manual_paramaters(): y = pd.DataFrame(y) X = X.iloc[:200] y = y.iloc[:200] - X_train, X_test, y_train, y_test = train_test_split(X, y) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) features = [ "age", @@ -53,23 +53,17 @@ def test_cyclic_boosting_with_manual_paramaters(): "s1", "s2", "s3", - "s4", - "s5", - "s6", ("age", "sex"), ] fp = { - "age": flags.IS_CONTINUOUS, - "sex": flags.IS_CONTINUOUS, + "age": flags.IS_UNORDERED, + "sex": flags.IS_UNORDERED, "bmi": flags.IS_CONTINUOUS, "bp": flags.IS_CONTINUOUS, "s1": flags.IS_CONTINUOUS, "s2": flags.IS_CONTINUOUS, "s3": flags.IS_CONTINUOUS, - "s4": flags.IS_CONTINUOUS, - "s5": flags.IS_CONTINUOUS, - "s6": flags.IS_CONTINUOUS, } reg_proba = CyclicBoosting( From bf59ee88624b455f3e63d3edbf4f954cd7dd0ac1 Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Wed, 3 Apr 2024 12:14:07 +0900 Subject: [PATCH 04/52] remove unnessesary data type --- skpro/distributions/qpd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 5e7e0d297..e32bd4cb7 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -11,7 +11,7 @@ import typing import warnings -from typing import Optional, Sequence, Union +from typing import Sequence if typing.TYPE_CHECKING: from cyclic_boosting.quantile_matching import J_QPD_S, J_QPD_B From cbeb8772a51c8b89707e16331911c71cf74d4689 Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Wed, 3 Apr 2024 12:22:18 +0900 Subject: [PATCH 05/52] update python dependency --- skpro/distributions/qpd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index e32bd4cb7..e28acd30e 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -69,7 +69,7 @@ class QPD_S(BaseDistribution): # -------------- "authors": ["setoguchi-naoki", "felix-wick"], "maintainers": ["setoguchi-naoki"], - "python_dependencies": "cyclic_boosting>=1.2.5", + "python_dependencies": "cyclic_boosting>=1.4.0, findiff", # estimator tags # -------------- "capabilities:approx": [], From 087748b0e69499bbef280e74f85b940f444b731e Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Wed, 3 Apr 2024 12:53:34 +0900 Subject: [PATCH 06/52] minor change --- skpro/distributions/qpd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index e28acd30e..0874f13f8 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -69,7 +69,7 @@ class QPD_S(BaseDistribution): # -------------- "authors": ["setoguchi-naoki", "felix-wick"], "maintainers": ["setoguchi-naoki"], - "python_dependencies": "cyclic_boosting>=1.4.0, findiff", + "python_dependencies": "cyclic_boosting>=1.4.0; findiff", # estimator tags # -------------- "capabilities:approx": [], From e2459c7f5fe223bdd804468d05fb3d60b7aaff03 Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Wed, 3 Apr 2024 12:56:53 +0900 Subject: [PATCH 07/52] move findiff into function --- skpro/distributions/qpd.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 0874f13f8..ffe1609a2 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -19,7 +19,6 @@ import numpy as np import pandas as pd -from findiff import FinDiff from scipy.stats import logistic, norm from skpro.distributions.base import BaseDistribution @@ -439,6 +438,8 @@ def get_test_params(cls, parameter_set="default"): def calc_pdf(x: np.ndarray, qpd: J_QPD_S | J_QPD_B) -> np.ndarray: """Return pdf value for all samples.""" + from findiff import FinDiff + dx = x[1] - x[0] derivative = FinDiff(1, dx, 1) cdf = qpd.cdf(x).T From 00fef17a90d091303dff719844864fbe1609b943 Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Wed, 3 Apr 2024 13:04:20 +0900 Subject: [PATCH 08/52] minor change --- skpro/distributions/qpd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index ffe1609a2..47d9c09f7 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -68,7 +68,7 @@ class QPD_S(BaseDistribution): # -------------- "authors": ["setoguchi-naoki", "felix-wick"], "maintainers": ["setoguchi-naoki"], - "python_dependencies": "cyclic_boosting>=1.4.0; findiff", + "python_dependencies": ["cyclic_boosting>=1.4.0", "findiff"], # estimator tags # -------------- "capabilities:approx": [], @@ -273,7 +273,7 @@ class QPD_B(BaseDistribution): # -------------- "authors": ["setoguchi-naoki", "felix-wick"], "maintainers": ["setoguchi-naoki"], - "python_dependencies": "cyclic_boosting>=1.2.5", + "python_dependencies": ["cyclic_boosting>=1.4.0", "findiff"], # estimator tags # -------------- "capabilities:approx": [], From dc2e81b64c81273d41fc46a5d79f953019c14e1c Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Wed, 3 Apr 2024 13:12:57 +0900 Subject: [PATCH 09/52] remove QPD_U --- docs/source/api_reference/distributions.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/api_reference/distributions.rst b/docs/source/api_reference/distributions.rst index 2f6372e4b..f8c1b6fc7 100644 --- a/docs/source/api_reference/distributions.rst +++ b/docs/source/api_reference/distributions.rst @@ -46,7 +46,6 @@ Non-parametric and empirical distributions :template: class.rst Empirical - QPD_U QPD_S QPD_B From d0dd460da4c1ccf31bb5a116f23a9005d312a1c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 10:50:39 +0100 Subject: [PATCH 10/52] move base to folder --- skpro/distributions/base/__init__.py | 7 +++++++ skpro/distributions/{base.py => base/_base.py} | 0 2 files changed, 7 insertions(+) create mode 100644 skpro/distributions/base/__init__.py rename skpro/distributions/{base.py => base/_base.py} (100%) diff --git a/skpro/distributions/base/__init__.py b/skpro/distributions/base/__init__.py new file mode 100644 index 000000000..22fe3335d --- /dev/null +++ b/skpro/distributions/base/__init__.py @@ -0,0 +1,7 @@ +"""Probability distribution objects.""" +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) +# adapted from sktime + +__all__ = ["BaseDistribution"] + +from skpro.distributions.base._base import BaseDistribution diff --git a/skpro/distributions/base.py b/skpro/distributions/base/_base.py similarity index 100% rename from skpro/distributions/base.py rename to skpro/distributions/base/_base.py From b60afc419748a106d738b9177da80eaa62b1c534 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 11:13:26 +0100 Subject: [PATCH 11/52] delegate class --- skpro/distributions/base/__init__.py | 4 +- skpro/distributions/base/_delegate.py | 209 ++++++++++++++++++++++++++ 2 files changed, 212 insertions(+), 1 deletion(-) create mode 100644 skpro/distributions/base/_delegate.py diff --git a/skpro/distributions/base/__init__.py b/skpro/distributions/base/__init__.py index 22fe3335d..a9e981e67 100644 --- a/skpro/distributions/base/__init__.py +++ b/skpro/distributions/base/__init__.py @@ -2,6 +2,8 @@ # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) # adapted from sktime -__all__ = ["BaseDistribution"] +__all__ = ["BaseDistribution", "_DelegatedProbaRegressor"] from skpro.distributions.base._base import BaseDistribution +from skpro.distributions.base._delegate import _DelegatedProbaRegressor + diff --git a/skpro/distributions/base/_delegate.py b/skpro/distributions/base/_delegate.py new file mode 100644 index 000000000..7896d2f22 --- /dev/null +++ b/skpro/distributions/base/_delegate.py @@ -0,0 +1,209 @@ +"""Delegator mixin that delegates all methods to wrapped distribution. + +Useful for building estimators where all but one or a few methods are delegated. For +that purpose, inherit from this estimator and then override only the methods that +are not delegated. +""" +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) + +__author__ = ["fkiraly"] +__all__ = ["_DelegatedDistribution"] + +from copy import deepcopy + +from skpro.distributions.base._base import BaseDistribution + + +class _DelegatedProbaRegressor(BaseDistribution): + """Delegator mixin that delegates all methods to wrapped estimator. + + Delegates inner methods to a wrapped estimator. + Wrapped estimator is value of attribute with name self._delegate_name. + By default, this is "estimator_", i.e., delegates to self.estimator_ + To override delegation, override _delegate_name attribute in child class. + + Delegates the following methods: + _fit, _predict, + _predict_interval, _predict_quantiles, _predict_var, _predict_proba + + Does NOT delegate get_params, set_params. + get_params, set_params will hence use one additional nesting level by default. + + Does NOT delegate or copy tags, this should be done in a child class if required. + """ + + # attribute for _DelegatedProbaRegressor, which then delegates + # all non-overridden methods are same as of getattr(self, _delegate_name) + # see further details in _DelegatedRegressor docstring + _delegate_name = "estimator_" + + def _get_delegate(self): + return getattr(self, self._delegate_name) + + def _iloc(self, rowidx=None, colidx=None): + cls = self.__class__ + + delegate = self._get_delegate() + delegate_subset = delegate.iloc[rowidx, colidx] + delegate_subset_params = deepcopy(delegate_subset.get_params()) + + return cls(**delegate_subset_params) + + def pdf(self, x): + r"""Probability density function. + + Let :math:`X` be a random variables with the distribution of `self`, + taking values in `(N, n)` `DataFrame`-s + Let :math:`x\in \mathbb{R}^{N\times n}`. + By :math:`p_{X_{ij}}`, denote the marginal pdf of :math:`X` at the + :math:`(i,j)`-th entry. + + The output of this method, for input `x` representing :math:`x`, + is a `DataFrame` with same columns and indices as `self`, + and entries :math:`p_{X_{ij}}(x_{ij})`. + + Parameters + ---------- + x : `pandas.DataFrame` or 2D np.ndarray + representing :math:`x`, as above + + Returns + ------- + `DataFrame` with same columns and index as `self` + containing :math:`p_{X_{ij}}(x_{ij})`, as above + """ + delegate = self._get_delegate() + return delegate.pdf(x) + + def log_pdf(self, x): + r"""Logarithmic probability density function. + + Numerically more stable than calling pdf and then taking logartihms. + + Let :math:`X` be a random variables with the distribution of `self`, + taking values in `(N, n)` `DataFrame`-s + Let :math:`x\in \mathbb{R}^{N\times n}`. + By :math:`p_{X_{ij}}`, denote the marginal pdf of :math:`X` at the + :math:`(i,j)`-th entry. + + The output of this method, for input `x` representing :math:`x`, + is a `DataFrame` with same columns and indices as `self`, + and entries :math:`\log p_{X_{ij}}(x_{ij})`. + + If `self` has a mixed or discrete distribution, this returns + the weighted continuous part of `self`'s distribution instead of the pdf, + i.e., the marginal pdf integrate to the weight of the continuous part. + + Parameters + ---------- + x : `pandas.DataFrame` or 2D np.ndarray + representing :math:`x`, as above + + Returns + ------- + `DataFrame` with same columns and index as `self` + containing :math:`\log p_{X_{ij}}(x_{ij})`, as above + """ + delegate = self._get_delegate() + return delegate.log_pdf(x) + + def cdf(self, x): + """Cumulative distribution function.""" + delegate = self._get_delegate() + return delegate.cdf(x) + + def ppf(self, p): + """Quantile function = percent point function = inverse cdf.""" + delegate = self._get_delegate() + return delegate.ppf(p) + + def energy(self, x=None): + r"""Energy of self, w.r.t. self or a constant frame x. + + Let :math:`X, Y` be i.i.d. random variables with the distribution of `self`. + + If `x` is `None`, returns :math:`\mathbb{E}[|X-Y|]` (for each row), + "self-energy" (of the row marginal distribution). + If `x` is passed, returns :math:`\mathbb{E}[|X-x|]` (for each row), + "energy wrt x" (of the row marginal distribution). + + Parameters + ---------- + x : None or pd.DataFrame, optional, default=None + if pd.DataFrame, must have same rows and columns as `self` + + Returns + ------- + pd.DataFrame with same rows as `self`, single column `"energy"` + each row contains one float, self-energy/energy as described above. + """ + delegate = self._get_delegate() + return delegate.energy(x=x) + + def mean(self): + r"""Return expected value of the distribution. + + Let :math:`X` be a random variable with the distribution of `self`. + Returns the expectation :math:`\mathbb{E}[X]` + + Returns + ------- + pd.DataFrame with same rows, columns as `self` + expected value of distribution (entry-wise) + """ + delegate = self._get_delegate() + return delegate.mean() + + def var(self): + r"""Return element/entry-wise variance of the distribution. + + Let :math:`X` be a random variable with the distribution of `self`. + Returns :math:`\mathbb{V}[X] = \mathbb{E}\left(X - \mathbb{E}[X]\right)^2` + + Returns + ------- + pd.DataFrame with same rows, columns as `self` + variance of distribution (entry-wise) + """ + delegate = self._get_delegate() + return delegate.var() + + def pdfnorm(self, a=2): + r"""a-norm of pdf, defaults to 2-norm. + + computes a-norm of the entry marginal pdf, i.e., + :math:`\mathbb{E}[p_X(X)^{a-1}] = \int p(x)^a dx`, + where :math:`X` is a random variable distributed according to the entry marginal + of `self`, and :math:`p_X` is its pdf + + Parameters + ---------- + a: int or float, optional, default=2 + + Returns + ------- + pd.DataFrame with same rows and columns as `self` + each entry is :math:`\mathbb{E}[p_X(X)^{a-1}] = \int p(x)^a dx`, see above + """ + delegate = self._get_delegate() + return delegate.pdfnorm(a=a) + + def sample(self, n_samples=None): + """Sample from the distribution. + + Parameters + ---------- + n_samples : int, optional, default = None + + Returns + ------- + if `n_samples` is `None`: + returns a sample that contains a single sample from `self`, + in `pd.DataFrame` mtype format convention, with `index` and `columns` as `self` + if n_samples is `int`: + returns a `pd.DataFrame` that contains `n_samples` i.i.d. samples from `self`, + in `pd-multiindex` mtype format convention, with same `columns` as `self`, + and `MultiIndex` that is product of `RangeIndex(n_samples)` and `self.index` + """ + delegate = self._get_delegate() + return delegate.sample(n_samples=n_samples) From 24ef82fc0fb14a91c8db388b353c98218b81fd06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 11:17:28 +0100 Subject: [PATCH 12/52] start work --- skpro/distributions/qpd.py | 55 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 746206ce2..eabfc9d95 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -18,6 +18,61 @@ from skpro.distributions.base import BaseDistribution +class QPD_S(BaseDistribution): + """Johnson Quantile-Parameterized Distributions with semi-bounded mode. + + see https://repositories.lib.utexas.edu/bitstream/handle/2152 + /63037/HADLOCK-DISSERTATION-2017.pdf + (Due to the Python keyword, the parameter lambda from + this reference is named kappa below.) + A distribution is parameterized by a symmetric-percentile triplet (SPT). + + Parameters + ---------- + alpha : float + lower quantile of SPT (upper is ``1 - alpha``) + qv_low : float or array_like[float] + quantile function value of ``alpha`` + qv_median : float or array_like[float] + quantile function value of quantile 0.5 + qv_high : float or array_like[float] + quantile function value of quantile ``1 - alpha`` + lower : float + lower bound of semi-bounded range (default is 0) + version: str + options are ``normal`` (default) or ``logistic`` + dist_shape: str + parameter modifying the logistic base distribution via + sinh/arcsinh-scaling (only active in sinhlogistic version) + + Example + ------- + >>> from skpro.distributions.qpd import QPD_S # doctest: +SKIP + + >>> qpd = QPD_S( + ... alpha=0.2, + ... qv_low=[1, 2], + ... qv_median=[3, 4], + ... qv_high=[5, 6], + ... lower=0 + ... ) # doctest: +SKIP + + >>> qpd.mean() # doctest: +SKIP + """ + _tags = { + # packaging info + # -------------- + "authors": ["setoguchi-naoki", "felix-wick", "fkiraly"], + "maintainers": ["setoguchi-naoki"], + "python_dependencies": "cyclic_boosting>=1.2.5", + # estimator tags + # -------------- + "capabilities:approx": ["pdfnorm", "energy"], + "capabilities:exact": ["mean", "var", "cdf", "ppf", "pdf", "log_pdf"], + "distr:measuretype": "continuous", + } + + class QPD_S(BaseDistribution): """Johnson Quantile-Parameterized Distributions with semi-bounded mode. From 6ea8e1cd0fa07f56c1840646f51a41c30987f696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 11:32:22 +0100 Subject: [PATCH 13/52] corrected name --- skpro/distributions/base/__init__.py | 4 ++-- skpro/distributions/base/_delegate.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/skpro/distributions/base/__init__.py b/skpro/distributions/base/__init__.py index a9e981e67..3d558397b 100644 --- a/skpro/distributions/base/__init__.py +++ b/skpro/distributions/base/__init__.py @@ -2,8 +2,8 @@ # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) # adapted from sktime -__all__ = ["BaseDistribution", "_DelegatedProbaRegressor"] +__all__ = ["BaseDistribution", "_DelegatedDistribution"] from skpro.distributions.base._base import BaseDistribution -from skpro.distributions.base._delegate import _DelegatedProbaRegressor +from skpro.distributions.base._delegate import _DelegatedDistribution diff --git a/skpro/distributions/base/_delegate.py b/skpro/distributions/base/_delegate.py index 7896d2f22..867c7d805 100644 --- a/skpro/distributions/base/_delegate.py +++ b/skpro/distributions/base/_delegate.py @@ -14,7 +14,7 @@ from skpro.distributions.base._base import BaseDistribution -class _DelegatedProbaRegressor(BaseDistribution): +class _DelegatedDistribution(BaseDistribution): """Delegator mixin that delegates all methods to wrapped estimator. Delegates inner methods to a wrapped estimator. From 669fbe4a7ec9844743aca6ff70bad2efddadcac9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 11:37:11 +0100 Subject: [PATCH 14/52] docstr --- skpro/distributions/qpd.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index eabfc9d95..2a892111a 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -18,17 +18,24 @@ from skpro.distributions.base import BaseDistribution -class QPD_S(BaseDistribution): - """Johnson Quantile-Parameterized Distributions with semi-bounded mode. +class QPD_Johnson(BaseDistribution): + """Johnson Quantile-Parameterized Distribution. + + This class allows selection of the mode bounding type, + i.e. semi-bounded, bounded, or unbounded. + + A Johnson QPD distribution is parameterized by a symmetric-percentile triplet (SPT), + at quantiles alpha, 0.5, and 1-alpha, respectively. see https://repositories.lib.utexas.edu/bitstream/handle/2152 /63037/HADLOCK-DISSERTATION-2017.pdf - (Due to the Python keyword, the parameter lambda from - this reference is named kappa below.) - A distribution is parameterized by a symmetric-percentile triplet (SPT). + Parameter names are as in the reference, except for the parameter lambda, + which is renamed to kappa, as lambda is a reserved keyword in python. Parameters ---------- + bounding : str, one of 'S' (default), 'B', 'U' + mode bounding type, i.e. semi-bounded (S), bounded (B), or unbounded (U) alpha : float lower quantile of SPT (upper is ``1 - alpha``) qv_low : float or array_like[float] @@ -39,9 +46,9 @@ class QPD_S(BaseDistribution): quantile function value of quantile ``1 - alpha`` lower : float lower bound of semi-bounded range (default is 0) - version: str - options are ``normal`` (default) or ``logistic`` - dist_shape: str + version: str, one of ``'normal'`` (default), ``'logistic'`` + options are ``'normal'`` (default) or ``'logistic'`` + dist_shape: float, optional, default=0.0 parameter modifying the logistic base distribution via sinh/arcsinh-scaling (only active in sinhlogistic version) @@ -72,6 +79,18 @@ class QPD_S(BaseDistribution): "distr:measuretype": "continuous", } + def __init__( + self, + alpha: float, + qv_low: float or object, + qv_median: float or object, + qv_high: float or object, + lower: Optional[float] = 0.0, + version: Optional[str] = "normal", + dist_shape: Optional[float] = 0.0, + index=None, + columns=None, + ): class QPD_S(BaseDistribution): """Johnson Quantile-Parameterized Distributions with semi-bounded mode. From b84c6f701776947cbf8417a6f302d01344f85068 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 11:49:44 +0100 Subject: [PATCH 15/52] complete --- docs/source/api_reference/distributions.rst | 1 + skpro/distributions/base/_delegate.py | 2 +- skpro/distributions/qpd.py | 103 ++++++++++++++++++-- 3 files changed, 95 insertions(+), 11 deletions(-) diff --git a/docs/source/api_reference/distributions.rst b/docs/source/api_reference/distributions.rst index 6d380ccec..d2c001c76 100644 --- a/docs/source/api_reference/distributions.rst +++ b/docs/source/api_reference/distributions.rst @@ -64,6 +64,7 @@ Non-parametric and empirical distributions :template: class.rst Empirical + QPD_Johnson QPD_U QPD_S QPD_B diff --git a/skpro/distributions/base/_delegate.py b/skpro/distributions/base/_delegate.py index 867c7d805..6af4dcf05 100644 --- a/skpro/distributions/base/_delegate.py +++ b/skpro/distributions/base/_delegate.py @@ -35,7 +35,7 @@ class _DelegatedDistribution(BaseDistribution): # attribute for _DelegatedProbaRegressor, which then delegates # all non-overridden methods are same as of getattr(self, _delegate_name) # see further details in _DelegatedRegressor docstring - _delegate_name = "estimator_" + _delegate_name = "delegate_" def _get_delegate(self): return getattr(self, self._delegate_name) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 2a892111a..d3b7e5712 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -21,9 +21,6 @@ class QPD_Johnson(BaseDistribution): """Johnson Quantile-Parameterized Distribution. - This class allows selection of the mode bounding type, - i.e. semi-bounded, bounded, or unbounded. - A Johnson QPD distribution is parameterized by a symmetric-percentile triplet (SPT), at quantiles alpha, 0.5, and 1-alpha, respectively. @@ -32,10 +29,15 @@ class QPD_Johnson(BaseDistribution): Parameter names are as in the reference, except for the parameter lambda, which is renamed to kappa, as lambda is a reserved keyword in python. + This class allows selection of the mode bounding type, + i.e. semi-bounded, bounded, or unbounded. + + * if neither ``lower`` nor ``upper`` bound is given, the mode is unbounded + * if only ``lower`` bound is given, the mode is semi-bounded + * if both ``lower`` and ``upper`` bounds are given, the mode is bounded + Parameters ---------- - bounding : str, one of 'S' (default), 'B', 'U' - mode bounding type, i.e. semi-bounded (S), bounded (B), or unbounded (U) alpha : float lower quantile of SPT (upper is ``1 - alpha``) qv_low : float or array_like[float] @@ -44,8 +46,10 @@ class QPD_Johnson(BaseDistribution): quantile function value of quantile 0.5 qv_high : float or array_like[float] quantile function value of quantile ``1 - alpha`` - lower : float - lower bound of semi-bounded range (default is 0) + lower : float, default = None (no lower bound) + lower bound of semi-bounded range or bounded range + upper : float, default = None (no upper bound) + upper bound of bounded range version: str, one of ``'normal'`` (default), ``'logistic'`` options are ``'normal'`` (default) or ``'logistic'`` dist_shape: float, optional, default=0.0 @@ -54,9 +58,9 @@ class QPD_Johnson(BaseDistribution): Example ------- - >>> from skpro.distributions.qpd import QPD_S # doctest: +SKIP + >>> from skpro.distributions.qpd import QPD_Johnson # doctest: +SKIP - >>> qpd = QPD_S( + >>> qpd = QPD_Johnson( ... alpha=0.2, ... qv_low=[1, 2], ... qv_median=[3, 4], @@ -85,12 +89,91 @@ def __init__( qv_low: float or object, qv_median: float or object, qv_high: float or object, - lower: Optional[float] = 0.0, + lower: Optional[float] = None, + upper: Optional[float] = None, version: Optional[str] = "normal", dist_shape: Optional[float] = 0.0, index=None, columns=None, ): + self.alpha = alpha + self.qv_low = qv_low + self.qv_median = qv_median + self.qv_high = qv_high + self.lower = lower + self.upper = upper + self.version = version + self.dist_shape = dist_shape + self.index = index + self.columns = columns + + if lower is None: + delegate_cls = QPD_U + extra_params = {} + elif upper is None: + delegate_cls = QPD_S + extra_params = {"lower": lower} + else: + delegate_cls = QPD_B + extra_params = {"lower": lower, "upper": upper} + + params = { + "alpha": alpha, + "qv_low": qv_low, + "qv_median": qv_median, + "qv_high": qv_high, + "version": version, + "dist_shape": dist_shape, + "index": index, + "columns": columns, + **extra_params, + } + + self.delegate_ = delegate_cls(**params) + + self.index = self.delegate_.index + self.columns = self.delegate_.columns + + super().__init__(index=self.index, columns=self.columns) + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator.""" + params1 = { + "alpha": 0.2, + "version": "normal", + "qv_low": 0.2, + "qv_median": 0.5, + "qv_high": 0.8, + } + params2 = { + "alpha": 0.1, + "version": "normal", + "qv_low": [0.2, 0.2, 0.2], + "qv_median": [0.5, 0.5, 0.5], + "qv_high": [0.8, 0.8, 0.8], + "index": pd.Index([1, 2, 5]), + "columns": pd.Index(["a"]), + } + params3 = { + "alpha": 0.1, + "version": "normal", + "qv_low": [0.1, 0.2, 0.3], + "qv_median": [0.4, 0.5, 0.6], + "qv_high": [0.7, 0.8, 0.9], + "lower": 0.05, + } + params4 = { + "alpha": 0.12, + "version": "logistic", + "qv_low": [0.25, 0.2, 0.22], + "qv_median": [0.45, 0.51, 0.54], + "qv_high": [0.85, 0.83, 0.81], + "lower": 0.05, + "upper": 0.95, + } + return [params1, params2, params3, params4] + class QPD_S(BaseDistribution): """Johnson Quantile-Parameterized Distributions with semi-bounded mode. From 744f2733651acf9e13536a5e5a047e6f5ec9abb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 11:53:14 +0100 Subject: [PATCH 16/52] Update _delegate.py --- skpro/distributions/base/_delegate.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skpro/distributions/base/_delegate.py b/skpro/distributions/base/_delegate.py index 867c7d805..337258ce2 100644 --- a/skpro/distributions/base/_delegate.py +++ b/skpro/distributions/base/_delegate.py @@ -32,10 +32,10 @@ class _DelegatedDistribution(BaseDistribution): Does NOT delegate or copy tags, this should be done in a child class if required. """ - # attribute for _DelegatedProbaRegressor, which then delegates + # attribute for _DelegatedDistribution, which then delegates # all non-overridden methods are same as of getattr(self, _delegate_name) - # see further details in _DelegatedRegressor docstring - _delegate_name = "estimator_" + # see further details in _DelegatedDistribution docstring + _delegate_name = "delegate_" def _get_delegate(self): return getattr(self, self._delegate_name) From c4247bfb2350a4c0d9d952593c55113e96cd491a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 11:53:42 +0100 Subject: [PATCH 17/52] Update _delegate.py --- skpro/distributions/base/_delegate.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/skpro/distributions/base/_delegate.py b/skpro/distributions/base/_delegate.py index 337258ce2..551100192 100644 --- a/skpro/distributions/base/_delegate.py +++ b/skpro/distributions/base/_delegate.py @@ -23,8 +23,7 @@ class _DelegatedDistribution(BaseDistribution): To override delegation, override _delegate_name attribute in child class. Delegates the following methods: - _fit, _predict, - _predict_interval, _predict_quantiles, _predict_var, _predict_proba + _iloc, pdf, log_pdf, cdf, ppf, energy, mean, var, pdfnorm, sample Does NOT delegate get_params, set_params. get_params, set_params will hence use one additional nesting level by default. From 99772f89e1a3a648b9747ac09094a4b761890417 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 12:49:01 +0100 Subject: [PATCH 18/52] linting --- skpro/distributions/base/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/skpro/distributions/base/__init__.py b/skpro/distributions/base/__init__.py index 3d558397b..4c56bc6b9 100644 --- a/skpro/distributions/base/__init__.py +++ b/skpro/distributions/base/__init__.py @@ -4,6 +4,5 @@ __all__ = ["BaseDistribution", "_DelegatedDistribution"] -from skpro.distributions.base._base import BaseDistribution +from skpro.distributions.base._base import BaseDistribution from skpro.distributions.base._delegate import _DelegatedDistribution - From 55ea736640c0ce9dd98fd29d91b8be8edacdde6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 14:52:19 +0100 Subject: [PATCH 19/52] Update qpd.py --- skpro/distributions/qpd.py | 1 + 1 file changed, 1 insertion(+) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index d3b7e5712..fc21a89c8 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -70,6 +70,7 @@ class QPD_Johnson(BaseDistribution): >>> qpd.mean() # doctest: +SKIP """ + _tags = { # packaging info # -------------- From ca7753576dfa87fa8bd5abb78898c46a8029fa33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 16:29:22 +0100 Subject: [PATCH 20/52] Update _base.py --- skpro/distributions/base/_base.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/skpro/distributions/base/_base.py b/skpro/distributions/base/_base.py index 82791962a..81b3fce9e 100644 --- a/skpro/distributions/base/_base.py +++ b/skpro/distributions/base/_base.py @@ -96,10 +96,11 @@ def _subset_params(self, rowidx, colidx): subset_param_dict = {} for param, val in params.items(): - if val is not None: - arr = np.array(val) - else: - arr = None + if val is None: + subset_param_dict[param] = None + continue + # else: + arr = np.array(val) # if len(arr.shape) == 0: # do nothing with arr if len(arr.shape) >= 1 and rowidx is not None: From 01cdb7f39edc3fd35060f12a3c039694722d0d9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 19:09:56 +0100 Subject: [PATCH 21/52] should be delegate --- skpro/distributions/qpd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index fc21a89c8..49447cc77 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -15,10 +15,10 @@ from scipy.misc import derivative from scipy.stats import logistic, norm -from skpro.distributions.base import BaseDistribution +from skpro.distributions.base import _DelegatedDistribution -class QPD_Johnson(BaseDistribution): +class QPD_Johnson(_DelegatedDistribution): """Johnson Quantile-Parameterized Distribution. A Johnson QPD distribution is parameterized by a symmetric-percentile triplet (SPT), From d308d603b2ad97fad65b5f87fbf40964c83b7b20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 19:11:04 +0100 Subject: [PATCH 22/52] Update qpd.py --- skpro/distributions/qpd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 49447cc77..4f1e9828d 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -15,7 +15,7 @@ from scipy.misc import derivative from scipy.stats import logistic, norm -from skpro.distributions.base import _DelegatedDistribution +from skpro.distributions.base import _DelegatedDistribution, BaseDistribution class QPD_Johnson(_DelegatedDistribution): From 72b3c18b687ab4c676824dd6311e48e6a6946798 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 19:11:54 +0100 Subject: [PATCH 23/52] Update qpd.py --- skpro/distributions/qpd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 4f1e9828d..389872e8d 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -15,7 +15,7 @@ from scipy.misc import derivative from scipy.stats import logistic, norm -from skpro.distributions.base import _DelegatedDistribution, BaseDistribution +from skpro.distributions.base import BaseDistribution, _DelegatedDistribution class QPD_Johnson(_DelegatedDistribution): From f7428b15bdee23a9b3406d0a918e7b88f1dd3b3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 22:54:24 +0100 Subject: [PATCH 24/52] fix broadcasting --- skpro/distributions/qpd.py | 48 ++++++++++---------------------------- 1 file changed, 12 insertions(+), 36 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 746206ce2..fc0132a7a 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -100,18 +100,10 @@ def __init__( from cyclic_boosting.quantile_matching import J_QPD_extended_S - params = [alpha, qv_low, qv_median, qv_high] - for idx, p in enumerate(params): - if isinstance(p, float): - params[idx] = np.array([p]) - elif ( - isinstance(p, tuple) or isinstance(p, list) or isinstance(p, np.ndarray) - ): - params[idx] = np.array(p) - else: - raise ValueError("data type is not float or array_like object") - - alpha, qv_low, qv_median, qv_high = params[:] + alpha, qv_low, qv_median, qv_high = self._get_bc_params( + alpha, qv_low, qv_median, qv_high + ) + if index is None: index = pd.RangeIndex(qv_low.shape[0]) self.index = index @@ -336,18 +328,10 @@ def __init__( from cyclic_boosting.quantile_matching import J_QPD_extended_B - params = [alpha, qv_low, qv_median, qv_high] - for idx, p in enumerate(params): - if isinstance(p, float): - params[idx] = np.array([p]) - elif ( - isinstance(p, tuple) or isinstance(p, list) or isinstance(p, np.ndarray) - ): - params[idx] = np.array(p) - else: - raise ValueError("data type is not float or array_like object") - - alpha, qv_low, qv_median, qv_high = params[:] + alpha, qv_low, qv_median, qv_high = self._get_bc_params( + alpha, qv_low, qv_median, qv_high + ) + if index is None: index = pd.RangeIndex(qv_low.shape[0]) self.index = index @@ -567,18 +551,10 @@ def __init__( from cyclic_boosting.quantile_matching import J_QPD_extended_U - params = [alpha, qv_low, qv_median, qv_high] - for idx, p in enumerate(params): - if isinstance(p, float): - params[idx] = np.array([p]) - elif ( - isinstance(p, tuple) or isinstance(p, list) or isinstance(p, np.ndarray) - ): - params[idx] = np.array(p) - else: - raise ValueError("data type is not float or array_like object") - - alpha, qv_low, qv_median, qv_high = params[:] + alpha, qv_low, qv_median, qv_high = self._get_bc_params( + alpha, qv_low, qv_median, qv_high + ) + if index is None: index = pd.RangeIndex(qv_low.shape[0]) self.index = index From e9d36412dd5fea21be9fe685b6af14d8f67de1d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 23:07:04 +0100 Subject: [PATCH 25/52] broadcast for qpd status quo --- skpro/distributions/base.py | 17 +++++++++++++++-- skpro/distributions/qpd.py | 6 +++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/skpro/distributions/base.py b/skpro/distributions/base.py index 82791962a..9196fcd79 100644 --- a/skpro/distributions/base.py +++ b/skpro/distributions/base.py @@ -168,7 +168,7 @@ def _method_error_msg(self, method="this method", severity="warn", fill_in=None) else: return msg - def _get_bc_params(self, *args, dtype=None): + def _get_bc_params(self, *args, dtype=None, oned_as="row"): """Fully broadcast tuple of parameters given param shapes and index, columns. Parameters @@ -180,6 +180,9 @@ def _get_bc_params(self, *args, dtype=None): dtype : str, optional broadcasted arrays are cast to all have datatype `dtype`. If None, then no datatype casting is done. + oned_as : str, optional, "row" (default) or "col" + If 'row', then 1D arrays are treated as row vectors. If 'column', then 1D + arrays are treated as column vectors. Returns ------- @@ -196,11 +199,21 @@ def _get_bc_params(self, *args, dtype=None): args = tuple(params.values()) number_of_params = len(args) + def row_to_col(arr): + """Convert 1D arrays to 2D col arrays, leave 2D arrays unchanged.""" + if arr.ndim == 1: + return arr.reshape(-1, 1) + return arr + + args_as_np = [np.array(arg) for arg in args] + if oned_as == "col": + args_as_np = [row_to_col(arg) for arg in args_as_np] + if hasattr(self, "index") and self.index is not None: args += (self.index.to_numpy().reshape(-1, 1),) if hasattr(self, "columns") and self.columns is not None: args += (self.columns.to_numpy(),) - bc = np.broadcast_arrays(*args) + bc = np.broadcast_arrays(*args_as_np) if dtype is not None: bc = [array.astype(dtype) for array in bc] return bc[:number_of_params] diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index fc0132a7a..5e7bd7258 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -101,7 +101,7 @@ def __init__( from cyclic_boosting.quantile_matching import J_QPD_extended_S alpha, qv_low, qv_median, qv_high = self._get_bc_params( - alpha, qv_low, qv_median, qv_high + alpha, qv_low, qv_median, qv_high, oned_as="col" ) if index is None: @@ -329,7 +329,7 @@ def __init__( from cyclic_boosting.quantile_matching import J_QPD_extended_B alpha, qv_low, qv_median, qv_high = self._get_bc_params( - alpha, qv_low, qv_median, qv_high + alpha, qv_low, qv_median, qv_high, oned_as="col" ) if index is None: @@ -552,7 +552,7 @@ def __init__( from cyclic_boosting.quantile_matching import J_QPD_extended_U alpha, qv_low, qv_median, qv_high = self._get_bc_params( - alpha, qv_low, qv_median, qv_high + alpha, qv_low, qv_median, qv_high, oned_as="col" ) if index is None: From a1b4b6be5ff001b544b18bf721e4e408b7d163d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 23:16:54 +0100 Subject: [PATCH 26/52] fix qpd --- skpro/distributions/base.py | 4 ++-- skpro/distributions/qpd.py | 18 ++++++++++++------ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/skpro/distributions/base.py b/skpro/distributions/base.py index 9196fcd79..5b5bd2dbd 100644 --- a/skpro/distributions/base.py +++ b/skpro/distributions/base.py @@ -210,9 +210,9 @@ def row_to_col(arr): args_as_np = [row_to_col(arg) for arg in args_as_np] if hasattr(self, "index") and self.index is not None: - args += (self.index.to_numpy().reshape(-1, 1),) + args_as_np += (self.index.to_numpy().reshape(-1, 1),) if hasattr(self, "columns") and self.columns is not None: - args += (self.columns.to_numpy(),) + args_as_np += (self.columns.to_numpy(),) bc = np.broadcast_arrays(*args_as_np) if dtype is not None: bc = [array.astype(dtype) for array in bc] diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 5e7bd7258..5e4ef18af 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -105,11 +105,11 @@ def __init__( ) if index is None: - index = pd.RangeIndex(qv_low.shape[0]) + index = pd.RangeIndex(alpha.shape[0]) self.index = index if columns is None: - columns = pd.RangeIndex(1) + columns = pd.RangeIndex(alpha.shape[1]) self.columns = columns if version == "normal": @@ -228,6 +228,8 @@ def get_test_params(cls, parameter_set="default"): "qv_low": 0.2, "qv_median": 0.5, "qv_high": 0.8, + "index": pd.Index([1, 2, 5]), + "columns": pd.Index(["a"]), } params2 = { "alpha": 0.2, @@ -333,11 +335,11 @@ def __init__( ) if index is None: - index = pd.RangeIndex(qv_low.shape[0]) + index = pd.RangeIndex(alpha.shape[0]) self.index = index if columns is None: - columns = pd.RangeIndex(1) + columns = pd.RangeIndex(alpha.shape[1]) self.columns = columns if version == "normal": @@ -459,6 +461,8 @@ def get_test_params(cls, parameter_set="default"): "qv_high": 0.8, "lower": 0.0, "upper": 1.0, + "index": pd.Index([1, 2, 5]), + "columns": pd.Index(["a"]), } params2 = { "alpha": 0.2, @@ -556,11 +560,11 @@ def __init__( ) if index is None: - index = pd.RangeIndex(qv_low.shape[0]) + index = pd.RangeIndex(alpha.shape[0]) self.index = index if columns is None: - columns = pd.RangeIndex(1) + columns = pd.RangeIndex(alpha.shape[1]) self.columns = columns if version == "normal": @@ -678,6 +682,8 @@ def get_test_params(cls, parameter_set="default"): "qv_low": 0.2, "qv_median": 0.5, "qv_high": 0.8, + "index": pd.Index([1, 2, 5]), + "columns": pd.Index(["a"]), } params2 = { "alpha": 0.2, From ea0eb960058d681932ac17b21f988b17f888aa27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 23:33:12 +0100 Subject: [PATCH 27/52] flatten --- skpro/distributions/qpd.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 5e4ef18af..ccd3619cd 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -103,6 +103,10 @@ def __init__( alpha, qv_low, qv_median, qv_high = self._get_bc_params( alpha, qv_low, qv_median, qv_high, oned_as="col" ) + alpha = alpha.flatten() + qv_low = qv_low.flatten() + qv_median = qv_median.flatten() + qv_high = qv_high.flatten() if index is None: index = pd.RangeIndex(alpha.shape[0]) @@ -333,6 +337,10 @@ def __init__( alpha, qv_low, qv_median, qv_high = self._get_bc_params( alpha, qv_low, qv_median, qv_high, oned_as="col" ) + alpha = alpha.flatten() + qv_low = qv_low.flatten() + qv_median = qv_median.flatten() + qv_high = qv_high.flatten() if index is None: index = pd.RangeIndex(alpha.shape[0]) @@ -558,6 +566,10 @@ def __init__( alpha, qv_low, qv_median, qv_high = self._get_bc_params( alpha, qv_low, qv_median, qv_high, oned_as="col" ) + alpha = alpha.flatten() + qv_low = qv_low.flatten() + qv_median = qv_median.flatten() + qv_high = qv_high.flatten() if index is None: index = pd.RangeIndex(alpha.shape[0]) From e7e13d48758a4f221b1ee1f01e2793f118a8c629 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Fri, 19 Apr 2024 09:05:54 +0100 Subject: [PATCH 28/52] move to one loc, treat alpha --- skpro/distributions/qpd.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index ccd3619cd..0866e9971 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -100,13 +100,9 @@ def __init__( from cyclic_boosting.quantile_matching import J_QPD_extended_S - alpha, qv_low, qv_median, qv_high = self._get_bc_params( - alpha, qv_low, qv_median, qv_high, oned_as="col" + alpha, qv_low, qv_median, qv_high = _prep_qpd_params( + self, alpha, qv_low, qv_median, qv_high, oned_as="col" ) - alpha = alpha.flatten() - qv_low = qv_low.flatten() - qv_median = qv_median.flatten() - qv_high = qv_high.flatten() if index is None: index = pd.RangeIndex(alpha.shape[0]) @@ -334,13 +330,9 @@ def __init__( from cyclic_boosting.quantile_matching import J_QPD_extended_B - alpha, qv_low, qv_median, qv_high = self._get_bc_params( - alpha, qv_low, qv_median, qv_high, oned_as="col" + alpha, qv_low, qv_median, qv_high = _prep_qpd_params( + self, alpha, qv_low, qv_median, qv_high, oned_as="col" ) - alpha = alpha.flatten() - qv_low = qv_low.flatten() - qv_median = qv_median.flatten() - qv_high = qv_high.flatten() if index is None: index = pd.RangeIndex(alpha.shape[0]) @@ -563,13 +555,9 @@ def __init__( from cyclic_boosting.quantile_matching import J_QPD_extended_U - alpha, qv_low, qv_median, qv_high = self._get_bc_params( - alpha, qv_low, qv_median, qv_high, oned_as="col" + alpha, qv_low, qv_median, qv_high = _prep_qpd_params( + self, alpha, qv_low, qv_median, qv_high, oned_as="col" ) - alpha = alpha.flatten() - qv_low = qv_low.flatten() - qv_median = qv_median.flatten() - qv_high = qv_high.flatten() if index is None: index = pd.RangeIndex(alpha.shape[0]) @@ -721,3 +709,16 @@ def var_func(x, mu, qpd): # TODO: scipy.integrate will be removed in scipy 1.12.0 pdf = derivative(qpd.cdf, x, dx=1e-6) return ((x - mu) ** 2) * pdf + + +def _prep_qpd_params(self, alpha, qv_low, qv_median, qv_high): + """Prepare parameters for Johnson Quantile-Parameterized Distributions.""" + if not isinstance(alpha, np.ndarray): + alpha = np.array([alpha]) + qv_low, qv_median, qv_high = BaseDistribution._get_bc_params( + self, qv_low, qv_median, qv_high, oned_as="col" + ) + qv_low = qv_low.flatten() + qv_median = qv_median.flatten() + qv_high = qv_high.flatten() + return alpha, qv_low, qv_median, qv_high From 323e58c4ebdf6461aeab702279aa3a03f6600d7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Fri, 19 Apr 2024 09:14:35 +0100 Subject: [PATCH 29/52] Update qpd.py --- skpro/distributions/qpd.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 0866e9971..94981104e 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -101,7 +101,7 @@ def __init__( from cyclic_boosting.quantile_matching import J_QPD_extended_S alpha, qv_low, qv_median, qv_high = _prep_qpd_params( - self, alpha, qv_low, qv_median, qv_high, oned_as="col" + self, alpha, qv_low, qv_median, qv_high ) if index is None: @@ -331,7 +331,7 @@ def __init__( from cyclic_boosting.quantile_matching import J_QPD_extended_B alpha, qv_low, qv_median, qv_high = _prep_qpd_params( - self, alpha, qv_low, qv_median, qv_high, oned_as="col" + self, alpha, qv_low, qv_median, qv_high ) if index is None: @@ -556,7 +556,7 @@ def __init__( from cyclic_boosting.quantile_matching import J_QPD_extended_U alpha, qv_low, qv_median, qv_high = _prep_qpd_params( - self, alpha, qv_low, qv_median, qv_high, oned_as="col" + self, alpha, qv_low, qv_median, qv_high ) if index is None: From f06963abd1dab4187e34058192a3dd49ca00544e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Fri, 19 Apr 2024 09:25:03 +0100 Subject: [PATCH 30/52] Update qpd.py --- skpro/distributions/qpd.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 94981104e..ec17f9dc3 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -109,7 +109,7 @@ def __init__( self.index = index if columns is None: - columns = pd.RangeIndex(alpha.shape[1]) + columns = pd.RangeIndex(1) self.columns = columns if version == "normal": @@ -339,7 +339,7 @@ def __init__( self.index = index if columns is None: - columns = pd.RangeIndex(alpha.shape[1]) + columns = pd.RangeIndex(1) self.columns = columns if version == "normal": @@ -564,7 +564,7 @@ def __init__( self.index = index if columns is None: - columns = pd.RangeIndex(alpha.shape[1]) + columns = pd.RangeIndex(1) self.columns = columns if version == "normal": From 770cf3f9ea8bb3bf25fca5f6e6fa25687dc61d0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Fri, 19 Apr 2024 09:41:01 +0100 Subject: [PATCH 31/52] Update qpd.py --- skpro/distributions/qpd.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index ec17f9dc3..690ba538c 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -105,7 +105,7 @@ def __init__( ) if index is None: - index = pd.RangeIndex(alpha.shape[0]) + index = pd.RangeIndex(qv_low.shape[0]) self.index = index if columns is None: @@ -335,7 +335,7 @@ def __init__( ) if index is None: - index = pd.RangeIndex(alpha.shape[0]) + index = pd.RangeIndex(qv_low.shape[0]) self.index = index if columns is None: @@ -560,7 +560,7 @@ def __init__( ) if index is None: - index = pd.RangeIndex(alpha.shape[0]) + index = pd.RangeIndex(qv_low.shape[0]) self.index = index if columns is None: From e4c947db71b88523287d9456ac52f2fa570226f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Fri, 19 Apr 2024 10:08:48 +0100 Subject: [PATCH 32/52] Update test_all_distrs.py --- skpro/distributions/tests/test_all_distrs.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/skpro/distributions/tests/test_all_distrs.py b/skpro/distributions/tests/test_all_distrs.py index d1b18a76f..84cb5ca59 100644 --- a/skpro/distributions/tests/test_all_distrs.py +++ b/skpro/distributions/tests/test_all_distrs.py @@ -59,6 +59,11 @@ def _has_capability(distr, method): class TestAllDistributions(PackageConfig, DistributionFixtureGenerator, QuickTester): """Module level tests for all skpro parameter fitters.""" + # TEMPORARY skip for CyclicBoosting and QPD classes + # due to silent failures on main, se #190 + exclude_objects = ["QPD_B"] + # remove this when fixing failures to re-enable testing + def test_shape(self, object_instance): """Test index, columns, len and shape of distribution.""" d = object_instance From ddddc95ec8c2df61ca9d12e13dc00da5604e09a6 Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Thu, 25 Apr 2024 15:19:31 +0900 Subject: [PATCH 33/52] remove findiff, It is included in cyclic-boosting --- pyproject.toml | 1 - skpro/distributions/qpd.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4dc064c01..e6a799bf9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,6 @@ all_extras = [ "tabulate", "uncertainties", "cyclic-boosting>=1.4.0; python_version < '3.12'", - "findiff" ] dev = [ diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 47d9c09f7..e9f378fac 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -68,7 +68,7 @@ class QPD_S(BaseDistribution): # -------------- "authors": ["setoguchi-naoki", "felix-wick"], "maintainers": ["setoguchi-naoki"], - "python_dependencies": ["cyclic_boosting>=1.4.0", "findiff"], + "python_dependencies": ["cyclic_boosting>=1.4.0"], # estimator tags # -------------- "capabilities:approx": [], @@ -273,7 +273,7 @@ class QPD_B(BaseDistribution): # -------------- "authors": ["setoguchi-naoki", "felix-wick"], "maintainers": ["setoguchi-naoki"], - "python_dependencies": ["cyclic_boosting>=1.4.0", "findiff"], + "python_dependencies": ["cyclic_boosting>=1.4.0"], # estimator tags # -------------- "capabilities:approx": [], From db033e05d069c46be308b5d6ac4ac39a382b165d Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Mon, 29 Apr 2024 19:52:28 +0900 Subject: [PATCH 34/52] put back QPD_U --- skpro/distributions/qpd.py | 344 +++++++++++++++++++++----- skpro/distributions/tests/test_qpd.py | 19 +- 2 files changed, 301 insertions(+), 62 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index e9f378fac..396f23598 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -11,9 +11,10 @@ import typing import warnings -from typing import Sequence if typing.TYPE_CHECKING: + from typing import Sequence, Optional + from cyclic_boosting.quantile_matching import J_QPD_S, J_QPD_B from pandas import DataFrame, Index @@ -101,16 +102,10 @@ def __init__( from cyclic_boosting.quantile_matching import J_QPD_S - params = [alpha, qv_low, qv_median, qv_high] - for idx, p in enumerate(params): - if isinstance(p, float): - params[idx] = np.array([p]) - elif isinstance(p, (tuple, list, np.ndarray)): - params[idx] = np.array(p) - else: - raise ValueError("data type is not float or array_like object") + alpha, qv_low, qv_median, qv_high = _prep_qpd_params( + self, alpha, qv_low, qv_median, qv_high + ) - alpha, qv_low, qv_median, qv_high = params[:] if index is None: index = pd.RangeIndex(qv_low.shape[0]) self.index = index @@ -164,7 +159,9 @@ def mean(self, lower: float = None, upper: float = None): lower = self.lower if not upper: upper = 1e3 - loc = exp_func(lower, upper, self.qpd, self.index.shape[0]) + x = np.linspace(lower, upper, num=int(1e3)) + cdf_arr = self.qpd.cdf(x).T + loc = exp_func(x, cdf_arr, self.index.shape[0]) return pd.DataFrame(loc, index=self.index, columns=self.columns) def var(self, lower: float = None, upper: float = None): @@ -180,7 +177,9 @@ def var(self, lower: float = None, upper: float = None): if not upper: upper = 1e3 mean = self.mean(lower, upper).values - var = var_func(mean, lower, upper, self.qpd, self.index.shape[0]) + x = np.linspace(lower, upper, num=int(1e3)) + cdf_arr = self.qpd.cdf(x).T + var = var_func(x, mean, cdf_arr, self.index.shape[0]) return pd.DataFrame(var, index=self.index, columns=self.columns) def pdf(self, x: pd.DataFrame): @@ -189,18 +188,15 @@ def pdf(self, x: pd.DataFrame): this fucntion transform cdf to pdf because j-qpd's pdf calculation is bit complex """ - pdf = pdf_func(x, self.qpd, self.index) - return pd.DataFrame(pdf, index=x.index, columns=x.columns) + return pdf_func(x, self.qpd, self.index) def ppf(self, p: pd.DataFrame): """Quantile function = percent point function = inverse cdf.""" - ppf = ppf_func(p, self.qpd, self.index) - return pd.DataFrame(ppf, index=p.index, columns=p.columns) + return ppf_func(p, self.qpd, self.index) def cdf(self, x: pd.DataFrame): """Cumulative distribution function.""" - cdf = cdf_func(x, self.qpd, self.index) - return pd.DataFrame(cdf, index=x.index, columns=x.columns) + return cdf_func(x, self.qpd, self.index) @classmethod def get_test_params(cls, parameter_set="default"): @@ -212,6 +208,8 @@ def get_test_params(cls, parameter_set="default"): "qv_median": 0.0, "qv_high": 0.3, "lower": -0.5, + "index": pd.RangeIndex(3), + "columns": pd.Index(["a"]), } params2 = { "alpha": 0.2, @@ -220,7 +218,7 @@ def get_test_params(cls, parameter_set="default"): "qv_median": [0.0, 0.0, 0.0], "qv_high": [0.3, 0.3, 0.3], "lower": -0.5, - "index": pd.Index([1, 2, 5]), + "index": pd.RangeIndex(3), "columns": pd.Index(["a"]), } return [params1, params2] @@ -308,16 +306,10 @@ def __init__( from cyclic_boosting.quantile_matching import J_QPD_B - params = [alpha, qv_low, qv_median, qv_high] - for idx, p in enumerate(params): - if isinstance(p, float): - params[idx] = np.array([p]) - elif isinstance(p, (tuple, list, np.ndarray)): - params[idx] = np.array(p) - else: - raise ValueError("data type is not float or array_like object") + alpha, qv_low, qv_median, qv_high = _prep_qpd_params( + self, alpha, qv_low, qv_median, qv_high + ) - alpha, qv_low, qv_median, qv_high = params[:] if index is None: index = pd.RangeIndex(qv_low.shape[0]) self.index = index @@ -372,7 +364,9 @@ def mean(self, lower: float = None, upper: float = None): lower = self.lower if not upper: upper = self.upper - loc = exp_func(lower, upper, self.qpd, self.index.shape[0]) + x = np.linspace(lower, upper, num=int(1e3)) + cdf_arr = self.qpd.cdf(x).T + loc = exp_func(x, cdf_arr, self.index.shape[0]) return pd.DataFrame(loc, index=self.index, columns=self.columns) def var(self, lower: float = None, upper: float = None): @@ -386,9 +380,11 @@ def var(self, lower: float = None, upper: float = None): if not lower: lower = self.lower if not upper: - upper = self.upper + upper = 1e3 mean = self.mean(lower, upper).values - var = var_func(mean, lower, upper, self.qpd, self.index.shape[0]) + x = np.linspace(lower, upper, num=int(1e3)) + cdf_arr = self.qpd.cdf(x).T + var = var_func(x, mean, cdf_arr, self.index.shape[0]) return pd.DataFrame(var, index=self.index, columns=self.columns) def pdf(self, x: pd.DataFrame): @@ -397,18 +393,15 @@ def pdf(self, x: pd.DataFrame): this fucntion transform cdf to pdf because j-qpd's pdf calculation is bit complex """ - pdf = pdf_func(x, self.qpd, self.index) - return pd.DataFrame(pdf, index=x.index, columns=x.columns) + return pdf_func(x, self.qpd, self.index) def ppf(self, p: pd.DataFrame): """Quantile function = percent point function = inverse cdf.""" - ppf = ppf_func(p, self.qpd, self.index) - return pd.DataFrame(ppf, index=p.index, columns=p.columns) + return ppf_func(p, self.qpd, self.index) def cdf(self, x: pd.DataFrame): """Cumulative distribution function.""" - cdf = cdf_func(x, self.qpd, self.index) - return pd.DataFrame(cdf, index=x.index, columns=x.columns) + return cdf_func(x, self.qpd, self.index) @classmethod def get_test_params(cls, parameter_set="default"): @@ -421,6 +414,8 @@ def get_test_params(cls, parameter_set="default"): "qv_high": 0.3, "lower": -0.5, "upper": 0.5, + "index": pd.RangeIndex(3), + "columns": pd.Index(["a"]), } params2 = { "alpha": 0.2, @@ -430,70 +425,278 @@ def get_test_params(cls, parameter_set="default"): "qv_high": [0.3, 0.3, 0.3], "lower": -0.5, "upper": 0.5, - "index": pd.Index([1, 2, 5]), + "index": pd.RangeIndex(3), "columns": pd.Index(["a"]), } return [params1, params2] -def calc_pdf(x: np.ndarray, qpd: J_QPD_S | J_QPD_B) -> np.ndarray: +class QPD_U(BaseDistribution): + """Johnson Quantile-Parameterized Distributions with bounded mode. + see https://repositories.lib.utexas.edu/bitstream/handle/2152 + /63037/HADLOCK-DISSERTATION-2017.pdf + (Due to the Python keyword, the parameter lambda from + this reference is named kappa below). + A distribution is parameterized by a symmetric-percentile triplet (SPT). + + Parameters + ---------- + alpha : float + lower quantile of SPT (upper is ``1 - alpha``) + qv_low : float or array_like[float] + quantile function value of ``alpha`` + qv_median : float or array_like[float] + quantile function value of quantile 0.5 + qv_high : float or array_like[float] + quantile function value of quantile ``1 - alpha`` + version: str + options are ``normal`` (default) or ``logistic`` + dist_shape: str + parameter modifying the logistic base distribution via + sinh/arcsinh-scaling (only active in sinhlogistic version) + + Example + ------- + >>> from skpro.distributions.qpd import QPD_U # doctest: +SKIP + + >>> qpd = QPD_U( + ... alpha=0.2, + ... qv_low=[1, 2], + ... qv_median=[3, 4], + ... qv_high=[5, 6], + ... ) # doctest: +SKIP + + >>> qpd.mean() # doctest: +SKIP + """ + + _tags = { + # packaging info + # -------------- + "authors": ["setoguchi-naoki", "felix-wick"], + "maintainers": ["setoguchi-naoki"], + "python_dependencies": "cyclic_boosting>=1.2.5", + # estimator tags + # -------------- + "capabilities:approx": [], + "capabilities:exact": ["mean", "var", "cdf", "ppf"], + "distr:measuretype": "continuous", + } + + def __init__( + self, + alpha: float, + qv_low: float | Sequence, + qv_median: float | Sequence, + qv_high: float | Sequence, + version: str | None = "normal", + dist_shape: Optional[float] = 0.0, + index=None, + columns=None, + ): + self.qpd = [] + self.alpha = alpha + self.qv_low = qv_low + self.qv_median = qv_median + self.qv_high = qv_high + self.version = version + self.dist_shape = dist_shape + self.index = index + self.columns = columns + + super().__init__(index=index, columns=columns) + + from cyclic_boosting.quantile_matching import J_QPD_extended_U + + alpha, qv_low, qv_median, qv_high = _prep_qpd_params( + self, alpha, qv_low, qv_median, qv_high + ) + + if index is None: + index = pd.RangeIndex(qv_low.shape[0]) + self.index = index + + if columns is None: + columns = pd.RangeIndex(1) + self.columns = columns + + if version == "normal": + self.phi = norm() + elif version == "logistic": + self.phi = logistic() + else: + raise Exception("Invalid version.") + + if (np.any(qv_low > qv_median)) or np.any(qv_high < qv_median): + warnings.warn( + "The SPT values are not monotonically increasing, " + "each SPT is sorted by value", + stacklevel=2, + ) + idx = np.where((qv_low > qv_median), True, False) + np.where( + (qv_high < qv_median), True, False + ) + un_orderd_idx = np.argwhere(idx > 0).tolist() + warnings.warn(f"sorted index {un_orderd_idx}", stacklevel=2) + for idx in un_orderd_idx: + low, mid, high = sorted([qv_low[idx], qv_median[idx], qv_high[idx]]) + qv_low[idx] = low + qv_median[idx] = mid + qv_high[idx] = high + + iter = np.nditer(qv_low, flags=["c_index"]) + for _i in iter: + jqpd = J_QPD_extended_U( + alpha=alpha, + qv_low=qv_low[iter.index], + qv_median=qv_median[iter.index], + qv_high=qv_high[iter.index], + version=version, + shape=dist_shape, + ) + self.qpd.append(jqpd) + self.qpd = pd.DataFrame(self.qpd, index=self.index) + + def mean(self, lower: float = -1e3, upper: float = 1e3): + """Return expected value of the distribution. + + Returns + ------- + pd.DataFrame with same rows, columns as `self` + expected value of distribution (entry-wise) + """ + cdf_arr = [] + x = np.linspace(lower, upper, num=int(1e3)) + for idx in self.index: + qpd = self.qpd.loc[idx, :].values[0] + cdf_arr.append(qpd.cdf(x)) + cdf_arr = np.asarray(cdf_arr) + loc = exp_func(x, cdf_arr, self.index.shape[0]) + return pd.DataFrame(loc, index=self.index, columns=self.columns) + + def var(self, lower: float = -1e3, upper: float = 1e3): + """Return element/entry-wise variance of the distribution. + + Returns + ------- + pd.DataFrame with same rows, columns as `self` + variance of distribution (entry-wise) + """ + mean_arr = self.mean(lower, upper).values + cdf_arr = [] + x = np.linspace(lower, upper, num=int(1e3)) + for idx in self.index: + qpd = self.qpd.loc[idx, :].values[0] + cdf_arr.append(qpd.cdf(x)) + cdf_arr = np.asarray(cdf_arr) + var_arr = var_func(x, mean_arr, cdf_arr, self.index.shape[0]) + return pd.DataFrame(var_arr, index=self.index, columns=self.columns) + + def pdf(self, x: pd.DataFrame): + """Probability density function. + + this fucntion transform cdf to pdf + because j-qpd's pdf calculation is bit complex + """ + return pdf_func(x, self.qpd, self.index) + + def ppf(self, p: pd.DataFrame): + """Quantile function = percent point function = inverse cdf.""" + return ppf_func(p, self.qpd, self.index) + + def cdf(self, x: pd.DataFrame): + """Cumulative distribution function.""" + return cdf_func(x, self.qpd, self.index) + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator.""" + params1 = { + "alpha": 0.2, + "version": "normal", + "qv_low": 0.2, + "qv_median": 0.5, + "qv_high": 0.8, + "index": pd.RangeIndex(3), + "columns": pd.Index(["a"]), + } + params2 = { + "alpha": 0.2, + "version": "normal", + "qv_low": [0.2, 0.2, 0.2], + "qv_median": [0.5, 0.5, 0.5], + "qv_high": [0.8, 0.8, 0.8], + "index": pd.RangeIndex(3), + "columns": pd.Index(["a"]), + } + return [params1, params2] + + +def calc_pdf(x: np.ndarray, cdf: np.ndarray) -> np.ndarray: """Return pdf value for all samples.""" from findiff import FinDiff dx = x[1] - x[0] derivative = FinDiff(1, dx, 1) - cdf = qpd.cdf(x).T if cdf.ndim < 2: cdf = cdf[np.newaxis, :] pdf = np.asarray(derivative(cdf)) return pdf -def exp_func(lower: float, upper: float, qpd: J_QPD_S | J_QPD_B, size: int): +def exp_func(x: np.ndarray, cdf: np.ndarray, size: int): """Return Expectation.""" - x = np.linspace(lower, upper, num=int(1e3)) - pdf_arr = calc_pdf(x, qpd) + pdf_arr = calc_pdf(x, cdf) x = np.tile(x, (size, 1)) - loc_arr = np.trapz(x * pdf_arr, x, dx=1e-6, axis=1) - return loc_arr + loc = np.trapz(x * pdf_arr, x, dx=1e-6, axis=1) + return loc -def var_func( - mu: np.ndarray, lower: float, upper: float, qpd: J_QPD_S | J_QPD_B, size: int -): +def var_func(x: np.ndarray, mu: np.ndarray, cdf: np.ndarray, size: int): """Return Variance.""" - x = np.linspace(lower, upper, num=int(1e3)) - pdf_arr = calc_pdf(x, qpd) + pdf_arr = calc_pdf(x, cdf) x = np.tile(x, (size, 1)) - var_arr = np.trapz(((x - mu) ** 2) * pdf_arr, x, dx=1e-6, axis=1) - return var_arr + var = np.trapz(((x - mu) ** 2) * pdf_arr, x, dx=1e-6, axis=1) + return var -def pdf_func(x: DataFrame, qpd: J_QPD_S | J_QPD_B, index: Index): +def pdf_func(x: DataFrame, dist: J_QPD_S | J_QPD_B | pd.DataFrame, index: Index): """Return pdf value.""" - x_value = np.unique(x.values) + qpd = dist.values if isinstance(dist, pd.DataFrame) else dist + prob_var = np.unique(x.values) pdf = np.zeros((x.index.shape[0], len(x.columns))) - for v in x_value: + for v in prob_var: + # all qpds x0 = np.linspace(v, v + 1e-3, num=3) - pdf_arr = calc_pdf(x0, qpd)[:, 0] + if isinstance(dist, pd.DataFrame): + cdf_arr = np.asarray([func[0].cdf(x0) for func in qpd]) + else: + cdf_arr = qpd.cdf(x0).T + pdf_arr = calc_pdf(x0, cdf_arr)[:, 0] if pdf_arr.ndim < 1: pdf_arr = pdf_arr[np.newaxis] + # pick up rows, cols = np.where(x.values == v) for r, c in zip(rows, cols): id = x.index[r] target = index.get_loc(id) pdf[r][c] = pdf_arr[target] - return pdf + return pd.DataFrame(pdf, index=x.index, columns=x.columns) -def ppf_func(x: DataFrame, qpd: J_QPD_S | J_QPD_B, index: Index): +def ppf_func(x: DataFrame, dist: J_QPD_S | J_QPD_B | pd.DataFrame, index: Index): """Return ppf value.""" + qpd = dist.values if isinstance(dist, pd.DataFrame) else dist quantiles = np.unique(x.values) ppf = np.zeros((x.index.shape[0], len(x.columns))) for q in quantiles: - ppf_arr = qpd.ppf(q).T + # all qpds + if isinstance(dist, pd.DataFrame): + ppf_arr = np.asarray([func[0].ppf(q) for func in qpd]) + else: + ppf_arr = qpd.ppf(q).T if ppf_arr.ndim < 1: ppf_arr = ppf_arr[np.newaxis] + # pick up rows, cols = np.where(x.values == q) for r, c in zip(rows, cols): id = x.index[r] @@ -502,17 +705,36 @@ def ppf_func(x: DataFrame, qpd: J_QPD_S | J_QPD_B, index: Index): return pd.DataFrame(ppf, index=x.index, columns=x.columns) -def cdf_func(x: DataFrame, qpd: J_QPD_S | J_QPD_B, index: Index): +def cdf_func(x: DataFrame, dist: J_QPD_S | J_QPD_B | pd.DataFrame, index: Index): """Return cdf value.""" + qpd = dist.values if isinstance(dist, pd.DataFrame) else dist x_value = np.unique(x.values) cdf = np.zeros((x.index.shape[0], len(x.columns))) for v in x_value: - cdf_arr = qpd.cdf(v).T + # all qpds + if isinstance(dist, pd.DataFrame): + cdf_arr = np.asarray([func[0].cdf(v) for func in qpd]) + else: + cdf_arr = qpd.cdf(v).T if cdf_arr.ndim < 1: cdf_arr = cdf_arr[np.newaxis] + # pick up rows, cols = np.where(x.values == v) for r, c in zip(rows, cols): id = x.index[r] target = index.get_loc(id) cdf[r][c] = cdf_arr[target] return pd.DataFrame(cdf, index=x.index, columns=x.columns) + + +def _prep_qpd_params(self, alpha, qv_low, qv_median, qv_high): + """Prepare parameters for Johnson Quantile-Parameterized Distributions.""" + if not isinstance(alpha, np.ndarray): + alpha = np.array([alpha]) + qv_low, qv_median, qv_high = BaseDistribution._get_bc_params( + self, qv_low, qv_median, qv_high, oned_as="col" + ) + qv_low = qv_low.flatten() + qv_median = qv_median.flatten() + qv_high = qv_high.flatten() + return alpha, qv_low, qv_median, qv_high diff --git a/skpro/distributions/tests/test_qpd.py b/skpro/distributions/tests/test_qpd.py index 0df932515..88719fec1 100644 --- a/skpro/distributions/tests/test_qpd.py +++ b/skpro/distributions/tests/test_qpd.py @@ -2,7 +2,8 @@ import pytest -from skpro.distributions.qpd import QPD_B, QPD_S +from skpro.distributions.qpd import QPD_B, QPD_S, QPD_U + from skpro.tests.test_switch import run_test_for_class @@ -39,3 +40,19 @@ def test_qpd_s_simple_use(): ) qpd.mean() + + +@pytest.mark.skipif( + not run_test_for_class(QPD_U), + reason="run test only if softdeps are present and incrementally (if requested)", +) +def test_qpd_u_simple_use(): + """Test simple use of qpd with un-bounded mode.""" + qpd = QPD_U( + alpha=0.2, + qv_low=[1, 2], + qv_median=[3, 4], + qv_high=[5, 6], + ) + + qpd.mean() From 2e3d9e76e84aacf2f2f705196046f6c0939e3337 Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Mon, 29 Apr 2024 19:53:15 +0900 Subject: [PATCH 35/52] put back QPD_U class --- skpro/regression/cyclic_boosting.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/skpro/regression/cyclic_boosting.py b/skpro/regression/cyclic_boosting.py index 00d0e55e8..42b6a33e5 100644 --- a/skpro/regression/cyclic_boosting.py +++ b/skpro/regression/cyclic_boosting.py @@ -18,7 +18,7 @@ import numpy as np import pandas as pd -from skpro.distributions.qpd import QPD_B, QPD_S +from skpro.distributions.qpd import QPD_B, QPD_S, QPD_U from skpro.regression.base import BaseProbaRegressor @@ -290,8 +290,10 @@ def _predict_proba(self, X): params["lower"] = self.lower params["upper"] = self.upper qpd = QPD_B(**params) + elif self.bound == "U": + qpd = QPD_U(**params) else: - raise ValueError("bound need to be 'S' or 'B'") + raise ValueError("bound need to be 'S' or 'B' or 'U") return qpd From 027794686d627aab01a6d1131713fa1f8d5897a3 Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Mon, 29 Apr 2024 21:35:14 +0900 Subject: [PATCH 36/52] Consolidated interface to be QPD Johnson --- skpro/distributions/qpd.py | 43 +++++++++------ skpro/regression/cyclic_boosting.py | 54 ++++++++++--------- .../regression/tests/test_cyclic_boosting.py | 1 - 3 files changed, 56 insertions(+), 42 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 41c149758..cbebeac73 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -13,7 +13,7 @@ import warnings if typing.TYPE_CHECKING: - from typing import Sequence, Optional + from typing import Sequence, Optional, Union from cyclic_boosting.quantile_matching import J_QPD_S, J_QPD_B from pandas import DataFrame, Index @@ -94,9 +94,9 @@ class QPD_Johnson(_DelegatedDistribution): def __init__( self, alpha: float, - qv_low: float or object, - qv_median: float or object, - qv_high: float or object, + qv_low: Union[float, Sequence], + qv_median: Union[float, Sequence], + qv_high: Union[float, Sequence], lower: Optional[float] = None, upper: Optional[float] = None, version: Optional[str] = "normal", @@ -117,7 +117,7 @@ def __init__( if lower is None: delegate_cls = QPD_U - extra_params = {} + extra_params = {"dist_shape": dist_shape} elif upper is None: delegate_cls = QPD_S extra_params = {"lower": lower} @@ -131,7 +131,6 @@ def __init__( "qv_median": qv_median, "qv_high": qv_high, "version": version, - "dist_shape": dist_shape, "index": index, "columns": columns, **extra_params, @@ -238,9 +237,9 @@ class QPD_S(BaseDistribution): def __init__( self, alpha: float, - qv_low: float | Sequence, - qv_median: float | Sequence, - qv_high: float | Sequence, + qv_low: Union[float, Sequence], + qv_median: Union[float, Sequence], + qv_high: Union[float, Sequence], lower: float, version: str | None = "normal", index=None, @@ -308,6 +307,8 @@ def __init__( def mean(self, lower: float = None, upper: float = None): """Return expected value of the distribution. + Please set the upper and lower limits of the random variable correctly. + Returns ------- pd.DataFrame with same rows, columns as `self` @@ -325,6 +326,8 @@ def mean(self, lower: float = None, upper: float = None): def var(self, lower: float = None, upper: float = None): """Return element/entry-wise variance of the distribution. + Please set the upper and lower limits of the random variable correctly. + Returns ------- pd.DataFrame with same rows, columns as `self` @@ -440,9 +443,9 @@ class QPD_B(BaseDistribution): def __init__( self, alpha: float, - qv_low: float | Sequence, - qv_median: float | Sequence, - qv_high: float | Sequence, + qv_low: Union[float, Sequence], + qv_median: Union[float, Sequence], + qv_high: Union[float, Sequence], lower: float, upper: float, version: str | None = "normal", @@ -513,6 +516,8 @@ def __init__( def mean(self, lower: float = None, upper: float = None): """Return expected value of the distribution. + Please set the upper and lower limits of the random variable correctly. + Returns ------- pd.DataFrame with same rows, columns as `self` @@ -530,6 +535,8 @@ def mean(self, lower: float = None, upper: float = None): def var(self, lower: float = None, upper: float = None): """Return element/entry-wise variance of the distribution. + Please set the upper and lower limits of the random variable correctly. + Returns ------- pd.DataFrame with same rows, columns as `self` @@ -538,7 +545,7 @@ def var(self, lower: float = None, upper: float = None): if not lower: lower = self.lower if not upper: - upper = 1e3 + upper = self.upper mean = self.mean(lower, upper).values x = np.linspace(lower, upper, num=int(1e3)) cdf_arr = self.qpd.cdf(x).T @@ -643,9 +650,9 @@ class QPD_U(BaseDistribution): def __init__( self, alpha: float, - qv_low: float | Sequence, - qv_median: float | Sequence, - qv_high: float | Sequence, + qv_low: Union[float, Sequence], + qv_median: Union[float, Sequence], + qv_high: Union[float, Sequence], version: str | None = "normal", dist_shape: Optional[float] = 0.0, index=None, @@ -717,6 +724,8 @@ def __init__( def mean(self, lower: float = -1e3, upper: float = 1e3): """Return expected value of the distribution. + Please set the upper and lower limits of the random variable correctly. + Returns ------- pd.DataFrame with same rows, columns as `self` @@ -734,6 +743,8 @@ def mean(self, lower: float = -1e3, upper: float = 1e3): def var(self, lower: float = -1e3, upper: float = 1e3): """Return element/entry-wise variance of the distribution. + Please set the upper and lower limits of the random variable correctly. + Returns ------- pd.DataFrame with same rows, columns as `self` diff --git a/skpro/regression/cyclic_boosting.py b/skpro/regression/cyclic_boosting.py index 42b6a33e5..f67e5cbee 100644 --- a/skpro/regression/cyclic_boosting.py +++ b/skpro/regression/cyclic_boosting.py @@ -7,18 +7,24 @@ https://cyclic-boosting.readthedocs.io/en/latest/ """ +from __future__ import annotations + # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) __author__ = [ "setoguchi-naoki" ] # interface only. Cyclic boosting authors in cyclic_boosting package +import typing import warnings +if typing.TYPE_CHECKING: + from typing import Optional + import numpy as np import pandas as pd -from skpro.distributions.qpd import QPD_B, QPD_S, QPD_U +from skpro.distributions.qpd import QPD_Johnson from skpro.regression.base import BaseProbaRegressor @@ -48,14 +54,18 @@ class CyclicBoosting(BaseProbaRegressor): lower quantile for QPD's parameter alpha mode : str, default='multiplicative' the type of quantile regressor. 'multiplicative' or 'additive' - bound : str, default='U' - Different modes defined by supported target range, options are ``S`` - (semi-bound), ``B`` (bound), and ``U`` (unbound). - lower : float, default=0.0 + lower : float, default=None lower bound of supported range (only active for bound and semi-bound - modes) - upper : float, default=1.0 - upper bound of supported range (only active for bound mode) + modes). If neither 'lower' nor 'upper' is specified, `QPD_U` will be used as + unbound-mode + upper : float, default=None + upper bound of supported range (only active for bound mode). If neither + 'lower' nor 'upper' is specified, `QPD_U` will be used as unbound-mode + version: str, one of ``'normal'`` (default), ``'logistic'`` + options are ``'normal'`` (default) or ``'logistic'`` + dist_shape: float, optional, default=0.0 + parameter modifying the logistic base distribution via + sinh/arcsinh-scaling (only active in sinhlogistic version) maximal_iterations : int, default=10 number of iterations @@ -106,9 +116,10 @@ def __init__( feature_properties=None, alpha=0.2, mode="multiplicative", - bound="S", - lower=0.0, - upper=1.0, + lower=None, + upper=None, + version: Optional[str] = "normal", + dist_shape: Optional[float] = 0.0, maximal_iterations=10, ): self.feature_groups = feature_groups @@ -119,9 +130,10 @@ def __init__( self.quantile_est = list() self.qpd = None self.mode = mode - self.bound = bound self.lower = lower self.upper = upper + self.version = version + self.dist_shape = dist_shape self.maximal_iterations = maximal_iterations super().__init__() @@ -280,20 +292,14 @@ def _predict_proba(self, X): "qv_low": self.quantile_values[0], "qv_median": self.quantile_values[1], "qv_high": self.quantile_values[2], + "lower": self.lower, + "upper": self.upper, + "version": self.version, + "dist_shape": self.dist_shape, "index": index, "columns": y_cols, } - if self.bound == "S": - params["lower"] = self.lower - qpd = QPD_S(**params) - elif self.bound == "B": - params["lower"] = self.lower - params["upper"] = self.upper - qpd = QPD_B(**params) - elif self.bound == "U": - qpd = QPD_U(**params) - else: - raise ValueError("bound need to be 'S' or 'B' or 'U") + qpd = QPD_Johnson(**params) return qpd @@ -455,14 +461,12 @@ def get_test_params(cls, parameter_set="default"): param1 = { "alpha": 0.2, "mode": "additive", - "bound": "S", "lower": 0.0, "maximal_iterations": 5, } param2 = { "alpha": 0.2, "mode": "additive", - "bound": "B", "lower": 0.0, "upper": 1000, "maximal_iterations": 5, diff --git a/skpro/regression/tests/test_cyclic_boosting.py b/skpro/regression/tests/test_cyclic_boosting.py index 15e93b616..3731850c3 100644 --- a/skpro/regression/tests/test_cyclic_boosting.py +++ b/skpro/regression/tests/test_cyclic_boosting.py @@ -72,7 +72,6 @@ def test_cyclic_boosting_with_manual_paramaters(): maximal_iterations=5, alpha=0.25, mode="additive", - bound="S", lower=0.0, ) reg_proba.fit(X_train, y_train) From 2c8aa4a8ff2aff1911e4d70d2ea1a6f7de63a98d Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Mon, 29 Apr 2024 21:49:58 +0900 Subject: [PATCH 37/52] documentation --- skpro/distributions/__init__.py | 3 ++- skpro/regression/cyclic_boosting.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/skpro/distributions/__init__.py b/skpro/distributions/__init__.py index 01a829c82..79f663a1c 100644 --- a/skpro/distributions/__init__.py +++ b/skpro/distributions/__init__.py @@ -15,6 +15,7 @@ "QPD_S", "QPD_B", "QPD_U", + "QPD_Johnson", "TDistribution", "Weibull", ] @@ -26,7 +27,7 @@ from skpro.distributions.mixture import Mixture from skpro.distributions.normal import Normal from skpro.distributions.poisson import Poisson -from skpro.distributions.qpd import QPD_B, QPD_S, QPD_U +from skpro.distributions.qpd import QPD_B, QPD_S, QPD_U, QPD_Johnson from skpro.distributions.qpd_empirical import QPD_Empirical from skpro.distributions.t import TDistribution from skpro.distributions.weibull import Weibull diff --git a/skpro/regression/cyclic_boosting.py b/skpro/regression/cyclic_boosting.py index f67e5cbee..6f152b3a1 100644 --- a/skpro/regression/cyclic_boosting.py +++ b/skpro/regression/cyclic_boosting.py @@ -83,7 +83,7 @@ class CyclicBoosting(BaseProbaRegressor): Johnson Quantile-Parameterized Distributions instance Example - -------- + ------- >>> from skpro.regression.cyclic_boosting import CyclicBoosting >>> from sklearn.datasets import load_diabetes # doctest: +SKIP >>> from sklearn.model_selection import train_test_split # doctest: +SKIP From 0e1f65fceb47e7082f8cdcdd1f6768a5512894c0 Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Mon, 29 Apr 2024 22:32:21 +0900 Subject: [PATCH 38/52] remove log_pdf --- skpro/distributions/qpd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index fd598ba30..380b0d93e 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -87,7 +87,7 @@ class QPD_Johnson(_DelegatedDistribution): # estimator tags # -------------- "capabilities:approx": ["pdfnorm", "energy"], - "capabilities:exact": ["mean", "var", "cdf", "ppf", "pdf", "log_pdf"], + "capabilities:exact": ["mean", "var", "cdf", "ppf", "pdf"], "distr:measuretype": "continuous", } From e96c74380b4deaf0382d3cf1e958abcb924ba17f Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Mon, 29 Apr 2024 23:47:40 +0900 Subject: [PATCH 39/52] formatting --- skpro/distributions/qpd.py | 43 ++++++++++++++------------- skpro/distributions/tests/test_qpd.py | 1 - skpro/regression/cyclic_boosting.py | 11 ++----- 3 files changed, 25 insertions(+), 30 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 380b0d93e..c0be8ea74 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -13,7 +13,7 @@ import warnings if typing.TYPE_CHECKING: - from typing import Sequence, Optional, Union + from typing import Sequence, Union from cyclic_boosting.quantile_matching import J_QPD_S, J_QPD_B from pandas import DataFrame, Index @@ -94,13 +94,13 @@ class QPD_Johnson(_DelegatedDistribution): def __init__( self, alpha: float, - qv_low: Union[float, Sequence], - qv_median: Union[float, Sequence], - qv_high: Union[float, Sequence], - lower: Optional[float] = None, - upper: Optional[float] = None, - version: Optional[str] = "normal", - dist_shape: Optional[float] = 0.0, + qv_low: float | Sequence, + qv_median: float | Sequence, + qv_high: float | Sequence, + lower: float | None = None, + upper: float | None = None, + version: str | None = "normal", + dist_shape: float | None = 0.0, index=None, columns=None, ): @@ -230,16 +230,16 @@ class QPD_S(BaseDistribution): # estimator tags # -------------- "capabilities:approx": ["pdfnorm", "energy"], - "capabilities:exact": ["mean", "var", "cdf", "ppf", "pdf", "log_pdf"], + "capabilities:exact": ["mean", "var", "cdf", "ppf", "pdf"], "distr:measuretype": "continuous", } def __init__( self, alpha: float, - qv_low: Union[float, Sequence], - qv_median: Union[float, Sequence], - qv_high: Union[float, Sequence], + qv_low: float | Sequence, + qv_median: float | Sequence, + qv_high: float | Sequence, lower: float, version: str | None = "normal", index=None, @@ -438,16 +438,16 @@ class QPD_B(BaseDistribution): # estimator tags # -------------- "capabilities:approx": ["pdfnorm", "energy"], - "capabilities:exact": ["mean", "var", "cdf", "ppf", "pdf", "log_pdf"], + "capabilities:exact": ["mean", "var", "cdf", "ppf", "pdf"], "distr:measuretype": "continuous", } def __init__( self, alpha: float, - qv_low: Union[float, Sequence], - qv_median: Union[float, Sequence], - qv_high: Union[float, Sequence], + qv_low: float | Sequence, + qv_median: float | Sequence, + qv_high: float | Sequence, lower: float, upper: float, version: str | None = "normal", @@ -600,6 +600,7 @@ def get_test_params(cls, parameter_set="default"): class QPD_U(BaseDistribution): """Johnson Quantile-Parameterized Distributions with bounded mode. + see https://repositories.lib.utexas.edu/bitstream/handle/2152 /63037/HADLOCK-DISSERTATION-2017.pdf (Due to the Python keyword, the parameter lambda from @@ -645,18 +646,18 @@ class QPD_U(BaseDistribution): # estimator tags # -------------- "capabilities:approx": ["pdfnorm", "energy"], - "capabilities:exact": ["mean", "var", "cdf", "ppf", "pdf", "log_pdf"], + "capabilities:exact": ["mean", "var", "cdf", "ppf", "pdf"], "distr:measuretype": "continuous", } def __init__( self, alpha: float, - qv_low: Union[float, Sequence], - qv_median: Union[float, Sequence], - qv_high: Union[float, Sequence], + qv_low: float | Sequence, + qv_median: float | Sequence, + qv_high: float | Sequence, version: str | None = "normal", - dist_shape: Optional[float] = 0.0, + dist_shape: float | None = 0.0, index=None, columns=None, ): diff --git a/skpro/distributions/tests/test_qpd.py b/skpro/distributions/tests/test_qpd.py index 88719fec1..78c418c1a 100644 --- a/skpro/distributions/tests/test_qpd.py +++ b/skpro/distributions/tests/test_qpd.py @@ -3,7 +3,6 @@ import pytest from skpro.distributions.qpd import QPD_B, QPD_S, QPD_U - from skpro.tests.test_switch import run_test_for_class diff --git a/skpro/regression/cyclic_boosting.py b/skpro/regression/cyclic_boosting.py index 6f152b3a1..873a7e08d 100644 --- a/skpro/regression/cyclic_boosting.py +++ b/skpro/regression/cyclic_boosting.py @@ -7,19 +7,14 @@ https://cyclic-boosting.readthedocs.io/en/latest/ """ -from __future__ import annotations - # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) __author__ = [ "setoguchi-naoki" ] # interface only. Cyclic boosting authors in cyclic_boosting package -import typing import warnings - -if typing.TYPE_CHECKING: - from typing import Optional +from typing import Union import numpy as np import pandas as pd @@ -118,8 +113,8 @@ def __init__( mode="multiplicative", lower=None, upper=None, - version: Optional[str] = "normal", - dist_shape: Optional[float] = 0.0, + version: Union[str, None] = "normal", + dist_shape: Union[float, None] = 0.0, maximal_iterations=10, ): self.feature_groups = feature_groups From bb328a91395b68269273f950beda45a73bfed3f8 Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Tue, 30 Apr 2024 10:37:44 +0900 Subject: [PATCH 40/52] formatting --- skpro/distributions/qpd.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index c0be8ea74..5cdcaa55c 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -13,7 +13,7 @@ import warnings if typing.TYPE_CHECKING: - from typing import Sequence, Union + from typing import Sequence from cyclic_boosting.quantile_matching import J_QPD_S, J_QPD_B from pandas import DataFrame, Index @@ -306,7 +306,7 @@ def __init__( version=version, ) - def mean(self, lower: float = None, upper: float = None): + def _mean(self, lower: float = None, upper: float = None): """Return expected value of the distribution. Please set the upper and lower limits of the random variable correctly. @@ -325,7 +325,7 @@ def mean(self, lower: float = None, upper: float = None): loc = exp_func(x, cdf_arr, self.index.shape[0]) return pd.DataFrame(loc, index=self.index, columns=self.columns) - def var(self, lower: float = None, upper: float = None): + def _var(self, lower: float = None, upper: float = None): """Return element/entry-wise variance of the distribution. Please set the upper and lower limits of the random variable correctly. @@ -345,7 +345,7 @@ def var(self, lower: float = None, upper: float = None): var = var_func(x, mean, cdf_arr, self.index.shape[0]) return pd.DataFrame(var, index=self.index, columns=self.columns) - def pdf(self, x: pd.DataFrame): + def _pdf(self, x: pd.DataFrame): """Probability density function. this fucntion transform cdf to pdf @@ -353,11 +353,11 @@ def pdf(self, x: pd.DataFrame): """ return pdf_func(x, self.qpd, self.index) - def ppf(self, p: pd.DataFrame): + def _ppf(self, p: pd.DataFrame): """Quantile function = percent point function = inverse cdf.""" return ppf_func(p, self.qpd, self.index) - def cdf(self, x: pd.DataFrame): + def _cdf(self, x: pd.DataFrame): """Cumulative distribution function.""" return cdf_func(x, self.qpd, self.index) @@ -515,7 +515,7 @@ def __init__( version=version, ) - def mean(self, lower: float = None, upper: float = None): + def _mean(self, lower: float = None, upper: float = None): """Return expected value of the distribution. Please set the upper and lower limits of the random variable correctly. @@ -534,7 +534,7 @@ def mean(self, lower: float = None, upper: float = None): loc = exp_func(x, cdf_arr, self.index.shape[0]) return pd.DataFrame(loc, index=self.index, columns=self.columns) - def var(self, lower: float = None, upper: float = None): + def _var(self, lower: float = None, upper: float = None): """Return element/entry-wise variance of the distribution. Please set the upper and lower limits of the random variable correctly. @@ -554,7 +554,7 @@ def var(self, lower: float = None, upper: float = None): var = var_func(x, mean, cdf_arr, self.index.shape[0]) return pd.DataFrame(var, index=self.index, columns=self.columns) - def pdf(self, x: pd.DataFrame): + def _pdf(self, x: pd.DataFrame): """Probability density function. this fucntion transform cdf to pdf @@ -562,11 +562,11 @@ def pdf(self, x: pd.DataFrame): """ return pdf_func(x, self.qpd, self.index) - def ppf(self, p: pd.DataFrame): + def _ppf(self, p: pd.DataFrame): """Quantile function = percent point function = inverse cdf.""" return ppf_func(p, self.qpd, self.index) - def cdf(self, x: pd.DataFrame): + def _cdf(self, x: pd.DataFrame): """Cumulative distribution function.""" return cdf_func(x, self.qpd, self.index) @@ -724,7 +724,7 @@ def __init__( self.qpd.append(jqpd) self.qpd = pd.DataFrame(self.qpd, index=self.index) - def mean(self, lower: float = -1e3, upper: float = 1e3): + def _mean(self, lower: float = -1e3, upper: float = 1e3): """Return expected value of the distribution. Please set the upper and lower limits of the random variable correctly. @@ -743,7 +743,7 @@ def mean(self, lower: float = -1e3, upper: float = 1e3): loc = exp_func(x, cdf_arr, self.index.shape[0]) return pd.DataFrame(loc, index=self.index, columns=self.columns) - def var(self, lower: float = -1e3, upper: float = 1e3): + def _var(self, lower: float = -1e3, upper: float = 1e3): """Return element/entry-wise variance of the distribution. Please set the upper and lower limits of the random variable correctly. @@ -763,7 +763,7 @@ def var(self, lower: float = -1e3, upper: float = 1e3): var_arr = var_func(x, mean_arr, cdf_arr, self.index.shape[0]) return pd.DataFrame(var_arr, index=self.index, columns=self.columns) - def pdf(self, x: pd.DataFrame): + def _pdf(self, x: pd.DataFrame): """Probability density function. this fucntion transform cdf to pdf @@ -771,11 +771,11 @@ def pdf(self, x: pd.DataFrame): """ return pdf_func(x, self.qpd, self.index) - def ppf(self, p: pd.DataFrame): + def _ppf(self, p: pd.DataFrame): """Quantile function = percent point function = inverse cdf.""" return ppf_func(p, self.qpd, self.index) - def cdf(self, x: pd.DataFrame): + def _cdf(self, x: pd.DataFrame): """Cumulative distribution function.""" return cdf_func(x, self.qpd, self.index) From 65272bcfebb362244e9d677170153d85dbf89649 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Tue, 30 Apr 2024 13:04:53 +0100 Subject: [PATCH 41/52] Update qpd.py --- skpro/distributions/qpd.py | 1 + 1 file changed, 1 insertion(+) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 5cdcaa55c..ac889c85a 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -89,6 +89,7 @@ class QPD_Johnson(_DelegatedDistribution): "capabilities:approx": ["pdfnorm", "energy"], "capabilities:exact": ["mean", "var", "cdf", "ppf", "pdf"], "distr:measuretype": "continuous", + "broadcast_init": "on", } def __init__( From ab2224b95881e7f1266c03d0de616804dc72090f Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Wed, 1 May 2024 09:47:44 +0900 Subject: [PATCH 42/52] new API for distributions --- skpro/distributions/qpd.py | 279 +++++++++++++++++++------------------ 1 file changed, 145 insertions(+), 134 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 5cdcaa55c..b684d2cd2 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -16,7 +16,6 @@ from typing import Sequence from cyclic_boosting.quantile_matching import J_QPD_S, J_QPD_B - from pandas import DataFrame, Index import numpy as np import pandas as pd @@ -53,10 +52,14 @@ class QPD_Johnson(_DelegatedDistribution): quantile function value of quantile 0.5 qv_high : float or array_like[float] quantile function value of quantile ``1 - alpha`` - lower : float, default = None (no lower bound) - lower bound of semi-bounded range or bounded range - upper : float, default = None (no upper bound) - upper bound of bounded range + lower : float + lower bound of bounded range for QPD. + This is used when estimating QPD and calculating + expectation and variance + upper : float, default = None + upper bound of bounded range for QPD. + This is used when estimating QPD and calculating + expectation and variance version: str, one of ``'normal'`` (default), ``'logistic'`` options are ``'normal'`` (default) or ``'logistic'`` dist_shape: float, optional, default=0.0 @@ -202,7 +205,12 @@ class QPD_S(BaseDistribution): qv_high : float or array_like[float] quantile function value of quantile ``1 - alpha`` lower : float - lower bound of semi-bounded range + lower bound of semi-bounded range. + This is used when estimating QPD and calculating + expectation and variance + upper : float, default = None + upper bound of probability density function to + calculate expected value and variance version: str options are ``normal`` (default) or ``logistic`` @@ -241,6 +249,7 @@ def __init__( qv_median: float | Sequence, qv_high: float | Sequence, lower: float, + upper: float = None, version: str | None = "normal", index=None, columns=None, @@ -251,12 +260,11 @@ def __init__( self.qv_median = qv_median self.qv_high = qv_high self.lower = lower + self.upper = upper if upper else 1e6 self.version = version self.index = index self.columns = columns - super().__init__(index=index, columns=columns) - from cyclic_boosting.quantile_matching import J_QPD_S alpha, qv_low, qv_median, qv_high = _prep_qpd_params( @@ -305,8 +313,9 @@ def __init__( l=self.lower, version=version, ) + super().__init__(index=index, columns=columns) - def _mean(self, lower: float = None, upper: float = None): + def _mean(self): """Return expected value of the distribution. Please set the upper and lower limits of the random variable correctly. @@ -316,16 +325,16 @@ def _mean(self, lower: float = None, upper: float = None): pd.DataFrame with same rows, columns as `self` expected value of distribution (entry-wise) """ - if not lower: - lower = self.lower - if not upper: - upper = 1e3 - x = np.linspace(lower, upper, num=int(1e3)) + params = self.get_params(deep=False) + lower = params["lower"] + upper = params["upper"] + index = params["index"] + x = np.linspace(lower, upper, num=int(1e6)) cdf_arr = self.qpd.cdf(x).T - loc = exp_func(x, cdf_arr, self.index.shape[0]) - return pd.DataFrame(loc, index=self.index, columns=self.columns) + loc = exp_func(x, cdf_arr, index.shape[0]) + return loc - def _var(self, lower: float = None, upper: float = None): + def _var(self): """Return element/entry-wise variance of the distribution. Please set the upper and lower limits of the random variable correctly. @@ -335,31 +344,31 @@ def _var(self, lower: float = None, upper: float = None): pd.DataFrame with same rows, columns as `self` variance of distribution (entry-wise) """ - if not lower: - lower = self.lower - if not upper: - upper = 1e3 - mean = self.mean(lower, upper).values - x = np.linspace(lower, upper, num=int(1e3)) + params = self.get_params(deep=False) + lower = params["lower"] + upper = params["upper"] + index = params["index"] + mean = self.mean().values + x = np.linspace(lower, upper, num=int(1e6)) cdf_arr = self.qpd.cdf(x).T - var = var_func(x, mean, cdf_arr, self.index.shape[0]) - return pd.DataFrame(var, index=self.index, columns=self.columns) + var = var_func(x, mean, cdf_arr, index.shape[0]) + return var - def _pdf(self, x: pd.DataFrame): + def _pdf(self, x: np.ndarray): """Probability density function. this fucntion transform cdf to pdf because j-qpd's pdf calculation is bit complex """ - return pdf_func(x, self.qpd, self.index) + return pdf_func(x, self.qpd) - def _ppf(self, p: pd.DataFrame): + def _ppf(self, p: np.ndarray): """Quantile function = percent point function = inverse cdf.""" - return ppf_func(p, self.qpd, self.index) + return ppf_func(p, self.qpd) - def _cdf(self, x: pd.DataFrame): + def _cdf(self, x: np.ndarray): """Cumulative distribution function.""" - return cdf_func(x, self.qpd, self.index) + return cdf_func(x, self.qpd) @classmethod def get_test_params(cls, parameter_set="default"): @@ -407,9 +416,13 @@ class QPD_B(BaseDistribution): qv_high : float or array_like[float] quantile function value of quantile ``1 - alpha`` lower : float - lower bound of semi-bounded range - upper : float - upper bound of supported range + lower bound of semi-bounded range. + This is used when estimating QPD and calculating + expectation and variance + upper : float, default = None + upper bound of semi-bounded range. + This is used when estimating QPD and calculating + expectation and variance version: str, optional, default="normal" options are ``normal`` (default) or ``logistic`` @@ -454,7 +467,7 @@ def __init__( index=None, columns=None, ): - self.qpd = [] + # self.qpd = [] self.alpha = alpha self.qv_low = qv_low self.qv_median = qv_median @@ -465,8 +478,6 @@ def __init__( self.index = index self.columns = columns - super().__init__(index=index, columns=columns) - from cyclic_boosting.quantile_matching import J_QPD_B alpha, qv_low, qv_median, qv_high = _prep_qpd_params( @@ -515,7 +526,9 @@ def __init__( version=version, ) - def _mean(self, lower: float = None, upper: float = None): + super().__init__(index=index, columns=columns) + + def _mean(self): """Return expected value of the distribution. Please set the upper and lower limits of the random variable correctly. @@ -525,16 +538,16 @@ def _mean(self, lower: float = None, upper: float = None): pd.DataFrame with same rows, columns as `self` expected value of distribution (entry-wise) """ - if not lower: - lower = self.lower - if not upper: - upper = self.upper - x = np.linspace(lower, upper, num=int(1e3)) + params = self.get_params(deep=False) + lower = params["lower"] + upper = params["upper"] + index = params["index"] + x = np.linspace(lower, upper, num=int(1e6)) cdf_arr = self.qpd.cdf(x).T - loc = exp_func(x, cdf_arr, self.index.shape[0]) - return pd.DataFrame(loc, index=self.index, columns=self.columns) + loc = exp_func(x, cdf_arr, index.shape[0]) + return loc - def _var(self, lower: float = None, upper: float = None): + def _var(self): """Return element/entry-wise variance of the distribution. Please set the upper and lower limits of the random variable correctly. @@ -544,31 +557,31 @@ def _var(self, lower: float = None, upper: float = None): pd.DataFrame with same rows, columns as `self` variance of distribution (entry-wise) """ - if not lower: - lower = self.lower - if not upper: - upper = self.upper - mean = self.mean(lower, upper).values - x = np.linspace(lower, upper, num=int(1e3)) + params = self.get_params(deep=False) + lower = params["lower"] + upper = params["upper"] + index = params["index"] + mean = self.mean().values + x = np.linspace(lower, upper, num=int(1e6)) cdf_arr = self.qpd.cdf(x).T - var = var_func(x, mean, cdf_arr, self.index.shape[0]) - return pd.DataFrame(var, index=self.index, columns=self.columns) + var = var_func(x, mean, cdf_arr, index.shape[0]) + return var - def _pdf(self, x: pd.DataFrame): + def _pdf(self, x: np.ndarray): """Probability density function. this fucntion transform cdf to pdf because j-qpd's pdf calculation is bit complex """ - return pdf_func(x, self.qpd, self.index) + return pdf_func(x, self.qpd) - def _ppf(self, p: pd.DataFrame): + def _ppf(self, p: np.ndarray): """Quantile function = percent point function = inverse cdf.""" - return ppf_func(p, self.qpd, self.index) + return ppf_func(p, self.qpd) - def _cdf(self, x: pd.DataFrame): + def _cdf(self, x: np.ndarray): """Cumulative distribution function.""" - return cdf_func(x, self.qpd, self.index) + return cdf_func(x, self.qpd) @classmethod def get_test_params(cls, parameter_set="default"): @@ -617,6 +630,13 @@ class QPD_U(BaseDistribution): quantile function value of quantile 0.5 qv_high : float or array_like[float] quantile function value of quantile ``1 - alpha`` + lower : float + lower bound of probability density function to + calculate expected value and variance + expectation and variance + upper : float, default = None + upper bound of probability density function to + calculate expected value and variance version: str, optional, default="normal" options are ``normal`` (default) or ``logistic`` dist_shape: float, optional, default=0.0 @@ -656,6 +676,8 @@ def __init__( qv_low: float | Sequence, qv_median: float | Sequence, qv_high: float | Sequence, + lower: float = None, + upper: float = None, version: str | None = "normal", dist_shape: float | None = 0.0, index=None, @@ -666,13 +688,13 @@ def __init__( self.qv_low = qv_low self.qv_median = qv_median self.qv_high = qv_high + self.lower = lower if lower else -1e6 + self.upper = upper if upper else 1e6 self.version = version self.dist_shape = dist_shape self.index = index self.columns = columns - super().__init__(index=index, columns=columns) - from cyclic_boosting.quantile_matching import J_QPD_extended_U alpha, qv_low, qv_median, qv_high = _prep_qpd_params( @@ -724,7 +746,9 @@ def __init__( self.qpd.append(jqpd) self.qpd = pd.DataFrame(self.qpd, index=self.index) - def _mean(self, lower: float = -1e3, upper: float = 1e3): + super().__init__(index=index, columns=columns) + + def _mean(self): """Return expected value of the distribution. Please set the upper and lower limits of the random variable correctly. @@ -734,16 +758,21 @@ def _mean(self, lower: float = -1e3, upper: float = 1e3): pd.DataFrame with same rows, columns as `self` expected value of distribution (entry-wise) """ + params = self.get_params(deep=False) + lower = params["lower"] + upper = params["upper"] + index = params["index"] + columns = params["columns"] cdf_arr = [] - x = np.linspace(lower, upper, num=int(1e3)) + x = np.linspace(lower, upper, num=int(1e6)) for idx in self.index: qpd = self.qpd.loc[idx, :].values[0] cdf_arr.append(qpd.cdf(x)) cdf_arr = np.asarray(cdf_arr) - loc = exp_func(x, cdf_arr, self.index.shape[0]) - return pd.DataFrame(loc, index=self.index, columns=self.columns) + loc = exp_func(x, cdf_arr, index.shape[0]) + return pd.DataFrame(loc, index=index, columns=columns) - def _var(self, lower: float = -1e3, upper: float = 1e3): + def _var(self): """Return element/entry-wise variance of the distribution. Please set the upper and lower limits of the random variable correctly. @@ -753,31 +782,35 @@ def _var(self, lower: float = -1e3, upper: float = 1e3): pd.DataFrame with same rows, columns as `self` variance of distribution (entry-wise) """ - mean_arr = self.mean(lower, upper).values - cdf_arr = [] - x = np.linspace(lower, upper, num=int(1e3)) + params = self.get_params(deep=False) + lower = params["lower"] + upper = params["upper"] + index = params["index"] + mean = self.mean().values + cdf_list = [] + x = np.linspace(lower, upper, num=int(1e6)) for idx in self.index: qpd = self.qpd.loc[idx, :].values[0] - cdf_arr.append(qpd.cdf(x)) - cdf_arr = np.asarray(cdf_arr) - var_arr = var_func(x, mean_arr, cdf_arr, self.index.shape[0]) - return pd.DataFrame(var_arr, index=self.index, columns=self.columns) + cdf_list.append(qpd.cdf(x)) + cdf = np.asarray(cdf_list) + var = var_func(x, mean, cdf, index.shape[0]) + return var - def _pdf(self, x: pd.DataFrame): + def _pdf(self, x: np.ndarray): """Probability density function. this fucntion transform cdf to pdf because j-qpd's pdf calculation is bit complex """ - return pdf_func(x, self.qpd, self.index) + return pdf_func(x, self.qpd) - def _ppf(self, p: pd.DataFrame): + def _ppf(self, p: np.ndarray): """Quantile function = percent point function = inverse cdf.""" - return ppf_func(p, self.qpd, self.index) + return ppf_func(p, self.qpd) - def _cdf(self, x: pd.DataFrame): + def _cdf(self, x: np.ndarray): """Cumulative distribution function.""" - return cdf_func(x, self.qpd, self.index) + return cdf_func(x, self.qpd) @classmethod def get_test_params(cls, parameter_set="default"): @@ -817,86 +850,64 @@ def calc_pdf(x: np.ndarray, cdf: np.ndarray) -> np.ndarray: def exp_func(x: np.ndarray, cdf: np.ndarray, size: int): """Return Expectation.""" - pdf_arr = calc_pdf(x, cdf) + pdf = calc_pdf(x, cdf) x = np.tile(x, (size, 1)) - loc = np.trapz(x * pdf_arr, x, dx=1e-6, axis=1) + loc = np.trapz(x * pdf, x, dx=1e-6, axis=1) return loc def var_func(x: np.ndarray, mu: np.ndarray, cdf: np.ndarray, size: int): """Return Variance.""" - pdf_arr = calc_pdf(x, cdf) + pdf = calc_pdf(x, cdf) x = np.tile(x, (size, 1)) - var = np.trapz(((x - mu) ** 2) * pdf_arr, x, dx=1e-6, axis=1) + var = np.trapz(((x - mu) ** 2) * pdf, x, dx=1e-6, axis=1) return var -def pdf_func(x: DataFrame, dist: J_QPD_S | J_QPD_B | pd.DataFrame, index: Index): +def pdf_func(x: np.ndarray, dist: J_QPD_S | J_QPD_B | pd.DataFrame): """Return pdf value.""" qpd = dist.values if isinstance(dist, pd.DataFrame) else dist - prob_var = np.unique(x.values) - pdf = np.zeros((x.index.shape[0], len(x.columns))) + prob_var = np.unique(x) for v in prob_var: - # all qpds x0 = np.linspace(v, v + 1e-3, num=3) if isinstance(dist, pd.DataFrame): - cdf_arr = np.asarray([func[0].cdf(x0) for func in qpd]) + cdf = np.asarray([func[0].cdf(x0) for func in qpd]) else: - cdf_arr = qpd.cdf(x0).T - pdf_arr = calc_pdf(x0, cdf_arr)[:, 0] - if pdf_arr.ndim < 1: - pdf_arr = pdf_arr[np.newaxis] - # pick up - rows, cols = np.where(x.values == v) - for r, c in zip(rows, cols): - id = x.index[r] - target = index.get_loc(id) - pdf[r][c] = pdf_arr[target] - return pd.DataFrame(pdf, index=x.index, columns=x.columns) - - -def ppf_func(x: DataFrame, dist: J_QPD_S | J_QPD_B | pd.DataFrame, index: Index): + cdf = qpd.cdf(x0).T + pdf = calc_pdf(x0, cdf)[:, 0] + if pdf.ndim < 1: + pdf = pdf[np.newaxis] + return pdf + + +def ppf_func(x: np.ndarray, dist: J_QPD_S | J_QPD_B | pd.DataFrame): """Return ppf value.""" qpd = dist.values if isinstance(dist, pd.DataFrame) else dist - quantiles = np.unique(x.values) - ppf = np.zeros((x.index.shape[0], len(x.columns))) + quantiles = np.unique(x) + ppf = np.zeros((x.shape[0], x.shape[1])) for q in quantiles: - # all qpds if isinstance(dist, pd.DataFrame): - ppf_arr = np.asarray([func[0].ppf(q) for func in qpd]) + ppf = np.asarray([func[0].ppf(q) for func in qpd]) else: - ppf_arr = qpd.ppf(q).T - if ppf_arr.ndim < 1: - ppf_arr = ppf_arr[np.newaxis] - # pick up - rows, cols = np.where(x.values == q) - for r, c in zip(rows, cols): - id = x.index[r] - target = index.get_loc(id) - ppf[r][c] = ppf_arr[target] - return pd.DataFrame(ppf, index=x.index, columns=x.columns) - - -def cdf_func(x: DataFrame, dist: J_QPD_S | J_QPD_B | pd.DataFrame, index: Index): + ppf = qpd.ppf(q).T + if ppf.ndim < 1: + ppf = ppf[np.newaxis] + return ppf + + +def cdf_func(x: np.ndarray, dist: J_QPD_S | J_QPD_B | pd.DataFrame): """Return cdf value.""" qpd = dist.values if isinstance(dist, pd.DataFrame) else dist - x_value = np.unique(x.values) - cdf = np.zeros((x.index.shape[0], len(x.columns))) + x_value = np.unique(x) + cdf = np.zeros((x.shape[0], x.shape[1])) for v in x_value: - # all qpds if isinstance(dist, pd.DataFrame): - cdf_arr = np.asarray([func[0].cdf(v) for func in qpd]) + cdf = np.asarray([func[0].cdf(v) for func in qpd]) else: - cdf_arr = qpd.cdf(v).T - if cdf_arr.ndim < 1: - cdf_arr = cdf_arr[np.newaxis] - # pick up - rows, cols = np.where(x.values == v) - for r, c in zip(rows, cols): - id = x.index[r] - target = index.get_loc(id) - cdf[r][c] = cdf_arr[target] - return pd.DataFrame(cdf, index=x.index, columns=x.columns) + cdf = qpd.cdf(v).T + if cdf.ndim < 1: + cdf = cdf[np.newaxis] + return cdf def _prep_qpd_params(self, alpha, qv_low, qv_median, qv_high): From 38c4ddd0ea1206bec6ade1fdca654a5b1b4ab76d Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Wed, 1 May 2024 10:30:35 +0900 Subject: [PATCH 43/52] remove unnecessary code --- skpro/distributions/qpd.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 89cc56786..f88f1f9c9 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -885,7 +885,6 @@ def ppf_func(x: np.ndarray, dist: J_QPD_S | J_QPD_B | pd.DataFrame): """Return ppf value.""" qpd = dist.values if isinstance(dist, pd.DataFrame) else dist quantiles = np.unique(x) - ppf = np.zeros((x.shape[0], x.shape[1])) for q in quantiles: if isinstance(dist, pd.DataFrame): ppf = np.asarray([func[0].ppf(q) for func in qpd]) @@ -900,7 +899,6 @@ def cdf_func(x: np.ndarray, dist: J_QPD_S | J_QPD_B | pd.DataFrame): """Return cdf value.""" qpd = dist.values if isinstance(dist, pd.DataFrame) else dist x_value = np.unique(x) - cdf = np.zeros((x.shape[0], x.shape[1])) for v in x_value: if isinstance(dist, pd.DataFrame): cdf = np.asarray([func[0].cdf(v) for func in qpd]) From 0c453016990fa0059fd670f26468e2ef25e5d16d Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Wed, 1 May 2024 10:53:33 +0900 Subject: [PATCH 44/52] mod tags --- skpro/distributions/qpd.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index f88f1f9c9..8457593e9 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -86,13 +86,13 @@ class QPD_Johnson(_DelegatedDistribution): # -------------- "authors": ["setoguchi-naoki", "felix-wick", "fkiraly"], "maintainers": ["setoguchi-naoki"], - "python_dependencies": "cyclic_boosting>=1.2.5", + "python_dependencies": ["cyclic_boosting>=1.4.0", "findiff"], # estimator tags # -------------- "capabilities:approx": ["pdfnorm", "energy"], "capabilities:exact": ["mean", "var", "cdf", "ppf", "pdf"], "distr:measuretype": "continuous", - "broadcast_init": "on", + # "broadcast_init": "on", } def __init__( @@ -235,7 +235,7 @@ class QPD_S(BaseDistribution): # -------------- "authors": ["setoguchi-naoki", "felix-wick"], "maintainers": ["setoguchi-naoki"], - "python_dependencies": ["cyclic_boosting>=1.4.0"], + "python_dependencies": ["cyclic_boosting>=1.4.0", "findiff"], # estimator tags # -------------- "capabilities:approx": ["pdfnorm", "energy"], @@ -448,7 +448,7 @@ class QPD_B(BaseDistribution): # -------------- "authors": ["setoguchi-naoki", "felix-wick"], "maintainers": ["setoguchi-naoki"], - "python_dependencies": ["cyclic_boosting>=1.4.0"], + "python_dependencies": ["cyclic_boosting>=1.4.0", "findiff"], # estimator tags # -------------- "capabilities:approx": ["pdfnorm", "energy"], @@ -663,7 +663,7 @@ class QPD_U(BaseDistribution): # -------------- "authors": ["setoguchi-naoki", "felix-wick"], "maintainers": ["setoguchi-naoki"], - "python_dependencies": "cyclic_boosting>=1.2.5", + "python_dependencies": ["cyclic_boosting>=1.4.0", "findiff"], # estimator tags # -------------- "capabilities:approx": ["pdfnorm", "energy"], From cad61aea3f59ecb79010502ab01b7eb716581d17 Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Wed, 1 May 2024 10:54:31 +0900 Subject: [PATCH 45/52] may be included in future versions of cyclic-boosting --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 8dfeefd89..3db040805 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ all_extras = [ "tabulate", "uncertainties", "cyclic-boosting>=1.4.0; python_version < '3.12'", + "findiff", ] dev = [ From 9c859a5a1128797abc9a38922a004a0c029dc6fb Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Wed, 1 May 2024 14:57:58 +0900 Subject: [PATCH 46/52] fix bug --- skpro/distributions/qpd.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 8457593e9..3410addfe 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -250,7 +250,7 @@ def __init__( qv_median: float | Sequence, qv_high: float | Sequence, lower: float, - upper: float = None, + upper: float = 1e6, version: str | None = "normal", index=None, columns=None, @@ -261,7 +261,7 @@ def __init__( self.qv_median = qv_median self.qv_high = qv_high self.lower = lower - self.upper = upper if upper else 1e6 + self.upper = upper self.version = version self.index = index self.columns = columns @@ -631,7 +631,7 @@ class QPD_U(BaseDistribution): quantile function value of quantile 0.5 qv_high : float or array_like[float] quantile function value of quantile ``1 - alpha`` - lower : float + lower : float, default = None lower bound of probability density function to calculate expected value and variance expectation and variance @@ -677,8 +677,8 @@ def __init__( qv_low: float | Sequence, qv_median: float | Sequence, qv_high: float | Sequence, - lower: float = None, - upper: float = None, + lower: float = -1e6, + upper: float = 1e6, version: str | None = "normal", dist_shape: float | None = 0.0, index=None, @@ -689,8 +689,8 @@ def __init__( self.qv_low = qv_low self.qv_median = qv_median self.qv_high = qv_high - self.lower = lower if lower else -1e6 - self.upper = upper if upper else 1e6 + self.lower = lower + self.upper = upper self.version = version self.dist_shape = dist_shape self.index = index @@ -819,18 +819,18 @@ def get_test_params(cls, parameter_set="default"): params1 = { "alpha": 0.2, "version": "normal", - "qv_low": 0.2, - "qv_median": 0.5, - "qv_high": 0.8, + "qv_low": -0.3, + "qv_median": 0.0, + "qv_high": 0.3, "index": pd.RangeIndex(3), "columns": pd.Index(["a"]), } params2 = { "alpha": 0.2, "version": "normal", - "qv_low": [0.2, 0.2, 0.2], - "qv_median": [0.5, 0.5, 0.5], - "qv_high": [0.8, 0.8, 0.8], + "qv_low": [-0.3, -0.3, -0.3], + "qv_median": [0.0, 0.0, 0.0], + "qv_high": [0.3, 0.3, 0.3], "index": pd.RangeIndex(3), "columns": pd.Index(["a"]), } From 333b69a97efb194f4403702418c9868721cf849d Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Tue, 7 May 2024 18:39:38 +0900 Subject: [PATCH 47/52] resolve shape mismatch --- skpro/distributions/qpd.py | 177 ++++++++++++++++++------------------- 1 file changed, 88 insertions(+), 89 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 3410addfe..43b4415be 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -11,10 +11,9 @@ import typing import warnings +from typing import Sequence if typing.TYPE_CHECKING: - from typing import Sequence - from cyclic_boosting.quantile_matching import J_QPD_S, J_QPD_B import numpy as np @@ -92,7 +91,6 @@ class QPD_Johnson(_DelegatedDistribution): "capabilities:approx": ["pdfnorm", "energy"], "capabilities:exact": ["mean", "var", "cdf", "ppf", "pdf"], "distr:measuretype": "continuous", - # "broadcast_init": "on", } def __init__( @@ -250,12 +248,11 @@ def __init__( qv_median: float | Sequence, qv_high: float | Sequence, lower: float, - upper: float = 1e6, + upper: float = 1e3, version: str | None = "normal", index=None, columns=None, ): - self.qpd = [] self.alpha = alpha self.qv_low = qv_low self.qv_median = qv_median @@ -268,9 +265,7 @@ def __init__( from cyclic_boosting.quantile_matching import J_QPD_S - alpha, qv_low, qv_median, qv_high = _prep_qpd_params( - self, alpha, qv_low, qv_median, qv_high - ) + qv_low, qv_median, qv_high = _prep_qpd_params(qv_low, qv_median, qv_high) if index is None: index = pd.RangeIndex(qv_low.shape[0]) @@ -330,9 +325,11 @@ def _mean(self): lower = params["lower"] upper = params["upper"] index = params["index"] - x = np.linspace(lower, upper, num=int(1e6)) - cdf_arr = self.qpd.cdf(x).T - loc = exp_func(x, cdf_arr, index.shape[0]) + x = np.linspace(lower, upper, num=int(1e3)) + cdf = self.qpd.cdf(x) + if cdf.ndim < 2: + cdf = cdf[:, np.newaxis] + loc = exp_func(x, cdf.T, index.shape[0]) return loc def _var(self): @@ -350,9 +347,11 @@ def _var(self): upper = params["upper"] index = params["index"] mean = self.mean().values - x = np.linspace(lower, upper, num=int(1e6)) - cdf_arr = self.qpd.cdf(x).T - var = var_func(x, mean, cdf_arr, index.shape[0]) + x = np.linspace(lower, upper, num=int(1e3)) + cdf = self.qpd.cdf(x) + if cdf.ndim < 2: + cdf = cdf[:, np.newaxis] + var = var_func(x, mean, cdf.T, index.shape[0]) return var def _pdf(self, x: np.ndarray): @@ -381,7 +380,7 @@ def get_test_params(cls, parameter_set="default"): "qv_median": 0.0, "qv_high": 0.3, "lower": -0.5, - "index": pd.RangeIndex(3), + "index": pd.RangeIndex(1), "columns": pd.Index(["a"]), } params2 = { @@ -468,7 +467,6 @@ def __init__( index=None, columns=None, ): - # self.qpd = [] self.alpha = alpha self.qv_low = qv_low self.qv_median = qv_median @@ -481,9 +479,7 @@ def __init__( from cyclic_boosting.quantile_matching import J_QPD_B - alpha, qv_low, qv_median, qv_high = _prep_qpd_params( - self, alpha, qv_low, qv_median, qv_high - ) + qv_low, qv_median, qv_high = _prep_qpd_params(qv_low, qv_median, qv_high) if index is None: index = pd.RangeIndex(qv_low.shape[0]) @@ -543,9 +539,11 @@ def _mean(self): lower = params["lower"] upper = params["upper"] index = params["index"] - x = np.linspace(lower, upper, num=int(1e6)) - cdf_arr = self.qpd.cdf(x).T - loc = exp_func(x, cdf_arr, index.shape[0]) + x = np.linspace(lower, upper, num=int(1e3)) + cdf = self.qpd.cdf(x) + if cdf.ndim < 2: + cdf = cdf[:, np.newaxis] + loc = exp_func(x, cdf.T, index.shape[0]) return loc def _var(self): @@ -563,9 +561,11 @@ def _var(self): upper = params["upper"] index = params["index"] mean = self.mean().values - x = np.linspace(lower, upper, num=int(1e6)) - cdf_arr = self.qpd.cdf(x).T - var = var_func(x, mean, cdf_arr, index.shape[0]) + x = np.linspace(lower, upper, num=int(1e3)) + cdf = self.qpd.cdf(x) + if cdf.ndim < 2: + cdf = cdf[:, np.newaxis] + var = var_func(x, mean, cdf.T, index.shape[0]) return var def _pdf(self, x: np.ndarray): @@ -595,7 +595,7 @@ def get_test_params(cls, parameter_set="default"): "qv_high": 0.3, "lower": -0.5, "upper": 0.5, - "index": pd.RangeIndex(3), + "index": pd.RangeIndex(1), "columns": pd.Index(["a"]), } params2 = { @@ -677,8 +677,8 @@ def __init__( qv_low: float | Sequence, qv_median: float | Sequence, qv_high: float | Sequence, - lower: float = -1e6, - upper: float = 1e6, + lower: float = -1e3, + upper: float = 1e3, version: str | None = "normal", dist_shape: float | None = 0.0, index=None, @@ -698,9 +698,7 @@ def __init__( from cyclic_boosting.quantile_matching import J_QPD_extended_U - alpha, qv_low, qv_median, qv_high = _prep_qpd_params( - self, alpha, qv_low, qv_median, qv_high - ) + qv_low, qv_median, qv_high = _prep_qpd_params(qv_low, qv_median, qv_high) if index is None: index = pd.RangeIndex(qv_low.shape[0]) @@ -745,7 +743,6 @@ def __init__( shape=dist_shape, ) self.qpd.append(jqpd) - self.qpd = pd.DataFrame(self.qpd, index=self.index) super().__init__(index=index, columns=columns) @@ -763,15 +760,15 @@ def _mean(self): lower = params["lower"] upper = params["upper"] index = params["index"] - columns = params["columns"] cdf_arr = [] - x = np.linspace(lower, upper, num=int(1e6)) - for idx in self.index: - qpd = self.qpd.loc[idx, :].values[0] + x = np.linspace(lower, upper, num=int(1e3)) + for qpd in self.qpd: cdf_arr.append(qpd.cdf(x)) - cdf_arr = np.asarray(cdf_arr) - loc = exp_func(x, cdf_arr, index.shape[0]) - return pd.DataFrame(loc, index=index, columns=columns) + cdf = np.asarray(cdf_arr) + if cdf.ndim < 2: + cdf = cdf[:, np.newaxis] + loc = exp_func(x, cdf, index.shape[0]) + return loc def _var(self): """Return element/entry-wise variance of the distribution. @@ -789,11 +786,12 @@ def _var(self): index = params["index"] mean = self.mean().values cdf_list = [] - x = np.linspace(lower, upper, num=int(1e6)) - for idx in self.index: - qpd = self.qpd.loc[idx, :].values[0] + x = np.linspace(lower, upper, num=int(1e3)) + for qpd in self.qpd: cdf_list.append(qpd.cdf(x)) cdf = np.asarray(cdf_list) + if cdf.ndim < 2: + cdf = cdf[:, np.newaxis] var = var_func(x, mean, cdf, index.shape[0]) return var @@ -822,7 +820,7 @@ def get_test_params(cls, parameter_set="default"): "qv_low": -0.3, "qv_median": 0.0, "qv_high": 0.3, - "index": pd.RangeIndex(3), + "index": pd.RangeIndex(1), "columns": pd.Index(["a"]), } params2 = { @@ -837,21 +835,19 @@ def get_test_params(cls, parameter_set="default"): return [params1, params2] -def calc_pdf(x: np.ndarray, cdf: np.ndarray) -> np.ndarray: +def calc_pdf(cdf: np.ndarray) -> np.ndarray: """Return pdf value for all samples.""" from findiff import FinDiff - dx = x[1] - x[0] + dx = 1e-6 derivative = FinDiff(1, dx, 1) - if cdf.ndim < 2: - cdf = cdf[np.newaxis, :] pdf = np.asarray(derivative(cdf)) return pdf def exp_func(x: np.ndarray, cdf: np.ndarray, size: int): """Return Expectation.""" - pdf = calc_pdf(x, cdf) + pdf = calc_pdf(cdf) x = np.tile(x, (size, 1)) loc = np.trapz(x * pdf, x, dx=1e-6, axis=1) return loc @@ -859,64 +855,67 @@ def exp_func(x: np.ndarray, cdf: np.ndarray, size: int): def var_func(x: np.ndarray, mu: np.ndarray, cdf: np.ndarray, size: int): """Return Variance.""" - pdf = calc_pdf(x, cdf) + pdf = calc_pdf(cdf) x = np.tile(x, (size, 1)) var = np.trapz(((x - mu) ** 2) * pdf, x, dx=1e-6, axis=1) return var -def pdf_func(x: np.ndarray, dist: J_QPD_S | J_QPD_B | pd.DataFrame): +def pdf_func(x: np.ndarray, qpd: J_QPD_S | J_QPD_B | list): """Return pdf value.""" - qpd = dist.values if isinstance(dist, pd.DataFrame) else dist - prob_var = np.unique(x) - for v in prob_var: - x0 = np.linspace(v, v + 1e-3, num=3) - if isinstance(dist, pd.DataFrame): - cdf = np.asarray([func[0].cdf(x0) for func in qpd]) - else: - cdf = qpd.cdf(x0).T - pdf = calc_pdf(x0, cdf)[:, 0] - if pdf.ndim < 1: - pdf = pdf[np.newaxis] + pdf = np.zeros_like(x) + for r in range(x.shape[0]): + for c in range(x.shape[1]): + element = x[r][c] + x0 = np.linspace(element, element + 1e-3, num=3) + if isinstance(qpd, list): + cdf = np.asarray([func.cdf(x0) for func in qpd]) + cdf = cdf.reshape(cdf.shape[0], -1) + else: + cdf = qpd.cdf(x0) + if cdf.ndim < 2: + cdf = cdf[:, np.newaxis] + cdf = cdf.T + pdf_part = calc_pdf(cdf) + pdf[r][c] = pdf_part[0][0] return pdf -def ppf_func(x: np.ndarray, dist: J_QPD_S | J_QPD_B | pd.DataFrame): +def ppf_func(x: np.ndarray, qpd: J_QPD_S | J_QPD_B | list): """Return ppf value.""" - qpd = dist.values if isinstance(dist, pd.DataFrame) else dist - quantiles = np.unique(x) - for q in quantiles: - if isinstance(dist, pd.DataFrame): - ppf = np.asarray([func[0].ppf(q) for func in qpd]) - else: - ppf = qpd.ppf(q).T - if ppf.ndim < 1: + if isinstance(qpd, list): + ppf = np.asarray([func.ppf(x) for func in qpd]) + ppf = ppf.reshape(ppf.shape[0], -1) + else: + ppf = qpd.ppf(x) + if ppf.ndim < 2: ppf = ppf[np.newaxis] + ppf = ppf.T return ppf -def cdf_func(x: np.ndarray, dist: J_QPD_S | J_QPD_B | pd.DataFrame): +def cdf_func(x: np.ndarray, qpd: J_QPD_S | J_QPD_B | list): """Return cdf value.""" - qpd = dist.values if isinstance(dist, pd.DataFrame) else dist - x_value = np.unique(x) - for v in x_value: - if isinstance(dist, pd.DataFrame): - cdf = np.asarray([func[0].cdf(v) for func in qpd]) - else: - cdf = qpd.cdf(v).T - if cdf.ndim < 1: + if isinstance(qpd, list): + cdf = np.asarray([func.cdf(x) for func in qpd]) + cdf = cdf.reshape(cdf.shape[0], -1) + else: + cdf = qpd.cdf(x) + if cdf.ndim < 2: cdf = cdf[np.newaxis] + cdf = cdf.T return cdf -def _prep_qpd_params(self, alpha, qv_low, qv_median, qv_high): +def _prep_qpd_params(qv_low, qv_median, qv_high): """Prepare parameters for Johnson Quantile-Parameterized Distributions.""" - if not isinstance(alpha, np.ndarray): - alpha = np.array([alpha]) - qv_low, qv_median, qv_high = BaseDistribution._get_bc_params( - self, qv_low, qv_median, qv_high, oned_as="col" - ) - qv_low = qv_low.flatten() - qv_median = qv_median.flatten() - qv_high = qv_high.flatten() - return alpha, qv_low, qv_median, qv_high + qv = [qv_low, qv_median, qv_high] + for i, instance in enumerate(qv): + if isinstance(instance, float): + qv[i] = np.array([qv[i]]) + elif isinstance(instance, Sequence): + qv[i] = np.asarray(qv[i]) + qv_low = qv[0].flatten() + qv_median = qv[1].flatten() + qv_high = qv[2].flatten() + return qv_low, qv_median, qv_high From 8c599a55583a2d7f6394b7654dc058e2dd1dc791 Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Sun, 12 May 2024 23:12:57 +0900 Subject: [PATCH 48/52] mod input and resolve shape --- skpro/distributions/qpd.py | 99 ++++++++++++++++++++++++++--- skpro/regression/cyclic_boosting.py | 7 +- 2 files changed, 92 insertions(+), 14 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 43b4415be..5457bff7f 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -364,11 +364,37 @@ def _pdf(self, x: np.ndarray): def _ppf(self, p: np.ndarray): """Quantile function = percent point function = inverse cdf.""" - return ppf_func(p, self.qpd) + params = self.get_params(deep=False) + index = params["index"] + columns = params["columns"] + qv_low = params["qv_low"] + p_unique = np.unique(p) # de-broadcast + ppf_all = ppf_func(p_unique, self.qpd) + ppf_map = np.tile(p_unique, (qv_low.size, 1)).T + ppf = np.zeros((index.shape[0], len(columns))) + for r in range(p.shape[0]): + for c in range(p.shape[1]): + t = np.where(ppf_map[:, c] == p[r][c]) + ppf_part = ppf_all[t][c] + ppf[r][c] = ppf_part + return ppf def _cdf(self, x: np.ndarray): """Cumulative distribution function.""" - return cdf_func(x, self.qpd) + params = self.get_params(deep=False) + index = params["index"] + columns = params["columns"] + qv_low = params["qv_low"] + x_unique = np.unique(x) # de-broadcast + cdf_all = cdf_func(x_unique, self.qpd) + cdf_map = np.tile(x_unique, (qv_low.size, 1)).T + cdf = np.zeros((index.shape[0], len(columns))) + for r in range(x.shape[0]): + for c in range(x.shape[1]): + t = np.where(cdf_map[:, c] == x[r][c]) + cdf_part = cdf_all[t][c] + cdf[r][c] = cdf_part + return cdf @classmethod def get_test_params(cls, parameter_set="default"): @@ -578,11 +604,37 @@ def _pdf(self, x: np.ndarray): def _ppf(self, p: np.ndarray): """Quantile function = percent point function = inverse cdf.""" - return ppf_func(p, self.qpd) + params = self.get_params(deep=False) + index = params["index"] + columns = params["columns"] + qv_low = params["qv_low"] + p_unique = np.unique(p) # de-broadcast + ppf_all = ppf_func(p_unique, self.qpd) + ppf_map = np.tile(p_unique, (qv_low.size, 1)).T + ppf = np.zeros((index.shape[0], len(columns))) + for r in range(p.shape[0]): + for c in range(p.shape[1]): + t = np.where(ppf_map[:, c] == p[r][c]) + ppf_part = ppf_all[t][c] + ppf[r][c] = ppf_part + return ppf def _cdf(self, x: np.ndarray): """Cumulative distribution function.""" - return cdf_func(x, self.qpd) + params = self.get_params(deep=False) + index = params["index"] + columns = params["columns"] + qv_low = params["qv_low"] + x_unique = np.unique(x) # de-broadcast + cdf_all = cdf_func(x_unique, self.qpd) + cdf_map = np.tile(x_unique, (qv_low.size, 1)).T + cdf = np.zeros((index.shape[0], len(columns))) + for r in range(x.shape[0]): + for c in range(x.shape[1]): + t = np.where(cdf_map[:, c] == x[r][c]) + cdf_part = cdf_all[t][c] + cdf[r][c] = cdf_part + return cdf @classmethod def get_test_params(cls, parameter_set="default"): @@ -805,11 +857,37 @@ def _pdf(self, x: np.ndarray): def _ppf(self, p: np.ndarray): """Quantile function = percent point function = inverse cdf.""" - return ppf_func(p, self.qpd) + params = self.get_params(deep=False) + index = params["index"] + columns = params["columns"] + qv_low = params["qv_low"] + p_unique = np.unique(p) # de-broadcast + ppf_all = ppf_func(p_unique, self.qpd) + ppf_map = np.tile(p_unique, (qv_low.size, 1)).T + ppf = np.zeros((index.shape[0], len(columns))) + for r in range(p.shape[0]): + for c in range(p.shape[1]): + t = np.where(ppf_map[:, c] == p[r][c]) + ppf_part = ppf_all[t][c] + ppf[r][c] = ppf_part + return ppf def _cdf(self, x: np.ndarray): """Cumulative distribution function.""" - return cdf_func(x, self.qpd) + params = self.get_params(deep=False) + index = params["index"] + columns = params["columns"] + qv_low = params["qv_low"] + x_unique = np.unique(x) # de-broadcast + cdf_all = cdf_func(x_unique, self.qpd) + cdf_map = np.tile(x_unique, (qv_low.size, 1)).T + cdf = np.zeros((index.shape[0], len(columns))) + for r in range(x.shape[0]): + for c in range(x.shape[1]): + t = np.where(cdf_map[:, c] == x[r][c]) + cdf_part = cdf_all[t][c] + cdf[r][c] = cdf_part + return cdf @classmethod def get_test_params(cls, parameter_set="default"): @@ -874,7 +952,8 @@ def pdf_func(x: np.ndarray, qpd: J_QPD_S | J_QPD_B | list): else: cdf = qpd.cdf(x0) if cdf.ndim < 2: - cdf = cdf[:, np.newaxis] + for d in range(2 - cdf.ndim): + cdf = cdf[np.newaxis] cdf = cdf.T pdf_part = calc_pdf(cdf) pdf[r][c] = pdf_part[0][0] @@ -889,7 +968,8 @@ def ppf_func(x: np.ndarray, qpd: J_QPD_S | J_QPD_B | list): else: ppf = qpd.ppf(x) if ppf.ndim < 2: - ppf = ppf[np.newaxis] + for d in range(2 - ppf.ndim): + ppf = ppf[np.newaxis] ppf = ppf.T return ppf @@ -902,7 +982,8 @@ def cdf_func(x: np.ndarray, qpd: J_QPD_S | J_QPD_B | list): else: cdf = qpd.cdf(x) if cdf.ndim < 2: - cdf = cdf[np.newaxis] + for d in range(2 - cdf.ndim): + cdf = cdf[np.newaxis] cdf = cdf.T return cdf diff --git a/skpro/regression/cyclic_boosting.py b/skpro/regression/cyclic_boosting.py index 873a7e08d..19620db78 100644 --- a/skpro/regression/cyclic_boosting.py +++ b/skpro/regression/cyclic_boosting.py @@ -418,11 +418,8 @@ def _predict_quantiles(self, X, alpha): self.quantile_values = list() if is_given_proba: qpd = self.predict_proba(X.copy()) - if isinstance(quantiles, list): - quantile = [quantiles] - - p = pd.DataFrame(quantile, index=X.index, columns=columns) - quantiles = qpd.ppf(p) + pred = np.asarray([np.squeeze(qpd.ppf(q)) for q in quantiles]).T + quantiles = pd.DataFrame(pred, index=X.index, columns=columns) else: for est in self.quantile_est: From 265ac34843916f181fed597cce300715ac996b67 Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Sun, 12 May 2024 23:16:27 +0900 Subject: [PATCH 49/52] formatting --- skpro/distributions/qpd.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 5457bff7f..384933b7f 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -548,7 +548,6 @@ def __init__( u=self.upper, version=version, ) - super().__init__(index=index, columns=columns) def _mean(self): @@ -952,7 +951,7 @@ def pdf_func(x: np.ndarray, qpd: J_QPD_S | J_QPD_B | list): else: cdf = qpd.cdf(x0) if cdf.ndim < 2: - for d in range(2 - cdf.ndim): + for _ in range(2 - cdf.ndim): cdf = cdf[np.newaxis] cdf = cdf.T pdf_part = calc_pdf(cdf) @@ -968,7 +967,7 @@ def ppf_func(x: np.ndarray, qpd: J_QPD_S | J_QPD_B | list): else: ppf = qpd.ppf(x) if ppf.ndim < 2: - for d in range(2 - ppf.ndim): + for _ in range(2 - ppf.ndim): ppf = ppf[np.newaxis] ppf = ppf.T return ppf @@ -982,7 +981,7 @@ def cdf_func(x: np.ndarray, qpd: J_QPD_S | J_QPD_B | list): else: cdf = qpd.cdf(x) if cdf.ndim < 2: - for d in range(2 - cdf.ndim): + for _ in range(2 - cdf.ndim): cdf = cdf[np.newaxis] cdf = cdf.T return cdf From 823a32de38b2c16a10424c25a6bd46f0194f8cc6 Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Mon, 13 May 2024 00:04:42 +0900 Subject: [PATCH 50/52] miss commit --- skpro/distributions/qpd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 384933b7f..dd1cd30fa 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -952,7 +952,7 @@ def pdf_func(x: np.ndarray, qpd: J_QPD_S | J_QPD_B | list): cdf = qpd.cdf(x0) if cdf.ndim < 2: for _ in range(2 - cdf.ndim): - cdf = cdf[np.newaxis] + cdf = cdf[:, np.newaxis] cdf = cdf.T pdf_part = calc_pdf(cdf) pdf[r][c] = pdf_part[0][0] From 5773f27f361b6eb615d96aad104b865fce05bdd3 Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Mon, 13 May 2024 00:45:05 +0900 Subject: [PATCH 51/52] minor change --- skpro/distributions/qpd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index dd1cd30fa..d3c1a8129 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -175,7 +175,7 @@ def get_test_params(cls, parameter_set="default"): params4 = { "alpha": 0.12, "version": "logistic", - "qv_low": [0.25, 0.2, 0.22], + "qv_low": [0.15, 0.1, 0.15], "qv_median": [0.45, 0.51, 0.54], "qv_high": [0.85, 0.83, 0.81], "lower": 0.05, From e91ea5d98f4ddddbe48b29903cd3a83f424b3c4a Mon Sep 17 00:00:00 2001 From: setoguchi-naoki Date: Tue, 14 May 2024 17:46:21 +0900 Subject: [PATCH 52/52] docstring --- skpro/distributions/qpd.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index d3c1a8129..dc724c524 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -51,7 +51,7 @@ class QPD_Johnson(_DelegatedDistribution): quantile function value of quantile 0.5 qv_high : float or array_like[float] quantile function value of quantile ``1 - alpha`` - lower : float + lower : float, default = None lower bound of bounded range for QPD. This is used when estimating QPD and calculating expectation and variance @@ -207,7 +207,7 @@ class QPD_S(BaseDistribution): lower bound of semi-bounded range. This is used when estimating QPD and calculating expectation and variance - upper : float, default = None + upper : float, default = 1e3 upper bound of probability density function to calculate expected value and variance version: str @@ -445,7 +445,7 @@ class QPD_B(BaseDistribution): lower bound of semi-bounded range. This is used when estimating QPD and calculating expectation and variance - upper : float, default = None + upper : float upper bound of semi-bounded range. This is used when estimating QPD and calculating expectation and variance @@ -682,11 +682,11 @@ class QPD_U(BaseDistribution): quantile function value of quantile 0.5 qv_high : float or array_like[float] quantile function value of quantile ``1 - alpha`` - lower : float, default = None + lower : float, default = -1e3 lower bound of probability density function to calculate expected value and variance expectation and variance - upper : float, default = None + upper : float, default = 1e3 upper bound of probability density function to calculate expected value and variance version: str, optional, default="normal"