From 831938f46d49b0b4538800b443f7a8bcb762e50c Mon Sep 17 00:00:00 2001 From: statmlben Date: Thu, 22 Aug 2024 21:54:50 +0800 Subject: [PATCH] plqERM_Ridge done --- .ipynb_checkpoints/FairSVM-checkpoint.ipynb | 223 ++++++++++++ .ipynb_checkpoints/QR-checkpoint.ipynb | 123 +++++++ .ipynb_checkpoints/SVM-checkpoint.ipynb | 108 ++++++ README.md | 2 +- doc/source/examples/FairSVM.ipynb | 223 ++++++++++++ doc/source/examples/QR.ipynb | 123 +++++++ doc/source/examples/ReHLine_QR.ipynb | 176 ---------- doc/source/examples/ReHLine_SVM_FairSVM.ipynb | 187 ---------- doc/source/examples/SVM.ipynb | 108 ++++++ doc/source/tutorials.rst | 39 +++ rehline/__init__.py | 7 +- rehline/_base.py | 290 +++++++++++++++- rehline/_class.py | 324 +++++------------- 13 files changed, 1319 insertions(+), 614 deletions(-) create mode 100644 .ipynb_checkpoints/FairSVM-checkpoint.ipynb create mode 100644 .ipynb_checkpoints/QR-checkpoint.ipynb create mode 100644 .ipynb_checkpoints/SVM-checkpoint.ipynb create mode 100644 doc/source/examples/FairSVM.ipynb create mode 100644 doc/source/examples/QR.ipynb delete mode 100644 doc/source/examples/ReHLine_QR.ipynb delete mode 100644 doc/source/examples/ReHLine_SVM_FairSVM.ipynb create mode 100644 doc/source/examples/SVM.ipynb diff --git a/.ipynb_checkpoints/FairSVM-checkpoint.ipynb b/.ipynb_checkpoints/FairSVM-checkpoint.ipynb new file mode 100644 index 0000000..422100e --- /dev/null +++ b/.ipynb_checkpoints/FairSVM-checkpoint.ipynb @@ -0,0 +1,223 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "50711dda-105e-4714-937b-e8be06370605", + "metadata": {}, + "source": [ + "# **FairSVM**\n", + "\n", + "[![Slides](https://img.shields.io/badge/🦌-ReHLine-blueviolet)](https://rehline-python.readthedocs.io/en/latest/)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1f_7t1t6FNxAooQOmpyhHCOVq0IKgMxe-?usp=sharing)\n", + "\n", + "The FairSVM solves the following optimization 
problem:\n", + "\n", + "$$\n", + "\\begin{align}\n", + " & \\min_{\\mathbf{\\beta} \\in \\mathbb{R}^d} \\frac{C}{n} \\sum_{i=1}^n ( 1 - y_i \\mathbf{\\beta}^\\intercal \\mathbf{x}_i )_+ + \\frac{1}{2} \\| \\mathbf{\\beta} \\|_2^2, \\nonumber \\\\\n", + " \\text{subject to } & \\quad \\frac{1}{n} \\sum_{i=1}^n \\mathbf{z}_i \\mathbf{\\beta}^\\intercal \\mathbf{x}_i \\leq \\mathbf{\\rho}, \\quad \\frac{1}{n} \\sum_{i=1}^n \\mathbf{z}_i \\mathbf{\\beta}^\\intercal \\mathbf{x}_i \\geq -\\mathbf{\\rho},\n", + "\\end{align}\n", + "$$\n", + "\n", + "where:\n", + "\n", + "* $\\mathbf{x}_i \\in \\mathbb{R}^d$ is a feature vector\n", + "* $y_i \\in \\{-1, 1\\}$ is a binary label\n", + "* $\\mathbf{z}_i$ is a collection of **centered sensitive features**, such as gender and/or race, satisfying:\n", + "\n", + "$$\\sum_{i=1}^n z_{ij} = 0,$$\n", + "\n", + "* $\\mathbf{z}_i \\in \\mathbb{R}^{d_0}$ is a $d_0$-length sensitive feature vector\n", + "* $\\mathbf{\\rho} \\in \\mathbb{R}_+^{d_0}$ is a vector of constants that trade-off predictive accuracy and fairness\n", + "\n", + "The constraints limit the correlation between the sensitive features and the decision function, ensuring fairness in the predictions.\n", + "> **Note.** Since the hinge loss is a plq function, and fairness constraints are linear, we can solve it by `rehline.plqERM_Ridge`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e66268fa-403d-402b-9ea1-fbfe7573af40", + "metadata": {}, + "outputs": [], + "source": [ + "## simulate data\n", + "from sklearn.datasets import make_classification\n", + "from sklearn.preprocessing import StandardScaler\n", + "import numpy as np\n", + "\n", + "scaler = StandardScaler()\n", + "\n", + "n, d = 10000, 5\n", + "X, y = make_classification(n_samples=n, n_features=d, n_redundant=0)\n", + "## convert y to +1/-1\n", + "y = 2*y - 1\n", + "X = scaler.fit_transform(X)\n", + "\n", + "## we take the first column of X as the sensitive feature, and tol is 0.1\n", + "X_sen = X[:,0]\n", + "tol_sen = 0.1" + ] + }, + { + "cell_type": "markdown", + "id": "6a576a09-b700-49cd-b500-219f3a6e40b0", + "metadata": {}, + "source": [ + "## SVM as baseline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "15531796-3a45-42b3-8a99-da0343be9d4d", + "metadata": {}, + "outputs": [], + "source": [ + "## we first run an SVM\n", + "from rehline import plqERM_Ridge\n", + "\n", + "clf = plqERM_Ridge(loss={'name': 'svm'}, C=1.0, max_iter=50000)\n", + "clf.fit(X=X, y=y)" + ] + }, + { + "cell_type": "markdown", + "id": "79bb275b-2dfd-4608-83e3-b4b4eb0fdb72", + "metadata": {}, + "source": [ + "## FairSVM" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c43509f7-031b-4620-bc5e-fb5aea2ef1c2", + "metadata": {}, + "outputs": [], + "source": [ + "## solve FairSVM via `plqERM_Ridge` by adding `constraint`\n", + "from rehline import plqERM_Ridge\n", + "\n", + "fclf = plqERM_Ridge(loss={'name': 'svm'},\n", + " constraint=[{'name': 'fair',\n", + " 'X_sen': X_sen,\n", + " 'tol_sen': tol_sen}],\n", + " C=1.0,\n", + " max_iter=50000)\n", + "fclf.fit(X=X, y=y)" + ] + }, + { + "cell_type": "markdown", + "id": "794ede1f-13a4-4889-b6d9-f19a61faa510", + "metadata": {}, + "source": [ + "## Results" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "05dc3921-1837-474e-9a6d-4555a94ddc30", + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Model Train Performance Correlation with Sensitive Features\n", + " SVM 0.8853 2.535203\n", + "FairSVM 0.5856 0.100212\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "## score\n", + "score = clf.decision_function(X)\n", + "fscore = fclf.decision_function(X)\n", + "\n", + "svm_perf = len(y[score*y > 0])/n\n", + "fsvm_perf = len(y[fscore*y > 0])/n\n", + "\n", + "svm_corr = score.dot(X_sen) / n\n", + "fsvm_corr = fscore.dot(X_sen) / n\n", + "\n", + "# Create a pandas DataFrame to store the results\n", + "results = pd.DataFrame({\n", + " 'Model': ['SVM', 'FairSVM'],\n", + " 'Train Performance': [svm_perf, fsvm_perf],\n", + " 'Correlation with Sensitive Features': [svm_corr, fsvm_corr]\n", + "})\n", + "\n", + "# Print the results as a table\n", + "print(results.to_string(index=False))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ad5a863e-fbbb-4caf-876d-374f3ca9b891", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import seaborn as sns\n", + "import pandas as pd\n", + "import warnings\n", + "import matplotlib.pyplot as plt\n", + "warnings.filterwarnings(\"ignore\", \"is_categorical_dtype\")\n", + "warnings.filterwarnings(\"ignore\", \"use_inf_as_na\")\n", + "\n", + "df = pd.DataFrame({'score': score, 'fscore': fscore, 'y': y})\n", + "\n", + "sns.histplot(df, x=\"score\", hue=\"y\").set_title(\"SVM\")\n", + "plt.show()\n", + "sns.histplot(df, x=\"fscore\", hue=\"y\").set_title(\"FairSVM\")\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/.ipynb_checkpoints/QR-checkpoint.ipynb b/.ipynb_checkpoints/QR-checkpoint.ipynb new file mode 100644 index 0000000..75640d9 --- /dev/null +++ b/.ipynb_checkpoints/QR-checkpoint.ipynb @@ -0,0 +1,123 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e3a11293-4739-476e-a513-48a256d425a2", + "metadata": {}, + "source": [ + "## **Ridge Quantile Regression**\n", + "\n", + "[![Slides](https://img.shields.io/badge/🦌-ReHLine-blueviolet)](https://rehline-python.readthedocs.io/en/latest/)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1LwatjwjnMSB97eLVyuOiUY3sl3A3Ie__?usp=sharing)\n", + "\n", + "The regularized quantile regression solves the following optimization problem:\n", + "\n", + "$$\n", + "\\min_{\\beta \\in \\mathbb{R}^{d}} \\ C \\sum_{i=1}^n \\rho_\\kappa ( y_i - x^\\intercal_i \\beta ) + \\frac{1}{2} \\| \\beta \\|^2,\n", + "$$\n", + "\n", + "where 
$\\rho_\\kappa(u) = u\\cdot(\\kappa - \\mathbf{1}(u < 0))$ is the check loss,\n", + "$x_i \\in \\mathbb{R}^d$ is a feature vector, $y_i \\in \\mathbb{R}$ is the response variable.\n", + "\n", + "> **Note.** Since the check loss is a plq function, we can solve it by `rehline.plqERM_Ridge`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b2dd4ce5-bc27-41a4-89ab-7920d393f377", + "metadata": {}, + "outputs": [], + "source": [ + "## simulate data\n", + "from sklearn.datasets import make_regression\n", + "from sklearn.preprocessing import StandardScaler\n", + "import numpy as np\n", + "\n", + "scaler = StandardScaler()\n", + "\n", + "n, d = 10000, 5\n", + "X, y = make_regression(n_samples=n, n_features=d, noise=1.0)\n", + "X = scaler.fit_transform(X)\n", + "## add intercept\n", + "X = np.hstack((X,np.ones((n,1))))\n", + "y = y/y.std()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "80129ee6-f886-4e27-a764-630f15826bca", + "metadata": {}, + "outputs": [], + "source": [ + "## solve QR with different `qt` via `plqERM_Ridge`\n", + "from rehline import plqERM_Ridge\n", + "\n", + "clf5 = plqERM_Ridge(loss={'name': 'QR', 'qt': 0.05}, C=10.0/n)\n", + "clf5.fit(X=X, y=y)\n", + "\n", + "clf95 = plqERM_Ridge(loss={'name': 'QR', 'qt': 0.95}, C=10.0/n)\n", + "clf95.fit(X=X, y=y)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1d8b90e9-6af9-4856-9751-6fe6fbc7665c", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "## plot QR results\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import warnings\n", + "import matplotlib.pyplot as plt\n", + "warnings.filterwarnings(\"ignore\", \"is_categorical_dtype\")\n", + "\n", + "n_sample = 50\n", + "X_sample, y_sample = X[:n_sample], y[:n_sample]\n", + "q05_sample = clf5.decision_function(X_sample)\n", + "q95_sample = clf95.decision_function(X_sample)\n", + "\n", + "df = pd.DataFrame({'x0': X_sample[:,0], 'real_y': y_sample, 'q05': q05_sample, 'q95': q95_sample})\n", + "df = df.melt(id_vars='x0')\n", + "\n", + "sns.scatterplot(data=df, x='x0', y='value', hue='variable')\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/.ipynb_checkpoints/SVM-checkpoint.ipynb b/.ipynb_checkpoints/SVM-checkpoint.ipynb new file mode 100644 index 0000000..05e9fe0 --- /dev/null +++ b/.ipynb_checkpoints/SVM-checkpoint.ipynb @@ -0,0 +1,108 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fbcb401d-6ca6-4933-abd5-f8f504282416", + "metadata": {}, + "source": [ + "# **SVM**\n", + "\n", + "[![Slides](https://img.shields.io/badge/🦌-ReHLine-blueviolet)](https://rehline-python.readthedocs.io/en/latest/)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1f_7t1t6FNxAooQOmpyhHCOVq0IKgMxe-?usp=sharing)\n", + "\n", + "SVMs solve the following optimization problem:\n", + "$$\n", + " \\min_{\\mathbf{\\beta} \\in \\mathbb{R}^d} \\ C \\sum_{i=1}^n ( 1 - y_i 
\\mathbf{\\beta}^\\intercal \\mathbf{x}_i )_+ + \\frac{1}{2} \\| \\mathbf{\\beta} \\|_2^2\n", + "$$\n", + "where $\\mathbf{x}_i \\in \\mathbb{R}^d$ is a feature vector, and $y_i \\in \\{-1, 1\\}$ is a binary label.\n", + "\n", + "> **Note.** Since the hinge loss is a plq function, we can solve it by `rehline.plqERM_Ridge`." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "2dd1c096-e0df-492f-be63-8ac272007237", + "metadata": {}, + "outputs": [], + "source": [ + "## simulate data\n", + "from sklearn.datasets import make_classification\n", + "from sklearn.preprocessing import StandardScaler\n", + "import numpy as np\n", + "\n", + "scaler = StandardScaler()\n", + "\n", + "n, d = 10000, 5\n", + "X, y = make_classification(n_samples=n, n_features=d)\n", + "X = scaler.fit_transform(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "aece9fbe-f9be-40ae-8179-b44849fb0fd3", + "metadata": {}, + "outputs": [], + "source": [ + "## solve SVM via `plqERM_Ridge`\n", + "from rehline import plqERM_Ridge\n", + "\n", + "clf = plqERM_Ridge(loss={'name': 'svm'}, C=1.0)\n", + "clf.fit(X=X, y=y)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "93719987-c6b3-4a9b-9b40-c35e5bf90ef0", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import seaborn as sns\n", + "import pandas as pd\n", + "import warnings\n", + "import matplotlib.pyplot as plt\n", + "warnings.filterwarnings(\"ignore\", \"is_categorical_dtype\")\n", + "warnings.filterwarnings(\"ignore\", \"use_inf_as_na\")\n", + "\n", + "score = clf.decision_function(X)\n", + "df = pd.DataFrame({'score': score, 'y': y})\n", + "sns.histplot(df, x=\"score\", hue=\"y\")\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/README.md b/README.md index c90a90c..47b744d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# ReHLine +# ReHLine **ReHLine** is designed to be a computationally efficient and practically useful software package for large-scale empirical risk minimization (ERM) problems. 
diff --git a/doc/source/examples/FairSVM.ipynb b/doc/source/examples/FairSVM.ipynb new file mode 100644 index 0000000..422100e --- /dev/null +++ b/doc/source/examples/FairSVM.ipynb @@ -0,0 +1,223 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "50711dda-105e-4714-937b-e8be06370605", + "metadata": {}, + "source": [ + "# **FairSVM**\n", + "\n", + "[![Slides](https://img.shields.io/badge/🦌-ReHLine-blueviolet)](https://rehline-python.readthedocs.io/en/latest/)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1f_7t1t6FNxAooQOmpyhHCOVq0IKgMxe-?usp=sharing)\n", + "\n", + "The FairSVM solves the following optimization problem:\n", + "\n", + "$$\n", + "\\begin{align}\n", + " & \\min_{\\mathbf{\\beta} \\in \\mathbb{R}^d} \\frac{C}{n} \\sum_{i=1}^n ( 1 - y_i \\mathbf{\\beta}^\\intercal \\mathbf{x}_i )_+ + \\frac{1}{2} \\| \\mathbf{\\beta} \\|_2^2, \\nonumber \\\\\n", + " \\text{subject to } & \\quad \\frac{1}{n} \\sum_{i=1}^n \\mathbf{z}_i \\mathbf{\\beta}^\\intercal \\mathbf{x}_i \\leq \\mathbf{\\rho}, \\quad \\frac{1}{n} \\sum_{i=1}^n \\mathbf{z}_i \\mathbf{\\beta}^\\intercal \\mathbf{x}_i \\geq -\\mathbf{\\rho},\n", + "\\end{align}\n", + "$$\n", + "\n", + "where:\n", + "\n", + "* $\\mathbf{x}_i \\in \\mathbb{R}^d$ is a feature vector\n", + "* $y_i \\in \\{-1, 1\\}$ is a binary label\n", + "* $\\mathbf{z}_i$ is a collection of **centered sensitive features**, such as gender and/or race, satisfying:\n", + "\n", + "$$\\sum_{i=1}^n z_{ij} = 0,$$\n", + "\n", + "* $\\mathbf{z}_i \\in \\mathbb{R}^{d_0}$ is a $d_0$-length sensitive feature vector\n", + "* $\\mathbf{\\rho} \\in \\mathbb{R}_+^{d_0}$ is a vector of constants that trade-off predictive accuracy and fairness\n", + "\n", + "The constraints limit the correlation between the sensitive features and the decision function, ensuring fairness in the predictions.\n", + "> **Note.** Since the hinge loss is a plq function, and fairness constraints 
are linear, we can solve it by `rehline.plqERM_Ridge`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e66268fa-403d-402b-9ea1-fbfe7573af40", + "metadata": {}, + "outputs": [], + "source": [ + "## simulate data\n", + "from sklearn.datasets import make_classification\n", + "from sklearn.preprocessing import StandardScaler\n", + "import numpy as np\n", + "\n", + "scaler = StandardScaler()\n", + "\n", + "n, d = 10000, 5\n", + "X, y = make_classification(n_samples=n, n_features=d, n_redundant=0)\n", + "## convert y to +1/-1\n", + "y = 2*y - 1\n", + "X = scaler.fit_transform(X)\n", + "\n", + "## we take the first column of X as the sensitive feature, and tol is 0.1\n", + "X_sen = X[:,0]\n", + "tol_sen = 0.1" + ] + }, + { + "cell_type": "markdown", + "id": "6a576a09-b700-49cd-b500-219f3a6e40b0", + "metadata": {}, + "source": [ + "## SVM as baseline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "15531796-3a45-42b3-8a99-da0343be9d4d", + "metadata": {}, + "outputs": [], + "source": [ + "## we first run an SVM\n", + "from rehline import plqERM_Ridge\n", + "\n", + "clf = plqERM_Ridge(loss={'name': 'svm'}, C=1.0, max_iter=50000)\n", + "clf.fit(X=X, y=y)" + ] + }, + { + "cell_type": "markdown", + "id": "79bb275b-2dfd-4608-83e3-b4b4eb0fdb72", + "metadata": {}, + "source": [ + "## FairSVM" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c43509f7-031b-4620-bc5e-fb5aea2ef1c2", + "metadata": {}, + "outputs": [], + "source": [ + "## solve FairSVM via `plqERM_Ridge` by adding `constraint`\n", + "from rehline import plqERM_Ridge\n", + "\n", + "fclf = plqERM_Ridge(loss={'name': 'svm'},\n", + " constraint=[{'name': 'fair',\n", + " 'X_sen': X_sen,\n", + " 'tol_sen': tol_sen}],\n", + " C=1.0,\n", + " max_iter=50000)\n", + "fclf.fit(X=X, y=y)" + ] + }, + { + "cell_type": "markdown", + "id": "794ede1f-13a4-4889-b6d9-f19a61faa510", + "metadata": {}, + "source": [ + "## Results" + ] + }, + { + "cell_type": "code", + 
"execution_count": 4, + "id": "05dc3921-1837-474e-9a6d-4555a94ddc30", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Model Train Performance Correlation with Sensitive Features\n", + " SVM 0.8853 2.535203\n", + "FairSVM 0.5856 0.100212\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "## score\n", + "score = clf.decision_function(X)\n", + "fscore = fclf.decision_function(X)\n", + "\n", + "svm_perf = len(y[score*y > 0])/n\n", + "fsvm_perf = len(y[fscore*y > 0])/n\n", + "\n", + "svm_corr = score.dot(X_sen) / n\n", + "fsvm_corr = fscore.dot(X_sen) / n\n", + "\n", + "# Create a pandas DataFrame to store the results\n", + "results = pd.DataFrame({\n", + " 'Model': ['SVM', 'FairSVM'],\n", + " 'Train Performance': [svm_perf, fsvm_perf],\n", + " 'Correlation with Sensitive Features': [svm_corr, fsvm_corr]\n", + "})\n", + "\n", + "# Print the results as a table\n", + "print(results.to_string(index=False))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ad5a863e-fbbb-4caf-876d-374f3ca9b891", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import seaborn as sns\n", + "import pandas as pd\n", + "import warnings\n", + "import matplotlib.pyplot as plt\n", + "warnings.filterwarnings(\"ignore\", \"is_categorical_dtype\")\n", + "warnings.filterwarnings(\"ignore\", \"use_inf_as_na\")\n", + "\n", + "df = pd.DataFrame({'score': score, 'fscore': fscore, 'y': y})\n", + "\n", + "sns.histplot(df, x=\"score\", hue=\"y\").set_title(\"SVM\")\n", + "plt.show()\n", + "sns.histplot(df, x=\"fscore\", hue=\"y\").set_title(\"FairSVM\")\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/source/examples/QR.ipynb b/doc/source/examples/QR.ipynb new file mode 100644 index 0000000..75640d9 --- /dev/null +++ b/doc/source/examples/QR.ipynb @@ -0,0 +1,123 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e3a11293-4739-476e-a513-48a256d425a2", + "metadata": {}, + "source": [ + "## **Ridge Quantile Regression**\n", + "\n", + "[![Slides](https://img.shields.io/badge/🦌-ReHLine-blueviolet)](https://rehline-python.readthedocs.io/en/latest/)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1LwatjwjnMSB97eLVyuOiUY3sl3A3Ie__?usp=sharing)\n", + "\n", + "The regularized quantile regression solves the following optimization problem:\n", + "\n", + "$$\n", + "\\min_{\\beta \\in \\mathbb{R}^{d}} \\ C \\sum_{i=1}^n \\rho_\\kappa ( y_i - x^\\intercal_i \\beta ) + \\frac{1}{2} \\| \\beta \\|^2,\n", + "$$\n", + "\n", + "where $\\rho_\\kappa(u) = u\\cdot(\\kappa - 
\\mathbf{1}(u < 0))$ is the check loss,\n", + "$x_i \\in \\mathbb{R}^d$ is a feature vector, $y_i \\in \\mathbb{R}$ is the response variable.\n", + "\n", + "> **Note.** Since the check loss is a plq function, we can solve it by `rehline.plqERM_Ridge`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b2dd4ce5-bc27-41a4-89ab-7920d393f377", + "metadata": {}, + "outputs": [], + "source": [ + "## simulate data\n", + "from sklearn.datasets import make_regression\n", + "from sklearn.preprocessing import StandardScaler\n", + "import numpy as np\n", + "\n", + "scaler = StandardScaler()\n", + "\n", + "n, d = 10000, 5\n", + "X, y = make_regression(n_samples=n, n_features=d, noise=1.0)\n", + "X = scaler.fit_transform(X)\n", + "## add intercept\n", + "X = np.hstack((X,np.ones((n,1))))\n", + "y = y/y.std()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "80129ee6-f886-4e27-a764-630f15826bca", + "metadata": {}, + "outputs": [], + "source": [ + "## solve QR with different `qt` via `plqERM_Ridge`\n", + "from rehline import plqERM_Ridge\n", + "\n", + "clf5 = plqERM_Ridge(loss={'name': 'QR', 'qt': 0.05}, C=10.0/n)\n", + "clf5.fit(X=X, y=y)\n", + "\n", + "clf95 = plqERM_Ridge(loss={'name': 'QR', 'qt': 0.95}, C=10.0/n)\n", + "clf95.fit(X=X, y=y)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1d8b90e9-6af9-4856-9751-6fe6fbc7665c", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "## plot QR results\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import warnings\n", + "import matplotlib.pyplot as plt\n", + "warnings.filterwarnings(\"ignore\", \"is_categorical_dtype\")\n", + "\n", + "n_sample = 50\n", + "X_sample, y_sample = X[:n_sample], y[:n_sample]\n", + "q05_sample = clf5.decision_function(X_sample)\n", + "q95_sample = clf95.decision_function(X_sample)\n", + "\n", + "df = pd.DataFrame({'x0': X_sample[:,0], 'real_y': y_sample, 'q05': q05_sample, 'q95': q95_sample})\n", + "df = df.melt(id_vars='x0')\n", + "\n", + "sns.scatterplot(data=df, x='x0', y='value', hue='variable')\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/source/examples/ReHLine_QR.ipynb b/doc/source/examples/ReHLine_QR.ipynb deleted file mode 100644 index 92dcd4f..0000000 --- a/doc/source/examples/ReHLine_QR.ipynb +++ /dev/null @@ -1,176 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "## **Example: Solving Quantile Regression via ReHLine**\n", - "\n", - "[![Slides](https://img.shields.io/badge/🦌-ReHLine-blueviolet)](https://rehline-python.readthedocs.io/en/latest/)\n", - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1LwatjwjnMSB97eLVyuOiUY3sl3A3Ie__?usp=sharing)" - ], - "metadata": { - "id": "l-wsw7CJor38" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "adidty-mclLB", - "outputId": "91d57f24-91fe-4809-ec3d-bfbb5a9346d2" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting rehline\n", - " Downloading rehline-0.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (147 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m147.1/147.1 kB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: requests>=2.27.0 in /usr/local/lib/python3.10/dist-packages (from rehline) (2.31.0)\n", - "Collecting pybind11>=2.11.1 (from rehline)\n", - " Downloading pybind11-2.13.1-py3-none-any.whl (238 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m238.8/238.8 kB\u001b[0m \u001b[31m13.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: numpy>=1.23.5 in /usr/local/lib/python3.10/dist-packages (from rehline) (1.25.2)\n", - "Requirement already satisfied: scipy>=1.11.4 in /usr/local/lib/python3.10/dist-packages (from rehline) (1.11.4)\n", - "Requirement already satisfied: scikit-learn>=1.2.2 in /usr/local/lib/python3.10/dist-packages (from rehline) (1.2.2)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.27.0->rehline) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.27.0->rehline) (3.7)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.27.0->rehline) 
(2.0.7)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.27.0->rehline) (2024.6.2)\n", - "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.2.2->rehline) (1.4.2)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.2.2->rehline) (3.5.0)\n", - "Installing collected packages: pybind11, rehline\n", - "Successfully installed pybind11-2.13.1 rehline-0.0.3\n" - ] - } - ], - "source": [ - "!pip install rehline" - ] - }, - { - "cell_type": "code", - "source": [ - "## simulate data\n", - "from sklearn.datasets import make_regression\n", - "from sklearn.preprocessing import StandardScaler\n", - "import numpy as np\n", - "\n", - "scaler = StandardScaler()\n", - "\n", - "n, d = 10000, 5\n", - "X, y = make_regression(n_samples=n, n_features=d, noise=1.0)\n", - "X = scaler.fit_transform(X)\n", - "y = y/y.std()" - ], - "metadata": { - "id": "WYZq1rWWctNl" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "from rehline import ReHLine\n", - "\n", - "qt = [0.25, 0.5, 0.75]\n", - "\n", - "clf = ReHLine(C=1.0/n)\n", - "X_fake = clf.make_ReLHLoss(X=X, y=y, loss={'name':'QR', 'qt':qt})\n", - "clf.fit(X_fake)\n", - "\n", - "## the first d params are the linear coefficients\n", - "## and the last 3 params are the quantile-specific intercept\n", - "clf.coef_" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "tWpmFfCxdM7K", - "outputId": "3fb01216-8ab6-4411-c40e-d5d1e133cdea" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "array([ 0.59688881, 0.39743301, 0.17675816, 0.45894896, 0.47806579,\n", - " -0.0134518 , -0.00152496, 0.01026677])" - ] - }, - "metadata": {}, - "execution_count": 47 - } - ] - }, - { - "cell_type": "code", - "source": [ - 
"score = [X.dot(clf.coef_[:d]) + clf.coef_[d+l] for l in range(len(qt))]\n", - "\n", - "## report Qs for some samples\n", - "X_sample, y_sample = X[:5], y[:5]\n", - "q_sample = [X_sample.dot(clf.coef_[:d]) + clf.coef_[d+l] for l in range(len(qt))]\n", - "\n", - "q_sample = np.array(q_sample).T\n", - "\n", - "print('fitted quantiles: %s \\n' %qt)\n", - "print(q_sample)\n", - "print('\\n Real Y: \\n')\n", - "print(y_sample[:,np.newaxis])" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hFqz2fI-fAr9", - "outputId": "983ee042-1729-4de2-f9d7-74824d7894fd" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "fitted quantiles: [0.25, 0.5, 0.75] \n", - "\n", - "[[-0.20078214 -0.18885531 -0.17706358]\n", - " [-2.48210771 -2.47018088 -2.45838915]\n", - " [-1.31540194 -1.30347511 -1.29168338]\n", - " [-2.70665164 -2.69472481 -2.68293308]\n", - " [ 0.37165816 0.38358499 0.39537672]]\n", - "\n", - " Real Y: \n", - "\n", - "[[-0.19786913]\n", - " [-2.51954039]\n", - " [-1.33318935]\n", - " [-2.74956754]\n", - " [ 0.39217369]]\n" - ] - } - ] - } - ] -} \ No newline at end of file diff --git a/doc/source/examples/ReHLine_SVM_FairSVM.ipynb b/doc/source/examples/ReHLine_SVM_FairSVM.ipynb deleted file mode 100644 index ea0b51c..0000000 --- a/doc/source/examples/ReHLine_SVM_FairSVM.ipynb +++ /dev/null @@ -1,187 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# **Example: solving SVM and FairSVM via ReHLine**\n", - "\n", - "[![Slides](https://img.shields.io/badge/🦌-ReHLine-blueviolet)](https://rehline-python.readthedocs.io/en/latest/)\n", - "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1f_7t1t6FNxAooQOmpyhHCOVq0IKgMxe-?usp=sharing)" - ], - "metadata": { - "id": "hK6foc_zTi1U" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "gYwC0MP9Cpze", - "outputId": "0eb3aed9-82dc-4dff-be33-232ee9c72deb" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting rehline\n", - " Downloading rehline-0.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (147 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m147.1/147.1 kB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: requests>=2.27.0 in /usr/local/lib/python3.10/dist-packages (from rehline) (2.31.0)\n", - "Collecting pybind11>=2.11.1 (from rehline)\n", - " Downloading pybind11-2.13.1-py3-none-any.whl (238 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m238.8/238.8 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: numpy>=1.23.5 in /usr/local/lib/python3.10/dist-packages (from rehline) (1.25.2)\n", - "Requirement already satisfied: scipy>=1.11.4 in /usr/local/lib/python3.10/dist-packages (from rehline) (1.11.4)\n", - "Requirement already satisfied: scikit-learn>=1.2.2 in /usr/local/lib/python3.10/dist-packages (from rehline) (1.2.2)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.27.0->rehline) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.27.0->rehline) (3.7)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.27.0->rehline) 
(2.0.7)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.27.0->rehline) (2024.6.2)\n", - "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.2.2->rehline) (1.4.2)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.2.2->rehline) (3.5.0)\n", - "Installing collected packages: pybind11, rehline\n", - "Successfully installed pybind11-2.13.1 rehline-0.0.3\n" - ] - } - ], - "source": [ - "## install package\n", - "!pip install rehline" - ] - }, - { - "cell_type": "code", - "source": [ - "## simulate data\n", - "import rehline\n", - "from rehline import ReHLine\n", - "import numpy as np\n", - "\n", - "n, d = 100000, 5\n", - "\n", - "X, y, X_sen = rehline.make_fair_classification(n_samples=n,\n", - " n_features=d,\n", - " ind_sensitive=0)" - ], - "metadata": { - "id": "BELfW8bYE_bp" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "## fitting SVMs\n", - "clf = ReHLine(loss={'name': 'svm'}, C=1./n)\n", - "\n", - "clf.make_ReLHLoss(X=X, y=y, loss={'name': 'svm'})\n", - "\n", - "clf.fit(X=X)" - ], - "metadata": { - "id": "gaMe2UYSFSFO" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "## report performance and sensitive correlation\n", - "score = X @ clf.coef_\n", - "\n", - "print('Classification Acc: %.3f' %(len(score[score*y > 0]) / len(score)))\n", - "print('Sensitive Correlation: %.3f' %( score.dot(X_sen) / n ))" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "8tjCbX4OGnKr", - "outputId": "d4cb9e3b-1b30-4583-9843-b137958e8980" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Classification Acc: 0.953\n", - "Sensitive Correlation: 0.898\n" - ] - } - ] - }, - { - "cell_type": "code", 
- "source": [ - "## fitting Fair-SVMs\n", - "\n", - "rho = 0.01\n", - "\n", - "clf_fair = ReHLine(loss={'name': 'svm'}, C=1./n)\n", - "\n", - "clf_fair.make_ReLHLoss(X=X, y=y, loss={'name': 'svm'})\n", - "A = np.repeat([X_sen @ X], repeats=[2], axis=0) / n\n", - "A[1] = -A[1]\n", - "b = np.array([rho, rho])\n", - "\n", - "clf_fair.A, clf_fair.b = A, b\n", - "clf_fair.fit(X=X)" - ], - "metadata": { - "id": "uSU-t7sEIVY_" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "## report performance and sensitive correlation\n", - "\n", - "score = X @ clf_fair.coef_\n", - "\n", - "print('Classification Acc: %.3f' %(len(score[score*y > 0]) / len(score)))\n", - "print('Sensitive Correlation: %.3f' %( score.dot(X_sen) / n ))" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "OXx4TwVYI6b6", - "outputId": "279e1c88-cae4-4cf5-f774-25ff9bc63b1e" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Classification Acc: 0.574\n", - "Sensitive Correlation: 0.010\n" - ] - } - ] - } - ] -} \ No newline at end of file diff --git a/doc/source/examples/SVM.ipynb b/doc/source/examples/SVM.ipynb new file mode 100644 index 0000000..05e9fe0 --- /dev/null +++ b/doc/source/examples/SVM.ipynb @@ -0,0 +1,108 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fbcb401d-6ca6-4933-abd5-f8f504282416", + "metadata": {}, + "source": [ + "# **SVM**\n", + "\n", + "[![Slides](https://img.shields.io/badge/🦌-ReHLine-blueviolet)](https://rehline-python.readthedocs.io/en/latest/)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1f_7t1t6FNxAooQOmpyhHCOVq0IKgMxe-?usp=sharing)\n", + "\n", + "SVMs solve the following optimization problem:\n", + "$$\n", + " \\min_{\\mathbf{\\beta} \\in \\mathbb{R}^d} \\ C \\sum_{i=1}^n ( 1 - y_i \\mathbf{\\beta}^\\intercal \\mathbf{x}_i )_+ + \\frac{1}{2} \\| 
\\mathbf{\\beta} \\|_2^2\n", + "$$\n", + "where $\\mathbf{x}_i \\in \\mathbb{R}^d$ is a feature vector, and $y_i \\in \\{-1, 1\\}$ is a binary label.\n", + "\n", + "> **Note.** Since the hinge loss is a plq function, thus we can solve it by `rehline.plqERM_Ridge`." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "2dd1c096-e0df-492f-be63-8ac272007237", + "metadata": {}, + "outputs": [], + "source": [ + "## simulate data\n", + "from sklearn.datasets import make_classification\n", + "from sklearn.preprocessing import StandardScaler\n", + "import numpy as np\n", + "\n", + "scaler = StandardScaler()\n", + "\n", + "n, d = 10000, 5\n", + "X, y = make_classification(n_samples=n, n_features=d)\n", + "X = scaler.fit_transform(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "aece9fbe-f9be-40ae-8179-b44849fb0fd3", + "metadata": {}, + "outputs": [], + "source": [ + "## solve SVM via `plqERM_Ridge`\n", + "from rehline import plqERM_Ridge\n", + "\n", + "clf = plqERM_Ridge(loss={'name': 'svm'}, C=1.0)\n", + "clf.fit(X=X, y=y)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "93719987-c6b3-4a9b-9b40-c35e5bf90ef0", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import seaborn as sns\n", + "import pandas as pd\n", + "import warnings\n", + "import matplotlib.pyplot as plt\n", + "warnings.filterwarnings(\"ignore\", \"is_categorical_dtype\")\n", + "warnings.filterwarnings(\"ignore\", \"use_inf_as_na\")\n", + "\n", + "score = clf.decision_function(X)\n", + "df = pd.DataFrame({'score': score, 'y': y})\n", + "sns.histplot(df, x=\"score\", hue=\"y\")\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst index 1822fee..fcbc63a 100644 --- a/doc/source/tutorials.rst +++ b/doc/source/tutorials.rst @@ -31,6 +31,45 @@ Loss # loss_kwargs: more keys and values for loss parameters loss = {'name': , <**loss_kwargs>} +.. list-table:: + + * - **SVM** + - | ``loss_name``: 'hinge' / 'svm' / 'SVM' + | + | *Example:* ``loss = {'name': 'SVM'}`` + + * - **Quantile Reg** + - | ``loss_name``: 'check' / 'quantile' / 'quantile regression' / 'QR' + | ``qt`` (*float*): qt + | + | *Example:* ``loss = {'name': 'QR', 'qt': 0.25}`` + + * - **Smooth SVM** + - | ``loss_name``: 'sSVM' / 'smooth SVM' / 'smooth hinge' + | + | *Example:* ``loss = {'name': 'sSVM'}`` + + * - **Huber** + - | ``loss_name``: 'huber' / 'Huber' + | + | *Example:* ``loss = {'name': 'huber'}`` + + * - **SVR** + - | ``loss_name``: 'SVR' / 'svr' + | ``epsilon`` (*float*): 0.1 + | + | *Example:* ``loss = {'name': 'svr', 'epsilon': 0.1}`` + +constraint +********** + +.. 
code:: python + + # list of + # name (str): name of the custom loss function + # loss_kwargs: more keys and values for loss parameters + constraint = [{'name': , <**loss_kwargs>}, ...] + .. list-table:: * - **SVM** diff --git a/rehline/__init__.py b/rehline/__init__.py index 5e95c1c..4ab3f85 100644 --- a/rehline/__init__.py +++ b/rehline/__init__.py @@ -1,9 +1,12 @@ # Import from internal C++ module -from ._base import ReHLine_solver, _BaseReHLine -from ._class import ReHLine +from ._base import ReHLine_solver, _BaseReHLine, _make_loss_rehline_param +from ._class import ReHLine, plqERM_Ridge from ._data import make_fair_classification from ._internal import rehline_internal, rehline_result __all__ = ("_BaseReHLine", "ReHLine", + "plqERM_Ridge", + "_make_loss_rehline_param", + "_make_constraint_rehline_param" "make_fair_classification") \ No newline at end of file diff --git a/rehline/_base.py b/rehline/_base.py index 9f7edd5..b4ed573 100644 --- a/rehline/_base.py +++ b/rehline/_base.py @@ -62,7 +62,6 @@ def __init__(self, C=1., self.A = A self.b = b self.L = U.shape[0] - self.n = U.shape[1] self.H = S.shape[0] self.K = A.shape[0] @@ -71,7 +70,6 @@ def auto_shape(self): Automatically generate the shape of the parameters of the ReHLine loss function. """ self.L = self.U.shape[0] - self.n = self.U.shape[1] self.H = self.S.shape[0] self.K = self.A.shape[0] @@ -89,9 +87,9 @@ def call_ReLHLoss(self, score): float ReHLine loss evaluation of the given score. 
""" - - relu_input = np.zeros((self.L, self.n)) - rehu_input = np.zeros((self.H, self.n)) + n = len(score) + relu_input = np.zeros((self.L, n)) + rehu_input = np.zeros((self.H, n)) if self.L > 0: relu_input = (self.U.T * score[:,np.newaxis]).T + self.V if self.H > 0: @@ -181,4 +179,284 @@ def ReHLine_solver(X, U, V, max_iter=1000, tol=1e-4, shrink=1, verbose=1, trace_freq=100): result = rehline_result() rehline_internal(result, X, A, b, U, V, S, T, Tau, max_iter, tol, shrink, verbose, trace_freq) - return result \ No newline at end of file + return result + + +def _make_loss_rehline_param(loss, X, y): + """The `_make_loss_rehline_param` function generates parameters for the ReHLine solver, based on the provided training data. + + The function supports various loss functions, including: + - 'hinge' + - 'svm' or 'SVM' + - 'check' or 'quantile' or 'quantile regression' or 'QR' + - 'sSVM' or 'smooth SVM' or 'smooth hinge' + - 'TV' + - 'huber' or 'Huber' + - 'SVR' or 'svr' + - Custom loss functions (manual setup required) + + Parameters + ---------- + loss : dict + A dictionary containing the loss function parameters. + + Keys: + - 'name' : str, the name of the loss function (e.g. 'hinge', 'svm', 'QR', etc.) + - 'loss_kwargs': more keys and values for loss parameters + + X : ndarray of shape (n_samples, n_features) + The generated samples. + + y : ndarray of shape (n_samples,) + The +/- labels for class membership of each sample. 
+ """ + + n, d = X.shape + + ## initialization of ReHLine params + U=np.empty(shape=(0,0)) + V=np.empty(shape=(0,0)) + Tau=np.empty(shape=(0,0)) + S=np.empty(shape=(0,0)) + T=np.empty(shape=(0,0)) + + # _dummy_X = False + + if (loss['name'] == 'hinge') or (loss['name'] == 'svm')\ + or (loss['name'] == 'SVM'): + U = -y.reshape(1,-1) + V = (np.array(np.ones(n))).reshape(1,-1) + + elif (loss['name'] == 'check') \ + or (loss['name'] == 'quantile') \ + or (loss['name'] == 'quantile regression') \ + or (loss['name'] == 'QR'): + + qt = loss['qt'] + + U = np.ones((2, n)) + V = np.ones((2, n)) + + U[0] = - qt*U[0] + U[1] = (1-qt)*U[1] + V[0] = qt*V[0]*y + V[1] = -(1-qt)*V[1]*y + + # elif (loss['name'] == 'CQR') \ + + # n_qt = len(loss['qt']) + # U = np.ones((2, n*n_qt)) + # V = np.ones((2, n*n_qt)) + # X_fake = np.zeros((n*n_qt, d+n_qt)) + + # for l,qt_tmp in enumerate(loss['qt']): + # U[0,l*n:(l+1)*n] = - (qt_tmp*U[0,l*n:(l+1)*n]) + # U[1,l*n:(l+1)*n] = ((1.-qt_tmp)*U[1,l*n:(l+1)*n]) + + # V[0,l*n:(l+1)*n] = qt_tmp*V[0,l*n:(l+1)*n]*y + # V[1,l*n:(l+1)*n] = - (1.-qt_tmp)*V[1,l*n:(l+1)*n]*y + + # X_fake[l*n:(l+1)*n,:d] = X + # X_fake[l*n:(l+1)*n,d+l] = 1. 
+ + elif (loss['name'] == 'sSVM') \ + or (loss['name'] == 'smooth SVM') \ + or (loss['name'] == 'smooth hinge'): + S = np.ones((1, n)) + T = np.ones((1, n)) + Tau = np.ones((1, n)) + S[0] = - y + + elif loss['name'] == 'TV': + U = np.ones((2, n)) + V = np.ones((2, n)) + U[1] = - U[1] + + V[0] = - X.dot(y) + V[1] = X.dot(y) + + elif (loss['name'] == 'huber') or (loss['name'] == 'Huber'): + S = np.ones((2, n)) + T = np.ones((2, n)) + Tau = loss['tau'] * np.ones((2, n)) + + S[0] = -S[0] + T[0] = y + T[1] = -y + + elif (loss['name'] in ['SVR', 'svr']): + U = np.ones((2, n)) + V = np.ones((2, n)) + U[1] = -U[1] + + V[0] = -(y + loss['epsilon']) + V[1] = (y - loss['epsilon']) + + else: + raise Exception("Sorry, ReHLine currently does not support this loss function, \ + but you can manually set ReHLine params to solve the problem via `ReHLine` class.") + + return U, V, Tau, S, T + +def _make_constraint_rehline_param(constraint, X, y=None): + """The `_make_constraint_rehline_param` function generates constraint parameters for the ReHLine solver. + + Parameters + ---------- + constraint : list of dict + A list of dictionaries, where each dictionary represents a constraint. + Each dictionary must contain a 'name' key, which specifies the type of constraint. + The following constraint types are supported: + * 'nonnegative' or '>=0': A non-negativity constraint. + * 'fair' or 'fairness': A fairness constraint. + * 'custom': A custom constraint, where the user must provide the constraint matrix 'A' and vector 'b'. + + X : array-like of shape (n_samples, n_features) + The design matrix. + + y : array-like of shape (n_samples,), default=None + The target variable. Not used in this function. + + Returns + ------- + A : array-like of shape (n_constraints, n_features) + The constraint matrix. + + b : array-like of shape (n_constraints,) + The constraint vector. 
+ + Notes + ----- + This function iterates over the list of constraints and generates the constraint matrix 'A' and vector 'b' accordingly. + For 'nonnegative' and 'fair' constraints, the function generates the constraint parameters automatically. + For 'custom' constraints, the user must provide the constraint matrix 'A' and vector 'b' explicitly. + """ + + n, d = X.shape + + ## initialization + A=np.empty(shape=(0, 0)) + b=np.empty(shape=(0)) + + for constr_tmp in constraint: + if (constr_tmp['name'] == 'nonnegative') or (constr_tmp['name'] == '>=0'): + A_tmp = np.identity(d) + b_tmp = np.zeros(d) + elif (constr_tmp['name'] == 'fair') or (constr_tmp['name'] == 'fairness'): + X_sen = constr_tmp['X_sen'] + tol_sen = constr_tmp['tol_sen'] + tol_sen = np.array(tol_sen).reshape(-1) + + assert len(X_sen) == len(X), "X and X_sen must have the same length" + X_sen = X_sen.reshape(n,-1) + + assert X_sen.shape[1] == len(tol_sen), "dim of X_sen and len of tol_sen must be equal" + d_sen = X_sen.shape[1] + + A_tmp = np.repeat(X_sen.T @ X, repeats=[2], axis=0) / n + A_tmp[::2] = -A_tmp[::2] + b_tmp = np.repeat(tol_sen, repeats=[2], axis=0) + elif (constr_tmp['name'] == 'custom'): + A_tmp = constr_tmp['A'] + b_tmp = constr_tmp['b'] + else: + raise Exception("Sorry, ReHLine currently does not support this constraint, \ + but you can add it by manually setting A and b via {'name': 'custom', 'A': A, 'b': b}") + + A = np.vstack([A, A_tmp]) if A.size else A_tmp + b = np.hstack([b, b_tmp]) if b.size else b_tmp + + return A, b + +def _make_penalty_rehline_param(self, penalty=None, X=None): + """The `_make_penalty_rehline_param` function generates penalty parameters for the ReHLine solver. + """ + raise Exception("Sorry, `_make_penalty_rehline_param` feature is currently under development.") + + +# def append_l1(self, X, l1_pen=1.0): +# r""" +# This function appends the l1 penalty to the ReHLine problem. The formulation becomes: + +# .. 
math:: + +# \min_{\mathbf{\beta} \in \mathbb{R}^d} \sum_{i=1}^n \sum_{l=1}^L \text{ReLU}( u_{li} \mathbf{x}_i^\intercal \mathbf{\beta} + v_{li}) + \sum_{i=1}^n \sum_{h=1}^H {\text{ReHU}}_{\tau_{hi}}( s_{hi} \mathbf{x}_i^\intercal \mathbf{\beta} + t_{hi}) + \frac{1}{2} \| \mathbf{\beta} \|_2^2 + \lambda_1 \| \mathbf{\beta} \|_1, \\ \text{ s.t. } +# \mathbf{A} \mathbf{\beta} + \mathbf{b} \geq \mathbf{0}, + +# where :math:`\lambda_1` is associated with `l1_pen`. + +# Parameters +# ---------- + +# X : ndarray of shape (n_samples, n_features) +# The generated samples. + +# l1_pen : float, default=1.0 +# The l1 penalty level, which controls the complexity or sparsity of the resulting model. + +# Returns +# ------- + +# X_fake: ndarray of shape (n_samples+n_features, n_features) +# The manipulated data matrix. It has been padded with +# identity matrix, allowing the correctly structured data to be input +# into `self.fit` or other modelling processes. + +# Examples +# -------- + +# >>> import numpy as np +# >>> from rehline import ReHLine + +# >>> # simulate classification dataset +# >>> n, d, C, lam1 = 1000, 3, 0.5, 1.0 +# >>> np.random.seed(1024) +# >>> X = np.random.randn(1000, 3) +# >>> beta0 = np.random.randn(3) +# >>> y = np.sign(X.dot(beta0) + np.random.randn(n)) + +# >>> clf = ReHLine(loss={'name': 'svm'}, C=C) +# >>> clf.make_ReLHLoss(X=X, y=y, loss={'name': 'svm'}) +# >>> # save and fit with the manipulated data matrix +# >>> X_fake = clf.append_l1(X, l1_pen=lam1) +# >>> clf.fit(X=X_fake) +# >>> print('sol privided by rehline: %s' %clf.coef_) +# >>> sol privided by rehline: [ 7.17796629e-01 -1.87075728e-06 2.61965622e+00] #sparse sol +# >>> print(clf.decision_function([[.1,.2,.3]])) +# >>> [0.85767616] +# """ + +# n, d = X.shape +# l1_pen = l1_pen*np.ones(d) +# U_new = np.zeros((self.L+2, n+d)) +# V_new = np.zeros((self.L+2, n+d)) +# ## Block 1 +# if len(self.U): +# U_new[:self.L, :n] = self.U +# V_new[:self.L, :n] = self.V +# ## Block 2 +# U_new[-2,n:] = l1_pen 
+# U_new[-1,n:] = -l1_pen + +# if len(self.S): +# S_new = np.zeros((self.H, n+d)) +# T_new = np.zeros((self.H, n+d)) +# Tau_new = np.zeros((self.H, n+d)) + +# S_new[:,:n] = self.S +# T_new[:,:n] = self.T +# Tau_new[:,:n] = self.Tau + +# self.S = S_new +# self.T = T_new +# self.Tau = Tau_new + +# ## fake X +# X_fake = np.zeros((n+d, d)) +# X_fake[:n,:] = X +# X_fake[n:,:] = np.identity(d) + +# self.U = U_new +# self.V = V_new +# self.auto_shape() +# return X_fake \ No newline at end of file diff --git a/rehline/_class.py b/rehline/_class.py index d7d69e3..ed4c2a8 100644 --- a/rehline/_class.py +++ b/rehline/_class.py @@ -7,9 +7,11 @@ import numpy as np from sklearn.base import BaseEstimator -from sklearn.utils.validation import check_array, check_is_fitted, check_X_y +from sklearn.utils.validation import (_check_sample_weight, check_array, + check_is_fitted, check_X_y) -from ._base import ReHLine_solver, _BaseReHLine +from ._base import (ReHLine_solver, _BaseReHLine, + _make_constraint_rehline_param, _make_loss_rehline_param) class ReHLine(_BaseReHLine, BaseEstimator): @@ -126,8 +128,12 @@ def fit(self, X, sample_weight=None): An instance of the estimator. """ # X = check_array(X) + + if sample_weight is None: - sample_weight = np.ones(X.shape[0]) + sample_weight = self.C + else: + sample_weight = self.C*_check_sample_weight(sample_weight, X, dtype=X.dtype) if self.L > 0: U_weight = self.U * sample_weight @@ -180,18 +186,37 @@ def decision_function(self, X): X = check_array(X) return np.dot(X, self.coef_) - -class plqERM(_BaseReHLine, BaseEstimator): - r"""Empirical Risk Minimization (ERM) with a piecewise linear-quadratic (PLQ) objective. +class plqERM_Ridge(_BaseReHLine, BaseEstimator): + r"""Empirical Risk Minimization (ERM) with a piecewise linear-quadratic (PLQ) objective with a ridge penalty. .. 
math:: - \min_{\mathbf{\beta} \in \mathbb{R}^d} \sum_{i=1}^n \text{PLQ}(y_i, \mathbf{x}_i^T \mathbf{\beta}) + \text{pen}(\mathbf{\beta}) + \frac{1}{2} \| \mathbf{\beta} \|_2^2, \\ \text{ s.t. } + \min_{\mathbf{\beta} \in \mathbb{R}^d} \sum_{i=1}^n \text{PLQ}(y_i, \mathbf{x}_i^T \mathbf{\beta}) + \frac{1}{2} \| \mathbf{\beta} \|_2^2, \ \text{ s.t. } \ \mathbf{A} \mathbf{\beta} + \mathbf{b} \geq \mathbf{0}, - + + The function supports various loss functions, including: + - 'hinge', 'svm' or 'SVM' + - 'check' or 'quantile' or 'quantile regression' or 'QR' + - 'sSVM' or 'smooth SVM' or 'smooth hinge' + - 'TV' + - 'huber' or 'Huber' + - 'SVR' or 'svr' + + The following constraint types are supported: + * 'nonnegative' or '>=0': A non-negativity constraint. + * 'fair' or 'fairness': A fairness constraint. + * 'custom': A custom constraint, where the user must provide the constraint matrix 'A' and vector 'b'. + Parameters ---------- + loss : dict + A dictionary specifying the loss function parameters. + + constraint : list of dict + A list of dictionaries, where each dictionary represents a constraint. + Each dictionary must contain a 'name' key, which specifies the type of constraint. + C : float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. @@ -220,254 +245,54 @@ class plqERM(_BaseReHLine, BaseEstimator): Attributes ---------- + coef_ : array-like + The optimized model coefficients. - coef_ : array of shape (n_features,) - Weights assigned to the features (coefficients in the primal - problem). + n_iter_ : int + The number of iterations performed by the ReHLine solver. - n_iter_: int - Maximum number of iterations run across all classes. + opt_result_ : object + The optimization result object. - """ + dual_obj_ : array-like + The dual objective function values. + + primal_obj_ : array-like + The primal objective function values. 
+ Methods + ------- + fit(X, y, sample_weight=None) + Fit the model based on the given training data. - def __init__(self, loss, - constraint = None, - penalty = None, + decision_function(X) + The decision function evaluated on the given dataset. + Notes + ----- + The `plqERM_Ridge` class is a subclass of `_BaseReHLine` and `BaseEstimator`, which suggests that it is part of a larger framework for implementing ReHLine algorithms. + + """ + + def __init__(self, loss, + constraint=[], C=1., + U=np.empty(shape=(0,0)), V=np.empty(shape=(0,0)), + Tau=np.empty(shape=(0,0)), + S=np.empty(shape=(0,0)), T=np.empty(shape=(0,0)), + A=np.empty(shape=(0,0)), b=np.empty(shape=(0)), max_iter=1000, tol=1e-4, shrink=1, verbose=0, trace_freq=100): + _BaseReHLine.__init__(self, C, U, V, Tau, S, T, A, b) self.loss = loss self.constraint = constraint - self.penalty = penalty self.max_iter = max_iter self.tol = tol self.shrink = shrink self.verbose = verbose self.trace_freq = trace_freq + self.dummy_n = 0 - def _make_loss_rehline_param(self, X, y): - """The `_make_loss_rehline_param` function generates parameters for the ReHLine solver, based on the provided training data. - - The function supports plq loss functions - like 'hinge', 'svm', 'SVM', 'check', 'quantile', 'quantile regression', - 'QR', 'sSVM', 'smooth SVM', 'smooth hinge', 'TV', 'huber', and 'custom'. - - Parameters - ---------- - - X : ndarray of shape (n_samples, n_features) - The generated samples. - - y : ndarray of shape (n_samples,) - The +/- labels for class membership of each sample. 
- """ - - n, d = X.shape - - if (self.loss['name'] == 'hinge') or (self.loss['name'] == 'svm')\ - or (self.loss['name'] == 'SVM'): - self.U = -(self.C*y).reshape(1,-1) - self.V = (self.C*np.array(np.ones(n))).reshape(1,-1) - return X - - elif (self.loss['name'] == 'check') \ - or (self.loss['name'] == 'quantile') \ - or (self.loss['name'] == 'quantile regression') \ - or (self.loss['name'] == 'QR'): - - n_qt = len(loss['qt']) - self.U = np.ones((2, n*n_qt)) - self.V = np.ones((2, n*n_qt)) - X_fake = np.zeros((n*n_qt, d+n_qt)) - - for l,qt_tmp in enumerate(loss['qt']): - self.U[0,l*n:(l+1)*n] = - (self.C*qt_tmp*self.U[0,l*n:(l+1)*n]) - self.U[1,l*n:(l+1)*n] = (self.C*(1.-qt_tmp)*self.U[1,l*n:(l+1)*n]) - - self.V[0,l*n:(l+1)*n] = self.C*qt_tmp*self.V[0,l*n:(l+1)*n]*y - self.V[1,l*n:(l+1)*n] = - self.C*(1.-qt_tmp)*self.V[1,l*n:(l+1)*n]*y - - X_fake[l*n:(l+1)*n,:d] = X - X_fake[l*n:(l+1)*n,d+l] = 1. - - self.auto_shape() - return X_fake - - elif (self.loss['name'] == 'sSVM') \ - or (self.loss['name'] == 'smooth SVM') \ - or (self.loss['name'] == 'smooth hinge'): - self.S = np.ones((1, n)) - self.T = np.ones((1, n)) - self.Tau = np.ones((1, n)) - - self.S[0] = - np.sqrt(self.C)*y - self.T[0] = np.sqrt(self.C) - self.Tau[0] = np.sqrt(self.C) - return X - - elif self.loss['name'] == 'TV': - self.U = np.ones((2, n))*self.C - self.V = np.ones((2, n))*self.C - self.U[1] = -self.U[1] - - self.V[0] = - X.dot(y)*self.C - self.V[1] = X.dot(y)*self.C - return X - - elif (self.loss['name'] == 'huber') or (self.loss['name'] == 'Huber'): - self.S = np.ones((2, n)) - self.T = np.ones((2, n)) - self.Tau = np.sqrt(self.C) * loss['tau'] * np.ones((2, n)) - - self.S[0] = - np.sqrt(self.C) - self.S[1] = np.sqrt(self.C) - self.T[0] = np.sqrt(self.C)*y - self.T[1] = -np.sqrt(self.C)*y - return X - elif (self.loss['name'] in ['SVR', 'svr']): - self.U = np.ones((2, n))*self.C - self.V = np.ones((2, n)) - self.U[1] = -self.U[1] - - self.V[0] = -self.C*(y + self.loss['epsilon']) - self.V[1] = 
self.C*(y - self.loss['epsilon']) - return X - elif (self.loss['name'] == 'custom'): - pass - else: - raise Exception("Sorry, plqERM currently does not support this loss function, \ - but you can manually set ReHLine params to solve the problem via ReHLine class.") - self.auto_shape() - - def _make_constraint_rehline_param(self): - """The `_make_constraint_rehline_param` function generates constraint parameters for the ReHLine solver. - """ - if (self.constraint['name'] == 'nonnegative') or (self.constraint['name'] == '>=0'): - A = np.repeat([X_sen @ X], repeats=[2], axis=0) / n - - - def append_l1(self, X, l1_pen=1.0): - r""" - This function appends the l1 penalty to the ReHLine problem. The formulation becomes: - - .. math:: - - \min_{\mathbf{\beta} \in \mathbb{R}^d} \sum_{i=1}^n \sum_{l=1}^L \text{ReLU}( u_{li} \mathbf{x}_i^\intercal \mathbf{\beta} + v_{li}) + \sum_{i=1}^n \sum_{h=1}^H {\text{ReHU}}_{\tau_{hi}}( s_{hi} \mathbf{x}_i^\intercal \mathbf{\beta} + t_{hi}) + \frac{1}{2} \| \mathbf{\beta} \|_2^2 + \lambda_1 \| \mathbf{\beta} \|_1, \\ \text{ s.t. } - \mathbf{A} \mathbf{\beta} + \mathbf{b} \geq \mathbf{0}, - - where :math:`\lambda_1` is associated with `l1_pen`. - - Parameters - ---------- - - X : ndarray of shape (n_samples, n_features) - The generated samples. - - l1_pen : float, default=1.0 - The l1 penalty level, which controls the complexity or sparsity of the resulting model. - - Returns - ------- - - X_fake: ndarray of shape (n_samples+n_features, n_features) - The manipulated data matrix. It has been padded with - identity matrix, allowing the correctly structured data to be input - into `self.fit` or other modelling processes. 
- - Examples - -------- - - >>> import numpy as np - >>> from rehline import ReHLine - - >>> # simulate classification dataset - >>> n, d, C, lam1 = 1000, 3, 0.5, 1.0 - >>> np.random.seed(1024) - >>> X = np.random.randn(1000, 3) - >>> beta0 = np.random.randn(3) - >>> y = np.sign(X.dot(beta0) + np.random.randn(n)) - - >>> clf = ReHLine(loss={'name': 'svm'}, C=C) - >>> clf.make_ReLHLoss(X=X, y=y, loss={'name': 'svm'}) - >>> # save and fit with the manipulated data matrix - >>> X_fake = clf.append_l1(X, l1_pen=lam1) - >>> clf.fit(X=X_fake) - >>> print('sol privided by rehline: %s' %clf.coef_) - >>> sol privided by rehline: [ 7.17796629e-01 -1.87075728e-06 2.61965622e+00] #sparse sol - >>> print(clf.decision_function([[.1,.2,.3]])) - >>> [0.85767616] - """ - - n, d = X.shape - l1_pen = l1_pen*np.ones(d) - U_new = np.zeros((self.L+2, n+d)) - V_new = np.zeros((self.L+2, n+d)) - ## Block 1 - if len(self.U): - U_new[:self.L, :n] = self.U - V_new[:self.L, :n] = self.V - ## Block 2 - U_new[-2,n:] = l1_pen - U_new[-1,n:] = -l1_pen - - if len(self.S): - S_new = np.zeros((self.H, n+d)) - T_new = np.zeros((self.H, n+d)) - Tau_new = np.zeros((self.H, n+d)) - - S_new[:,:n] = self.S - T_new[:,:n] = self.T - Tau_new[:,:n] = self.Tau - - self.S = S_new - self.T = T_new - self.Tau = Tau_new - - ## fake X - X_fake = np.zeros((n+d, d)) - X_fake[:n,:] = X - X_fake[n:,:] = np.identity(d) - - self.U = U_new - self.V = V_new - self.auto_shape() - return X_fake - - def auto_shape(self): - """ - Automatically generate the shape of the parameters of the ReHLine loss function. - """ - self.L = self.U.shape[0] - self.n = self.U.shape[1] - self.H = self.S.shape[0] - self.K = self.A.shape[0] - - def call_ReLHLoss(self, score): - """ - Return the value of the ReHLine loss of the `score`. - - Parameters - ---------- - score : ndarray of shape (n_samples, ) - The input score that will be evaluated through the ReHLine loss. - - Returns - ------- - float - ReHLine loss evaluation of the given score. 
- """ - - relu_input = np.zeros((self.L, self.n)) - rehu_input = np.zeros((self.H, self.n)) - if self.L > 0: - relu_input = (self.U.T * score[:,np.newaxis]).T + self.V - if self.H > 0: - rehu_input = (self.S.T * score[:,np.newaxis]).T + self.T - return np.sum(relu(relu_input), 0) + np.sum(rehu(rehu_input), 0) - - - def fit(self, X, sample_weight=None): + def fit(self, X, y, sample_weight=None): """Fit the model based on the given training data. Parameters @@ -477,6 +302,9 @@ def fit(self, X, sample_weight=None): Training vector, where `n_samples` is the number of samples and `n_features` is the number of features. + y : array-like of shape (n_samples,) + The target variable. + sample_weight : array-like of shape (n_samples,), default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. @@ -485,11 +313,23 @@ def fit(self, X, sample_weight=None): ------- self : object An instance of the estimator. + + """ + n, d = X.shape + + ## loss -> rehline params + self.U, self.V, self.Tau, self.S, self.T = _make_loss_rehline_param(loss=self.loss, X=X, y=y) + + ## constrain -> rehline params + self.A, self.b = _make_constraint_rehline_param(constraint=self.constraint, X=X, y=y) + self.auto_shape() - # X = check_array(X) + ## sample weight -> rehline params if sample_weight is None: - sample_weight = np.ones(X.shape[0]) + sample_weight = self.C + else: + sample_weight = self.C*_check_sample_weight(sample_weight, X, dtype=X.dtype) if self.L > 0: U_weight = self.U * sample_weight