diff --git a/.ipynb_checkpoints/env-checkpoint.yaml b/.ipynb_checkpoints/env-checkpoint.yaml
new file mode 100644
index 0000000..133e4f0
--- /dev/null
+++ b/.ipynb_checkpoints/env-checkpoint.yaml
@@ -0,0 +1,130 @@
+name: equity_toolkit
+channels:
+ - defaults
+dependencies:
+ - bzip2=1.0.8=he774522_0
+ - ca-certificates=2023.01.10=haa95532_0
+ - certifi=2022.12.7=py311haa95532_0
+ - libffi=3.4.2=hd77b12b_6
+ - openssl=1.1.1t=h2bbff1b_0
+ - pip=23.0.1=py311haa95532_0
+ - python=3.11.0=h966fe2a_3
+ - setuptools=65.6.3=py311haa95532_0
+ - sqlite=3.41.1=h2bbff1b_0
+ - tk=8.6.12=h2bbff1b_0
+ - vc=14.2=h21ff451_1
+ - vs2015_runtime=14.27.29016=h5e58377_2
+ - wheel=0.38.4=py311haa95532_0
+ - wincertstore=0.2=py311haa95532_0
+ - xz=5.2.10=h8cc25b3_1
+ - zlib=1.2.13=h8cc25b3_0
+ - pip:
+ - aiofiles==22.1.0
+ - aiosqlite==0.18.0
+ - anyio==3.6.2
+ - argon2-cffi==21.3.0
+ - argon2-cffi-bindings==21.2.0
+ - arrow==1.2.3
+ - asttokens==2.2.1
+ - attrs==22.2.0
+ - babel==2.12.1
+ - backcall==0.2.0
+ - beautifulsoup4==4.12.2
+ - bleach==6.0.0
+ - cffi==1.15.1
+ - charset-normalizer==3.1.0
+ - click==8.1.3
+ - colorama==0.4.6
+ - comm==0.1.3
+ - dash==2.9.2
+ - dash-bootstrap-components==1.6.0
+ - dash-core-components==2.0.0
+ - dash-html-components==2.0.0
+ - dash-table==5.0.0
+ - debugpy==1.6.7
+ - decorator==5.1.1
+ - defusedxml==0.7.1
+ - executing==1.2.0
+ - fastjsonschema==2.16.3
+ - flask==2.2.3
+ - fqdn==1.5.1
+ - idna==3.4
+ - ipykernel==6.22.0
+ - ipython==8.12.0
+ - ipython-genutils==0.2.0
+ - isoduration==20.11.0
+ - itsdangerous==2.1.2
+ - jedi==0.18.2
+ - jinja2==3.1.2
+ - json5==0.9.11
+ - jsonpointer==2.3
+ - jsonschema==4.17.3
+ - jupyter-client==8.1.0
+ - jupyter-core==5.3.0
+ - jupyter-events==0.6.3
+ - jupyter-server==2.5.0
+ - jupyter-server-fileid==0.8.0
+ - jupyter-server-terminals==0.4.4
+ - jupyter-server-ydoc==0.8.0
+ - jupyter-ydoc==0.2.3
+ - jupyterlab==3.6.3
+ - jupyterlab-pygments==0.2.2
+ - jupyterlab-server==2.22.0
+ - markupsafe==2.1.2
+ - mistune==2.0.5
+ - mypy==1.11.2
+ - mypy-extensions==1.0.0
+ - nbclassic==0.5.5
+ - nbclient==0.7.3
+ - nbconvert==7.3.0
+ - nbformat==5.8.0
+ - nest-asyncio==1.5.6
+ - notebook==6.5.4
+ - notebook-shim==0.2.2
+ - numpy==1.24.2
+ - packaging==23.0
+ - pandas==2.0.0
+ - pandocfilters==1.5.0
+ - parso==0.8.3
+ - pickleshare==0.7.5
+ - platformdirs==3.2.0
+ - plotly==5.14.1
+ - prometheus-client==0.16.0
+ - prompt-toolkit==3.0.38
+ - psutil==5.9.4
+ - pure-eval==0.2.2
+ - pycparser==2.21
+ - pygments==2.14.0
+ - pyrsistent==0.19.3
+ - python-dateutil==2.8.2
+ - python-json-logger==2.0.7
+ - pytz==2023.3
+ - pywin32==306
+ - pywinpty==2.0.10
+ - pyyaml==6.0
+ - pyzmq==25.0.2
+ - requests==2.28.2
+ - rfc3339-validator==0.1.4
+ - rfc3986-validator==0.1.1
+ - scipy==1.10.1
+ - send2trash==1.8.0
+ - six==1.16.0
+ - sniffio==1.3.0
+ - soupsieve==2.4
+ - stack-data==0.6.2
+ - tenacity==8.2.2
+ - terminado==0.17.1
+ - tinycss2==1.2.1
+ - tornado==6.2
+ - traitlets==5.9.0
+ - typing-extensions==4.12.2
+ - tzdata==2023.3
+ - uri-template==1.2.0
+ - urllib3==1.26.15
+ - wcwidth==0.2.6
+ - webcolors==1.13
+ - webencodings==0.5.1
+ - websocket-client==1.5.1
+ - werkzeug==2.2.3
+ - y-py==0.5.9
+ - ypy-websocket==0.8.2
\ No newline at end of file
diff --git a/.ipynb_checkpoints/model-checkpoint.py b/.ipynb_checkpoints/model-checkpoint.py
index 2302456..3956f79 100644
--- a/.ipynb_checkpoints/model-checkpoint.py
+++ b/.ipynb_checkpoints/model-checkpoint.py
@@ -4,11 +4,15 @@
from typing import List
from typing import Tuple
+import os
import yaml
import src.model_classes as mc
-with open('config.yaml') as f:
+package_dir = os.path.dirname(os.path.abspath(__file__))
+config_fp = os.path.join(package_dir, "config.yaml")
+
+with open(config_fp) as f:
config = yaml.safe_load(f)
class Model:
diff --git a/.ipynb_checkpoints/toolkit-checkpoint.ipynb b/.ipynb_checkpoints/toolkit-checkpoint.ipynb
deleted file mode 100644
index b307bda..0000000
--- a/.ipynb_checkpoints/toolkit-checkpoint.ipynb
+++ /dev/null
@@ -1,1181 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "ab695a7b-3043-4154-a59b-01e57feaf8f0",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "ename": "IndentationError",
- "evalue": "unexpected indent (3060257492.py, line 620)",
- "output_type": "error",
- "traceback": [
- "\u001b[1;36m Cell \u001b[1;32mIn[14], line 620\u001b[1;36m\u001b[0m\n\u001b[1;33m self.A = A\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mIndentationError\u001b[0m\u001b[1;31m:\u001b[0m unexpected indent\n"
- ]
- }
- ],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "from scipy.stats import chi2_contingency\n",
- "import yaml\n",
- "\n",
- "from pandas import DataFrame\n",
- "from typing import Dict\n",
- "from typing import List\n",
- "from typing import Any\n",
- "from typing import Tuple\n",
- "\n",
- "with open('config.yaml') as f:\n",
- " config = yaml.safe_load(f)\n",
- "\n",
- "class Ingest:\n",
- " \n",
- " \"\"\"\n",
- " Class to ingest dataframe input.\n",
- " \"\"\"\n",
- " \n",
- " def __init__(\n",
- " self,\n",
- " config: Dict[Any, Any]\n",
- " ) -> None:\n",
- " \n",
- " \"\"\"\n",
- " Inits class with the config file\n",
- " and unpacks the config file.\n",
- " \"\"\"\n",
- " \n",
- " self.config = config\n",
- " \n",
- " self.unpack_config()\n",
- "\n",
- " \n",
- " def run(\n",
- " self\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Run function for the class.\n",
- " \n",
- " :param None:\n",
- " :return df:\n",
- " DataFrame, ingested df\n",
- " \"\"\"\n",
- " \n",
- " df = self.run_load()\n",
- " \n",
- " df = self.run_harmonize(df)\n",
- " \n",
- " return df\n",
- " \n",
- " \n",
- " def unpack_config(\n",
- " self\n",
- " ) -> None:\n",
- " \n",
- " \"\"\"\n",
- " Function to unpack config vars.\n",
- " \n",
- " :var filepath:\n",
- " str, the relative filepath \n",
- " :var group_variable:\n",
- " str, the column name for the \n",
- " group variable of interest e.g.\n",
- " gender, which contains the target \n",
- " class and non-target class e.g.\n",
- " females and males.\n",
- " :var group_target_val:\n",
- " str, within the group_variable column,\n",
- " contains the contains the target \n",
- " class value e.g.\n",
- " females.\n",
- " :var group_other_val:\n",
- " str, within the group_variable column,\n",
- " contains the contains the non-target \n",
- " class value e.g. males.\n",
- " :var outcome_variable:\n",
- " str, the column name for the \n",
- " outcome variable of interest e.g.\n",
- " hired, which contains the target \n",
- " class and non-target class e.g.\n",
- " hired and not-hired.\n",
- " :var outcome_target_val:\n",
- " str, within the outcome_variable column,\n",
- " contains the contains the target \n",
- " class value e.g.\n",
- " hired.\n",
- " :var outcome_other_val:\n",
- " str, within the outcome_variable column,\n",
- " contains the contains the non-target \n",
- " class value e.g. not-hired.\n",
- " :var grpers:\n",
- " Dict[str,str], can be any set of filterable\n",
- " columns to slice into particular groups within\n",
- " the broader employee roster. The key is the column,\n",
- " the value is the desired class within the column\n",
- " e.g. job_title: analyst.\n",
- " \"\"\"\n",
- " \n",
- " config = self.config\n",
- " \n",
- " try:\n",
- " self.filepath: str = config[\"Ingest\"][\"filepath\"]\n",
- " self.group_variable: str = config[\"Ingest\"][\"group_variable\"]\n",
- " self.group_target_val: str = config[\"Ingest\"][\"group_target_val\"]\n",
- " self.group_other_val: str = config[\"Ingest\"][\"group_other_val\"]\n",
- " self.outcome_variable: str = config[\"Ingest\"][\"outcome_variable\"]\n",
- " self.outcome_target_val: str = config[\"Ingest\"][\"outcome_target_val\"]\n",
- " self.outcome_other_val: str = config[\"Ingest\"][\"outcome_other_val\"]\n",
- " self.grpers: Dict[str, str] = config[\"Ingest\"][\"grpers\"]\n",
- "\n",
- " # Type validation\n",
- " if not isinstance(self.filepath, str):\n",
- " raise TypeError(\"Expected 'filepath' to be of type 'str'.\")\n",
- " if not isinstance(self.group_variable, str):\n",
- " raise TypeError(\"Expected 'group_variable' to be of type 'str'.\")\n",
- " if not isinstance(self.group_target_val, str):\n",
- " raise TypeError(\"Expected 'group_target_val' to be of type 'str'.\")\n",
- " if not isinstance(self.group_other_val, str):\n",
- " raise TypeError(\"Expected 'group_other_val' to be of type 'str'.\")\n",
- " if not isinstance(self.outcome_variable, str):\n",
- " raise TypeError(\"Expected 'outcome_variable' to be of type 'str'.\")\n",
- " if not isinstance(self.outcome_target_val, str):\n",
- " raise TypeError(\"Expected 'outcome_target_val' to be of type 'str'.\")\n",
- " if not isinstance(self.outcome_other_val, str):\n",
- " raise TypeError(\"Expected 'outcome_other_val' to be of type 'str'.\")\n",
- " if not isinstance(self.grpers, dict):\n",
- " raise TypeError(\"Expected 'grpers' to be of type 'dict'.\")\n",
- "\n",
- " except KeyError as e:\n",
- " raise KeyError(f\"Missing key '{e.args[0]}' in the config file. \"\n",
- " \"Please ensure the config file contains all required keys under the 'Ingest' section: \"\n",
- " \"'filepath', 'group_variable', 'group_target_val', 'group_other_val', \"\n",
- " \"'outcome_variable', 'outcome_target_val', 'outcome_other_val', and 'grpers'.\")\n",
- "\n",
- " except TypeError as e:\n",
- " raise TypeError(f\"Config file error: {e}\")\n",
- " \n",
- " def run_load(\n",
- " self\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Loads csv file. Assumes headers are row 0.\n",
- " \n",
- " :param None:\n",
- " :return DataFrame:\n",
- " \"\"\"\n",
- " \n",
- " filepath = self.filepath\n",
- " \n",
- " try:\n",
- " return pd.read_csv(filepath, skiprows=0)\n",
- "\n",
- " except FileNotFoundError:\n",
- " raise FileNotFoundError(\n",
- " f\"The file at {filepath} was not found. Please check the file path.\"\n",
- " )\n",
- "\n",
- " except pd.errors.EmptyDataError:\n",
- " raise ValueError(\n",
- " f\"The file at {filepath} is empty and cannot be loaded.\"\n",
- " )\n",
- "\n",
- " except pd.errors.ParserError:\n",
- " raise ValueError(\n",
- " f\"The file at {filepath} contains malformed data and could not be parsed as a valid CSV.\"\n",
- " )\n",
- "\n",
- " except PermissionError:\n",
- " raise PermissionError(\n",
- " f\"Permission denied when attempting to read the file at {filepath}.\"\n",
- " f\"Please check the file permissions.\"\n",
- " )\n",
- "\n",
- " except Exception as e:\n",
- " raise Exception(\n",
- " f\"An unexpected error occurred while loading the file: {str(e)}\"\n",
- " )\n",
- " \n",
- " def run_harmonize(\n",
- " self,\n",
- " df: DataFrame\n",
- " ) -> DataFrame: \n",
- "\n",
- " \"\"\"\n",
- " Function to harmonize the dataset.\n",
- " \n",
- " :param df: \n",
- " DataFrame, loaded df\n",
- " :return df:\n",
- " DataFrame, filtered down to target and other group and\n",
- " harmonize the fields\n",
- " \"\"\"\n",
- " \n",
- " group_variable = self.group_variable\n",
- " group_target_val = self.group_target_val\n",
- " group_other_val = self.group_other_val\n",
- " outcome_variable = self.outcome_variable\n",
- " outcome_target_val = self.outcome_target_val\n",
- " outcome_other_val = self.outcome_other_val\n",
- " grpers = self.grpers\n",
- "\n",
- " df = self._apply_filters(\n",
- " df=df,\n",
- " group_variable=group_variable,\n",
- " group_target_val=group_target_val,\n",
- " group_other_val=group_other_val,\n",
- " grpers=grpers\n",
- " )\n",
- " \n",
- " df = self._apply_harmonize(\n",
- " df=df,\n",
- " group_variable=group_variable,\n",
- " group_target_val=group_target_val,\n",
- " group_other_val=group_other_val,\n",
- " outcome_variable=outcome_variable,\n",
- " outcome_target_val=outcome_target_val,\n",
- " outcome_other_val=outcome_other_val\n",
- " )\n",
- " \n",
- " return df\n",
- " \n",
- " def _apply_filters(\n",
- " self,\n",
- " df: DataFrame,\n",
- " group_variable: str,\n",
- " group_target_val: str,\n",
- " group_other_val: str,\n",
- " grpers: Dict[str,str],\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Method to apply filters\n",
- " \n",
- " :param df:\n",
- " DataFrame, target df\n",
- " :param group_variable:\n",
- " str, column name of the\n",
- " target variable\n",
- " :param group_target_val:\n",
- " str, class target value of the group_variable\n",
- " aka the protected class value\n",
- " :param group_other_val:\n",
- " str, class nontarget value of the group_variable\n",
- " aka the nonprotected class value\n",
- " :return df:\n",
- " DataFrame, filtered df\n",
- " \"\"\"\n",
- " \n",
- " df = df.loc[\n",
- " df[group_variable].isin(\n",
- " [\n",
- " group_target_val, \n",
- " group_other_val\n",
- " ]\n",
- " )\n",
- " ]\n",
- " \n",
- " for k, v in grpers.items():\n",
- " \n",
- " df = df.loc[\n",
- " df[k].isin([v])\n",
- " ] \n",
- " \n",
- " return df\n",
- " \n",
- " def _apply_harmonize(\n",
- " self,\n",
- " df: DataFrame,\n",
- " group_variable: str,\n",
- " group_target_val: str,\n",
- " group_other_val: str,\n",
- " outcome_variable: str,\n",
- " outcome_target_val: str,\n",
- " outcome_other_val: str\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Method to harmonize targets.\n",
- " \n",
- " :param df:\n",
- " DataFrame, target df\n",
- " :param group_variable:\n",
- " str, column name of the\n",
- " target variable\n",
- " :param group_target_val:\n",
- " str, class target value of the group_variable\n",
- " aka the protected class value\n",
- " :param group_other_val:\n",
- " str, class nontarget value of the group_variable\n",
- " aka the nonprotected class value \n",
- " :param outcome_variable:\n",
- " str, the column name of the outcome \n",
- " :param outcome_target_val:\n",
- " str, class target value of the outcome_variable\n",
- " aka success\n",
- " :param outcome_other_val:\n",
- " str, class nontarget value of the outcome_variable\n",
- " :return df:\n",
- " DataFrame, target df\n",
- " \"\"\"\n",
- " \n",
- " # harmonize the group target\n",
- " df['group_var_clean'] = np.where(\n",
- " df[group_variable]==group_target_val, \n",
- " 1,\n",
- " np.where(\n",
- " df[group_variable]==group_other_val, \n",
- " 0, \n",
- " -1\n",
- " )\n",
- " )\n",
- " \n",
- " # harmonize the outcome target\n",
- " df['outcome_var_clean'] = np.where(\n",
- " df[outcome_variable]==outcome_target_val, \n",
- " 1, \n",
- " np.where(\n",
- " df[self.outcome_variable]==outcome_other_val,\n",
- " 0, \n",
- " -1\n",
- " )\n",
- " ) \n",
- " \n",
- " return df\n",
- " \n",
- "class Transform:\n",
- " \n",
- " \"\"\"\n",
- " Class to transform dataframe inputs into \n",
- " 2x2 contingency table.\n",
- " \"\"\"\n",
- " \n",
- " def __init__(\n",
- " self, \n",
- " df: DataFrame\n",
- " ) -> None:\n",
- " \n",
- " \"\"\"\n",
- " :param df:\n",
- " DataFrame, input df\n",
- " \"\"\"\n",
- " \n",
- " self.df = df\n",
- " \n",
- " def run_build_cont_table(\n",
- " self\n",
- " ) -> List[int]:\n",
- " \n",
- " \"\"\"\n",
- " Function to generate contingency table format.\n",
- " \n",
- " Places the target group val in the top row and the\n",
- " target group other to the bottom row.\n",
- " \n",
- " Places no-success outcome on the first column and success\n",
- " on the second column.\n",
- " \n",
- " :return tbl:\n",
- " List[int], filtered down to target and other group.\n",
- " \"\"\"\n",
- " \n",
- " df = self.df\n",
- " \n",
- " cols = [\n",
- " 'group_var_clean', \n",
- " 'outcome_var_clean'\n",
- " ]\n",
- " \n",
- " df = df[cols]\n",
- " \n",
- " tbl = (\n",
- " df.pivot_table(\n",
- " index='group_var_clean',\n",
- " columns='outcome_var_clean', \n",
- " aggfunc=len\n",
- " ).\n",
- " sort_index(\n",
- " axis=1, \n",
- " ascending=True\n",
- " ).\n",
- " sort_index(ascending=False). # ensure always [1,0]\n",
- " values.tolist()\n",
- " ) \n",
- " \n",
- " return tbl\n",
- " \n",
- "class StatsTesting2x2Cont:\n",
- " \n",
- " \"\"\"\n",
- " Class to perform 2x2 Contigency Table analysis\n",
- " with Chi2 and Phi Correlation Coefficent Testing.\n",
- "\n",
- " Provides context into potential association between\n",
- " variables and the strength of the association.\n",
- " \"\"\"\n",
- " \n",
- " def __init__(\n",
- " self,\n",
- " config: Dict[Any, Any],\n",
- " tbl: List[int],\n",
- " df: DataFrame\n",
- " ) -> None:\n",
- " \n",
- " \"\"\"\n",
- " Inits the class variables and unpacks the\n",
- " config variables.\n",
- " \n",
- " :param config:\n",
- " Dict[str,Any], loaded config file.\n",
- " :param tbl:\n",
- " List[int], 2x2 cont table.\n",
- " :param df:\n",
- " DataFrame, original input DataFrame.\n",
- " \"\"\"\n",
- " \n",
- " self.config = config\n",
- " self.tbl = tbl\n",
- " self.df = df\n",
- "\n",
- " self.unpack_config()\n",
- "\n",
- " def run_testing(\n",
- " self\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Run function for the class.\n",
- " \n",
- " Runs hypothesis evaluation and builds\n",
- " the output report DataFrame.\n",
- " \n",
- " :param None:\n",
- " :return df_results:\n",
- " DataFrame, with testing results.\n",
- " \"\"\"\n",
- " \n",
- " alpha = self.alpha\n",
- " tbl = self.tbl\n",
- " process = self.process\n",
- " group_variable = self.group_variable\n",
- " group_target_val = self.group_target_val\n",
- " group_other_val = self.group_other_val\n",
- " bin_edges = self.bin_edges\n",
- " bin_labels = self.bin_labels\n",
- " \n",
- " res = self.gen_hypothesis_eval(tbl)\n",
- "\n",
- " df_results = self.run_report_bld(\n",
- " alpha=alpha,\n",
- " res=res,\n",
- " tbl=tbl,\n",
- " process=process,\n",
- " group_variable=group_variable,\n",
- " group_target_val=group_target_val,\n",
- " group_other_val=group_other_val,\n",
- " bin_edges=bin_edges,\n",
- " bin_labels=bin_labels\n",
- " )\n",
- " \n",
- " return df_results\n",
- " \n",
- " def unpack_config(\n",
- " self\n",
- " ) -> None:\n",
- " \n",
- " \"\"\"\n",
- " Function to unpack config variables.\n",
- " \n",
- " :param None:\n",
- " :return None:\n",
- " \"\"\"\n",
- " \n",
- " config = self.config\n",
- "\n",
- " try:\n",
- " self.alpha: float = config[\"StatsTesting2x2Cont\"][\"alpha\"]\n",
- " self.group_variable: str = config[\"Ingest\"][\"group_variable\"]\n",
- " self.group_target_val: str = config[\"Ingest\"][\"group_target_val\"]\n",
- " self.group_other_val: str = config[\"Ingest\"][\"group_other_val\"]\n",
- " self.outcome_variable: str = config[\"Ingest\"][\"outcome_variable\"]\n",
- " self.outcome_target_val: str = config[\"Ingest\"][\"outcome_target_val\"]\n",
- " self.outcome_other_val: str = config[\"Ingest\"][\"outcome_other_val\"]\n",
- " self.grpers: Dict[str, str] = config[\"Ingest\"][\"grpers\"]\n",
- " self.testing: str = config[\"StatsTesting2x2Cont\"][\"testing\"]\n",
- " self.process: str = config[\"StatsTesting2x2Cont\"][\"process\"]\n",
- " self.bin_edges: List[float] = config[\"StatsTesting2x2Cont\"][\"phi_bin_edges\"]\n",
- " self.bin_labels: List[str] = config[\"StatsTesting2x2Cont\"][\"phi_bin_labels\"]\n",
- "\n",
- " if not isinstance(self.alpha, float):\n",
- " raise TypeError(\"Expected 'alpha' to be of type 'float'.\")\n",
- " if not isinstance(self.group_variable, str):\n",
- " raise TypeError(\"Expected 'group_variable' to be of type 'str'.\")\n",
- " if not isinstance(self.group_target_val, str):\n",
- " raise TypeError(\"Expected 'group_target_val' to be of type 'str'.\")\n",
- " if not isinstance(self.group_other_val, str):\n",
- " raise TypeError(\"Expected 'group_other_val' to be of type 'str'.\")\n",
- " if not isinstance(self.outcome_variable, str):\n",
- " raise TypeError(\"Expected 'outcome_variable' to be of type 'str'.\")\n",
- " if not isinstance(self.outcome_target_val, str):\n",
- " raise TypeError(\"Expected 'outcome_target_val' to be of type 'str'.\")\n",
- " if not isinstance(self.outcome_other_val, str):\n",
- " raise TypeError(\"Expected 'outcome_other_val' to be of type 'str'.\")\n",
- " if not isinstance(self.grpers, dict):\n",
- " raise TypeError(\"Expected 'grpers' to be of type 'dict'.\")\n",
- " if not isinstance(self.testing, str):\n",
- " raise TypeError(\"Expected 'testing' to be of type 'str'.\")\n",
- " if not isinstance(self.process, str):\n",
- " raise TypeError(\"Expected 'process' to be of type 'str'.\")\n",
- " if not isinstance(\n",
- " self.bin_edges, list\n",
- " ) or not all(\n",
- " isinstance(\n",
- " i, (int, float)\n",
- " ) for i in self.bin_edges\n",
- " ):\n",
- " raise TypeError(\"Expected 'bin_edges' to be a list of floats.\")\n",
- " if not isinstance(\n",
- " self.bin_labels, list\n",
- " ) or not all(\n",
- " isinstance(i, str) for i in self.bin_labels\n",
- " ):\n",
- " raise TypeError(\"Expected 'bin_labels' to be a list of strings.\")\n",
- " \n",
- " except KeyError as e:\n",
- " raise KeyError(\n",
- " f\"Missing key '{e.args[0]}' in the config file. \"\n",
- " f\"Ensure all required keys are present in the 'Ingest' and 'StatsTesting2x2Cont' sections.\"\n",
- " )\n",
- "\n",
- " except TypeError as e:\n",
- " raise TypeError(f\"Config file error: {e}\")\n",
- "\n",
- " \n",
- " def gen_hypothesis_eval(\n",
- " self,\n",
- " tbl: List[int]\n",
- " ) -> chi2_contingency:\n",
- " \n",
- " \"\"\"\n",
- " Function to generate the chi2_contigency\n",
- " statistic and result.\n",
- " \"\"\"\n",
- " \n",
- " #size = np.shape(tbl)\n",
- " #tbl_len = len(tbl)\n",
- " \n",
- " res = chi2_contingency(\n",
- " tbl\n",
- " )\n",
- " \n",
- " return res\n",
- " \n",
- " def run_report_bld(\n",
- " self,\n",
- " alpha: float,\n",
- " res: chi2_contingency,\n",
- " tbl: List[int],\n",
- " process: str,\n",
- " group_variable: str,\n",
- " group_target_val: str,\n",
- " group_other_val: str,\n",
- " bin_edges: List[float],\n",
- " bin_labels: List[str]\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Runs report for statistical testing\n",
- " chi2_contingency results\n",
- " \n",
- " :param alpha:\n",
- " float, alpha value for significance evaluation.\n",
- " :param res:\n",
- " chi2_contingency, result of the chi2_contingency.\n",
- " :param tbl:\n",
- " List[int], the contingency table.\n",
- " :param process: \n",
- " str, the name of the business process\n",
- " being tested, e.g. 'hiring'.\n",
- " :param group_variable:\n",
- " str, column name of the\n",
- " target variable.\n",
- " :param group_target_val:\n",
- " str, class target value of the group_variable\n",
- " aka the protected class value.\n",
- " :param group_other_val:\n",
- " str, class nontarget value of the group_variable\n",
- " aka the nonprotected class value. \n",
- " :param bin_edges:\n",
- " List[float], edges for phi\n",
- " bins.\n",
- " :param bin_labels:\n",
- " List[str], labels for the phi\n",
- " bins.\n",
- " :return df:\n",
- " DataFrame, target\n",
- " \"\"\"\n",
- " \n",
- " pvalue = res[1]\n",
- " \n",
- " df = pd.DataFrame()\n",
- "\n",
- " df = self._gen_significance_test(\n",
- " df=df,\n",
- " pvalue=pvalue,\n",
- " alpha=alpha\n",
- " )\n",
- " \n",
- " (\n",
- " df,\n",
- " A,\n",
- " B,\n",
- " C,\n",
- " D,\n",
- " total_target_grp,\n",
- " total_non_target_grp,\n",
- " diagonals,\n",
- " percent_target_succ,\n",
- " percent_non_target_succ,\n",
- " phi_numerator,\n",
- " phi_denominator\n",
- " ) = self._gen_table_calcs(\n",
- " df=df,\n",
- " tbl=tbl,\n",
- " )\n",
- " \n",
- " if res[1] <= alpha:\n",
- " df, phi_result = self._gen_phi_coefficient(\n",
- " df=df,\n",
- " tbl=tbl,\n",
- " bin_edges=bin_edges,\n",
- " bin_labels=bin_labels,\n",
- " process=process,\n",
- " group_variable=group_variable,\n",
- " group_target_val=group_target_val,\n",
- " group_other_val=group_other_val\n",
- " )\n",
- " \n",
- " else:\n",
- " df['phi_corr_coeff'] = np.nan\n",
- " df['phi_bins'] = np.nan\n",
- " \n",
- " phi_result = \"\"\n",
- " \n",
- " df = self._gen_four_fifths_test(df)\n",
- " \n",
- " df = self._gen_outcome_meta(\n",
- " df,\n",
- " round(res[1],3),\n",
- " phi_result\n",
- " )\n",
- " \n",
- " df = self._gen_unpack_stats(\n",
- " df,\n",
- " res\n",
- " )\n",
- " \n",
- " return df\n",
- " \n",
- " def _gen_unpack_stats(\n",
- " self,\n",
- " df: DataFrame,\n",
- " res: chi2_contingency\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Method to unpack test stats from\n",
- " chi2_contingency results.\n",
- " \n",
- " :param df:\n",
- " DataFrame, output df.\n",
- " :param res:\n",
- " chi2_contingency, results array.\n",
- " :return df:\n",
- " DataFrame, output df.\n",
- " \"\"\"\n",
- " \n",
- " group_target_val = self.group_target_val\n",
- " group_other_val = self.group_other_val\n",
- " rows = [group_target_val] + [group_other_val]\n",
- " \n",
- " df['statistic'] = res[0]\n",
- " df['pvalue'] = res[1]\n",
- " df['dof'] = res[2]\n",
- " df['tbl_row'] = [rows]\n",
- " df['tbl'] = [tbl]\n",
- " df['expected_freq'] = [res[3]]\n",
- " df['tbl_expected_diff'] = [tbl - res[3]]\n",
- " \n",
- " return df\n",
- " \n",
- " def _gen_significance_test(\n",
- " self,\n",
- " df: DataFrame,\n",
- " pvalue: float,\n",
- " alpha: float\n",
- " ):\n",
- " \"\"\"\n",
- " Method to report on test significance.\n",
- " \n",
- " :param df:\n",
- " DataFrame, results df.\n",
- " :param pval:\n",
- " int, pvalue.\n",
- " :param alpha:\n",
- " float, the alpha value for testing eval.\n",
- " :return df:\n",
- " DataFrame with metadata added. \n",
- " \"\"\"\n",
- " \n",
- " if pvalue <= alpha:\n",
- " val = 'statistically significant result'\n",
- " \n",
- " else:\n",
- " val = 'no statistically significant result'\n",
- " \n",
- " df['test_result'] = [val]\n",
- " \n",
- " return df\n",
- " \n",
- " def _gen_phi_coefficient(\n",
- " self,\n",
- " df: DataFrame,\n",
- " tbl: List[int],\n",
- " process: str,\n",
- " group_variable: str,\n",
- " group_target_val: str,\n",
- " group_other_val: str,\n",
- " bin_edges: List[float],\n",
- " bin_labels: List[str]\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Method to generate the phi coefficient.\n",
- " \n",
- " :param df:\n",
- " DataFrame, the results df.\n",
- " :param tbl:\n",
- " List[int], the 2x2 cont table.\n",
- " :param process: \n",
- " str, the name of the business process\n",
- " being tested, e.g. 'hiring'.\n",
- " :param group_variable:\n",
- " str, column name of the\n",
- " target variable.\n",
- " :param group_target_val:\n",
- " str, class target value of the group_variable\n",
- " aka the protected class value.\n",
- " :param group_other_val:\n",
- " str, class nontarget value of the group_variable\n",
- " aka the nonprotected class value. \n",
- " :param bin_edges:\n",
- " List[float], edges for phi\n",
- " bins.\n",
- " :param bin_labels:\n",
- " List[str], lab\n",
- " :return df:\n",
- " DataFrame, output df.\n",
- " \"\"\"\n",
- " \n",
- " diagonals = self.diagonals\n",
- " numerator = self.phi_numerator\n",
- " denominator = self.phi_denominator\n",
- "\n",
- " phi = numerator / denominator if denominator != 0 else 0\n",
- "\n",
- " df['phi_corr_coeff'] = phi\n",
- " \n",
- " df = self._gen_prep_phi_bins(\n",
- " df=df,\n",
- " bin_edges=bin_edges,\n",
- " bin_labels=bin_labels\n",
- " )\n",
- "\n",
- " df, phi_result = self._gen_prep_diagonals(\n",
- " df=df,\n",
- " diagonals=diagonals,\n",
- " process=process,\n",
- " group_variable=group_variable,\n",
- " group_other_val=group_other_val,\n",
- " group_target_val=group_target_val,\n",
- " percent_non_target_succ=self.percent_non_target_succ,\n",
- " percent_target_succ=self.percent_target_succ,\n",
- " )\n",
- " \n",
- " return df, phi_result\n",
- " \n",
- " def _gen_table_calcs(\n",
- " self,\n",
- " df: DataFrame,\n",
- " tbl: List[int]\n",
- " ) -> Tuple[\n",
- " DataFrame, float, float, float, float,\n",
- " float, float, float, float, \n",
- " float, float, float\n",
- " ]:\n",
- " \n",
- " \"\"\"\n",
- " Method to generate phi bins. Provides additional\n",
- " explainability on the magnitude of association, when \n",
- " an association is found.\n",
- " \n",
- " :param df:\n",
- " DataFrame, output df.\n",
- " :param tbl:\n",
- " List[int], 2x2 contingency.\n",
- " :return [\n",
- " df, A, B, C, D, total_target_grp,\n",
- " total_non_target_grp, diagonals,\n",
- " percent_target_succ, percent_non_target_succ,\n",
- " phi_numerator, phi_denominator\n",
- " ]:\n",
- " Tuple[DataFrame, float, float, float, float,\n",
- " float, float, float, float, \n",
- " float, float, float\n",
- " ]\n",
- " \"\"\"\n",
- " \n",
- " # females, males; no succ, succ\n",
- " A, B = tbl[0] \n",
- " C, D = tbl[1]\n",
- " \n",
- " total_target_grp = A + B\n",
- " total_non_target_grp = C + D\n",
- " diagonals = (A + D) > (B + C)\n",
- " percent_target_succ = (B / total_target_grp) * 100\n",
- " percent_non_target_succ = (D / total_non_target_grp) * 100\n",
- " phi_numerator = (A * D) - (B * C)\n",
- " phi_denominator = np.sqrt((A + B) * (C + D) * (A + C) * (B + D)) \n",
- " \n",
- " return (\n",
- " df,\n",
- " A,\n",
- " B,\n",
- " C,\n",
- " D,\n",
- " total_target_grp,\n",
- " total_non_target_grp,\n",
- " diagonals,\n",
- " percent_target_succ,\n",
- " percent_non_target_succ,\n",
- " phi_numerator,\n",
- " phi_denominator\n",
- " )\n",
- " \n",
- " def _gen_prep_phi_bins(\n",
- " self,\n",
- " df: DataFrame,\n",
- " bin_edges: List[float],\n",
- " bin_labels: List[str]\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Method to generate pandas bins for \n",
- " phi coeff.\n",
- " \n",
- " :param df:\n",
- " DataFrame, output df.\n",
- " :param bin_edges:\n",
- " List[float], edges for phi\n",
- " bins.\n",
- " :param bin_labels:\n",
- " List[str], labels for the phi\n",
- " bins.\n",
- " :return df:\n",
- " DataFrame, output df.\n",
- " \"\"\"\n",
- " \n",
- " df['phi_bins'] = pd.cut(\n",
- " df['phi_corr_coeff'], \n",
- " bins=bin_edges, \n",
- " labels=bin_labels, \n",
- " include_lowest=True\n",
- " )\n",
- " \n",
- " return df\n",
- " \n",
- " def _gen_four_fifths_test(\n",
- " self,\n",
- " df: DataFrame\n",
- " ) -> DataFrame:\n",
- " \n",
- " percent_target_succ = self.percent_target_succ\n",
- " percent_non_target_succ = self.percent_non_target_succ \n",
- " \n",
- " ratio = percent_target_succ / percent_non_target_succ\n",
- " \n",
- " if ratio < .8:\n",
- " ratio_desc = f'failed with 4/5 test at {round(ratio,3)}'\n",
- " elif ratio >= .8:\n",
- " ratio_desc = f'passed with 4/5 test at {round(ratio,3)}'\n",
- " else:\n",
- " ratio_desc = 'error calculating 4/5 test'\n",
- " \n",
- " df['four_fifths_test'] = ratio_desc\n",
- " return df\n",
- " \n",
- " def _gen_prep_diagonals(\n",
- " self,\n",
- " df: DataFrame,\n",
- " diagonals: bool,\n",
- " process: str,\n",
- " group_variable: str,\n",
- " group_other_val: str,\n",
- " group_target_val: str,\n",
- " percent_non_target_succ: float,\n",
- " percent_target_succ: float,\n",
- " ) -> Tuple[DataFrame, str]:\n",
- " \n",
- " \"\"\"\n",
- " Method to generate the magnitude of the\n",
- " assocation using phi coefficient analysis.\n",
- " \n",
- " :param df:\n",
- " DataFrame, output df.\n",
- " :param diagonals:\n",
- " bool,\n",
- " :param process: \n",
- " str, the name of the business process\n",
- " being tested, e.g. 'hiring'.\n",
- " :param group_variable:\n",
- " str, column name of the\n",
- " target variable.\n",
- " :param group_target_val:\n",
- " str, class target value of the group_variable\n",
- " aka the protected class value.\n",
- " :param group_other_val:\n",
- " str, class nontarget value of the group_variable\n",
- " aka the nonprotected class value. \n",
- " :param percent_non_target_succ:\n",
- " float, the success percentage attained\n",
- " for the the non-target group.\n",
- " :param percent_target_succ:\n",
- " float, the success percentage attained for the\n",
- " target class.\n",
- " :return (df, phi_col):\n",
- " Tuple[df, phi_col]\n",
- " \"\"\"\n",
- " \n",
- " phi_bin = df['phi_bins'].values[0] \n",
- " phi_corr_coeff = df['phi_corr_coeff'].values[0] \n",
- "\n",
- " if diagonals:\n",
- " diagonal_msg = (\n",
- " f\"The values on the positive diagonal of the 'tbl' indicate the distribution of {process} success across {group_variable} categories.\"\n",
- " f\" {group_other_val} had a higher proportion of successful outcomes compared to {group_target_val}.\"\n",
- " f\" Specifically, {percent_non_target_succ:.1f}% of {group_other_val} had success while only {percent_target_succ:.1f}%\"\n",
- " f\" of {group_target_val} had success.\"\n",
- " f\" This significant difference in {process} success rates suggests a potential {group_variable} bias, with {group_other_val} success in {process}\"\n",
- " f\" at a higher rate than {group_target_val}.\"\n",
- " )\n",
- " \n",
- " else:\n",
- " diagonal_msg = \"the diagonal values are not substantially higher, suggesting the relationship might be more nuanced.\"\n",
- " \n",
- " phi_col = f\"The phi correlation coefficient is {phi_corr_coeff:.3f}, indicating a {phi_bin} effect size. {diagonal_msg}\"\n",
- " \n",
- " return df, phi_col\n",
- " \n",
- " def _gen_outcome_meta(\n",
- " self,\n",
- " df: DataFrame,\n",
- " pval: float,\n",
- " phi_result: str\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Method to generate meta data for \n",
- " reporting dataframe\n",
- " \n",
- " :param df:\n",
- " DataFrame, results df\n",
- " :param pval:\n",
- " int, pvalue\n",
- " :param phi_result:\n",
- " str, result of phi testing.\n",
- " :return df:\n",
- " DataFrame with metadata added\n",
- " \"\"\"\n",
- " \n",
- " grpers = self.grpers\n",
- " result = df['test_result'].values[0]\n",
- " phi_col = df['phi_corr_coeff'].values[0]\n",
- " testing = self.testing\n",
- " process = self.process\n",
- " group_target_val = self.group_target_val\n",
- " alpha = self.alpha\n",
- " four_fifths = df['four_fifths_test'].values[0]\n",
- " \n",
- " col = f\"Testing for {grpers}, {four_fifths}. Based on the results of the chi-square test of independence, there is {result} for {testing}-based {process} discrimination against {group_target_val} at the chosen significance level of {alpha}.\"\n",
- "\n",
- " if result == \"statistically significant result\":\n",
- " col = f\"{col} {phi_result}\"\n",
- " \n",
- " df['result_desc'] = col\n",
- " \n",
- " return df\n",
- " \n",
- "# pipeline\n",
- "\n",
- "ingestObj = Ingest(config)\n",
- "df = ingestObj.run()\n",
- "\n",
- "transObj = Transform(\n",
- " df.copy()\n",
- ")\n",
- "tbl = transObj.run_build_cont_table()\n",
- "\n",
- "statsObj = StatsTesting2x2Cont(\n",
- " config,\n",
- " tbl,\n",
- " df.copy() # need to add some more context in plain text\n",
- ")\n",
- "df_result = statsObj.run_testing()\n",
- "\n",
- "df_result['result_desc'].tolist()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "01672735-2ad5-42ae-9488-1962f3d0e63e",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " test_result | \n",
- " phi_corr_coeff | \n",
- " phi_bins | \n",
- " four_fifths_test | \n",
- " result_desc | \n",
- " statistic | \n",
- " pvalue | \n",
- " dof | \n",
- " tbl_row | \n",
- " tbl | \n",
- " expected_freq | \n",
- " tbl_expected_diff | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " statistically significant result | \n",
- " 0.39736 | \n",
- " moderate | \n",
- " failed 4/5 test at 0.167 | \n",
- " Testing for {'job_title': 'analyst'}, based on... | \n",
- " 5.218246 | \n",
- " 0.022351 | \n",
- " 1 | \n",
- " [Female, Male] | \n",
- " [[10, 1], [15, 18]] | \n",
- " [[6.25, 4.75], [18.75, 14.25]] | \n",
- " [[3.75, -3.75], [-3.75, 3.75]] | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " test_result phi_corr_coeff phi_bins \n",
- "0 statistically significant result 0.39736 moderate \\\n",
- "\n",
- " four_fifths_test \n",
- "0 failed 4/5 test at 0.167 \\\n",
- "\n",
- " result_desc statistic pvalue \n",
- "0 Testing for {'job_title': 'analyst'}, based on... 5.218246 0.022351 \\\n",
- "\n",
- " dof tbl_row tbl expected_freq \n",
- "0 1 [Female, Male] [[10, 1], [15, 18]] [[6.25, 4.75], [18.75, 14.25]] \\\n",
- "\n",
- " tbl_expected_diff \n",
- "0 [[3.75, -3.75], [-3.75, 3.75]] "
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_result"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "94114f77-caa8-41a3-900a-44317c84f4b7",
- "metadata": {},
- "source": [
- "to do:\n",
- " \n",
- "implement these tests\n",
- "\n",
- "\n",
- "https://en.wikipedia.org/wiki/Disparate_impact\n",
- "\n",
- "Add handler for filtered size of group must be ...\n",
- "\n",
- "# need to check this size\n",
- "# https://online.stat.psu.edu/stat500/lesson/8/8.2#:~:text=That%20equates%20to%20the%20Chi,count%20of%20at%20least%205.\n",
- "\n",
- "# make sure at least 5 in each slice, then at least 50"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1d975b96-c8af-4e46-9521-0c7fbe442ff1",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "hrailabs_dev",
- "language": "python",
- "name": "hrailabs"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/__pycache__/__init__.cpython-311.pyc b/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000..27c8d58
Binary files /dev/null and b/__pycache__/__init__.cpython-311.pyc differ
diff --git a/__pycache__/model.cpython-311.pyc b/__pycache__/model.cpython-311.pyc
index 42d4609..ef93758 100644
Binary files a/__pycache__/model.cpython-311.pyc and b/__pycache__/model.cpython-311.pyc differ
diff --git a/env.yaml b/env.yaml
new file mode 100644
index 0000000..133e4f0
--- /dev/null
+++ b/env.yaml
@@ -0,0 +1,130 @@
+name: equity_toolkit
+channels:
+ - defaults
+dependencies:
+ - bzip2=1.0.8=he774522_0
+ - ca-certificates=2023.01.10=haa95532_0
+ - certifi=2022.12.7=py311haa95532_0
+ - libffi=3.4.2=hd77b12b_6
+ - openssl=1.1.1t=h2bbff1b_0
+ - pip=23.0.1=py311haa95532_0
+ - python=3.11.0=h966fe2a_3
+ - setuptools=65.6.3=py311haa95532_0
+ - sqlite=3.41.1=h2bbff1b_0
+ - tk=8.6.12=h2bbff1b_0
+ - vc=14.2=h21ff451_1
+ - vs2015_runtime=14.27.29016=h5e58377_2
+ - wheel=0.38.4=py311haa95532_0
+ - wincertstore=0.2=py311haa95532_0
+ - xz=5.2.10=h8cc25b3_1
+ - zlib=1.2.13=h8cc25b3_0
+ - pip:
+ - aiofiles==22.1.0
+ - aiosqlite==0.18.0
+ - anyio==3.6.2
+ - argon2-cffi==21.3.0
+ - argon2-cffi-bindings==21.2.0
+ - arrow==1.2.3
+ - asttokens==2.2.1
+ - attrs==22.2.0
+ - babel==2.12.1
+ - backcall==0.2.0
+ - beautifulsoup4==4.12.2
+ - bleach==6.0.0
+ - cffi==1.15.1
+ - charset-normalizer==3.1.0
+ - click==8.1.3
+ - colorama==0.4.6
+ - comm==0.1.3
+ - dash==2.9.2
+ - dash-bootstrap-components==1.6.0
+ - dash-core-components==2.0.0
+ - dash-html-components==2.0.0
+ - dash-table==5.0.0
+ - debugpy==1.6.7
+ - decorator==5.1.1
+ - defusedxml==0.7.1
+ - executing==1.2.0
+ - fastjsonschema==2.16.3
+ - flask==2.2.3
+ - fqdn==1.5.1
+ - idna==3.4
+ - ipykernel==6.22.0
+ - ipython==8.12.0
+ - ipython-genutils==0.2.0
+ - isoduration==20.11.0
+ - itsdangerous==2.1.2
+ - jedi==0.18.2
+ - jinja2==3.1.2
+ - json5==0.9.11
+ - jsonpointer==2.3
+ - jsonschema==4.17.3
+ - jupyter-client==8.1.0
+ - jupyter-core==5.3.0
+ - jupyter-events==0.6.3
+ - jupyter-server==2.5.0
+ - jupyter-server-fileid==0.8.0
+ - jupyter-server-terminals==0.4.4
+ - jupyter-server-ydoc==0.8.0
+ - jupyter-ydoc==0.2.3
+ - jupyterlab==3.6.3
+ - jupyterlab-pygments==0.2.2
+ - jupyterlab-server==2.22.0
+ - markupsafe==2.1.2
+ - mistune==2.0.5
+ - mypy==1.11.2
+ - mypy-extensions==1.0.0
+ - nbclassic==0.5.5
+ - nbclient==0.7.3
+ - nbconvert==7.3.0
+ - nbformat==5.8.0
+ - nest-asyncio==1.5.6
+ - notebook==6.5.4
+ - notebook-shim==0.2.2
+ - numpy==1.24.2
+ - packaging==23.0
+ - pandas==2.0.0
+ - pandocfilters==1.5.0
+ - parso==0.8.3
+ - pickleshare==0.7.5
+ - platformdirs==3.2.0
+ - plotly==5.14.1
+ - prometheus-client==0.16.0
+ - prompt-toolkit==3.0.38
+ - psutil==5.9.4
+ - pure-eval==0.2.2
+ - pycparser==2.21
+ - pygments==2.14.0
+ - pyrsistent==0.19.3
+ - python-dateutil==2.8.2
+ - python-json-logger==2.0.7
+ - pytz==2023.3
+ - pywin32==306
+ - pywinpty==2.0.10
+ - pyyaml==6.0
+ - pyzmq==25.0.2
+ - requests==2.28.2
+ - rfc3339-validator==0.1.4
+ - rfc3986-validator==0.1.1
+ - scipy==1.10.1
+ - send2trash==1.8.0
+ - six==1.16.0
+ - sniffio==1.3.0
+ - soupsieve==2.4
+ - stack-data==0.6.2
+ - tenacity==8.2.2
+ - terminado==0.17.1
+ - tinycss2==1.2.1
+ - tornado==6.2
+ - traitlets==5.9.0
+ - typing-extensions==4.12.2
+ - tzdata==2023.3
+ - uri-template==1.2.0
+ - urllib3==1.26.15
+ - wcwidth==0.2.6
+ - webcolors==1.13
+ - webencodings==0.5.1
+ - websocket-client==1.5.1
+ - werkzeug==2.2.3
+ - y-py==0.5.9
+ - ypy-websocket==0.8.2
\ No newline at end of file
diff --git a/model.py b/model.py
index 2302456..3956f79 100644
--- a/model.py
+++ b/model.py
@@ -4,11 +4,15 @@
from typing import List
from typing import Tuple
+import os
import yaml
import src.model_classes as mc
-with open('config.yaml') as f:
+package_dir = os.path.dirname(os.path.abspath(__file__))
+config_fp = os.path.join(package_dir, "config.yaml")
+
+with open(config_fp) as f:
config = yaml.safe_load(f)
class Model:
diff --git a/notebooks/.ipynb_checkpoints/toolkit-checkpoint.ipynb b/notebooks/.ipynb_checkpoints/toolkit-checkpoint.ipynb
new file mode 100644
index 0000000..d6ae82b
--- /dev/null
+++ b/notebooks/.ipynb_checkpoints/toolkit-checkpoint.ipynb
@@ -0,0 +1,146 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "f9c027ac-4352-4da6-a6d7-394eade3031c",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import yaml\n",
+ "import sys\n",
+ "import os\n",
+ "\n",
+ "project_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
+ "sys.path.append(project_dir)\n",
+ "import model as model\n",
+ "\n",
+ "config_fp = os.path.join(project_dir, \"config.yaml\")\n",
+ "with open(config_fp) as f:\n",
+ " config = yaml.safe_load(f)\n",
+ " \n",
+ "mod = model.Model(config)\n",
+ "\n",
+ "df_prep, tbl = mod.prep()\n",
+ "\n",
+ "df_result = mod.analysis(df_prep.copy(), tbl)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "8c15a313-bfc9-44a9-9891-387ed56564be",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " test_result | \n",
+ " phi_corr_coeff | \n",
+ " phi_bins | \n",
+ " four_fifths_test | \n",
+ " result_desc | \n",
+ " statistic | \n",
+ " pvalue | \n",
+ " alpha | \n",
+ " dof | \n",
+ " tbl_rows | \n",
+ " tbl_cols | \n",
+ " tbl | \n",
+ " expected_freq | \n",
+ " tbl_expected_diff | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Statistically significant result | \n",
+ " 0.39736 | \n",
+ " moderate | \n",
+ " 4/5ths Test passed at a ratio of: 1.0. | \n",
+ " Testing for {'job_title': 'analyst'}, 4/5ths T... | \n",
+ " 5.218246 | \n",
+ " 0.022351 | \n",
+ " 0.05 | \n",
+ " 1 | \n",
+ " [Female, Male] | \n",
+ " [not_hired, hired] | \n",
+ " [[10, 1], [15, 18]] | \n",
+ " [[6.25, 4.75], [18.75, 14.25]] | \n",
+ " [[3.75, -3.75], [-3.75, 3.75]] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " test_result phi_corr_coeff phi_bins \n",
+ "0 Statistically significant result 0.39736 moderate \\\n",
+ "\n",
+ " four_fifths_test \n",
+ "0 4/5ths Test passed at a ratio of: 1.0. \\\n",
+ "\n",
+ " result_desc statistic pvalue \n",
+ "0 Testing for {'job_title': 'analyst'}, 4/5ths T... 5.218246 0.022351 \\\n",
+ "\n",
+ " alpha dof tbl_rows tbl_cols tbl \n",
+ "0 0.05 1 [Female, Male] [not_hired, hired] [[10, 1], [15, 18]] \\\n",
+ "\n",
+ " expected_freq tbl_expected_diff \n",
+ "0 [[6.25, 4.75], [18.75, 14.25]] [[3.75, -3.75], [-3.75, 3.75]] "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_result"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "hrailabs_dev",
+ "language": "python",
+ "name": "hrailabs"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/toolkit.ipynb b/notebooks/toolkit.ipynb
new file mode 100644
index 0000000..d6ae82b
--- /dev/null
+++ b/notebooks/toolkit.ipynb
@@ -0,0 +1,146 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "f9c027ac-4352-4da6-a6d7-394eade3031c",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import yaml\n",
+ "import sys\n",
+ "import os\n",
+ "\n",
+ "project_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
+ "sys.path.append(project_dir)\n",
+ "import model as model\n",
+ "\n",
+ "config_fp = os.path.join(project_dir, \"config.yaml\")\n",
+ "with open(config_fp) as f:\n",
+ " config = yaml.safe_load(f)\n",
+ " \n",
+ "mod = model.Model(config)\n",
+ "\n",
+ "df_prep, tbl = mod.prep()\n",
+ "\n",
+ "df_result = mod.analysis(df_prep.copy(), tbl)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "8c15a313-bfc9-44a9-9891-387ed56564be",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " test_result | \n",
+ " phi_corr_coeff | \n",
+ " phi_bins | \n",
+ " four_fifths_test | \n",
+ " result_desc | \n",
+ " statistic | \n",
+ " pvalue | \n",
+ " alpha | \n",
+ " dof | \n",
+ " tbl_rows | \n",
+ " tbl_cols | \n",
+ " tbl | \n",
+ " expected_freq | \n",
+ " tbl_expected_diff | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Statistically significant result | \n",
+ " 0.39736 | \n",
+ " moderate | \n",
+ " 4/5ths Test passed at a ratio of: 1.0. | \n",
+ " Testing for {'job_title': 'analyst'}, 4/5ths T... | \n",
+ " 5.218246 | \n",
+ " 0.022351 | \n",
+ " 0.05 | \n",
+ " 1 | \n",
+ " [Female, Male] | \n",
+ " [not_hired, hired] | \n",
+ " [[10, 1], [15, 18]] | \n",
+ " [[6.25, 4.75], [18.75, 14.25]] | \n",
+ " [[3.75, -3.75], [-3.75, 3.75]] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " test_result phi_corr_coeff phi_bins \n",
+ "0 Statistically significant result 0.39736 moderate \\\n",
+ "\n",
+ " four_fifths_test \n",
+ "0 4/5ths Test passed at a ratio of: 1.0. \\\n",
+ "\n",
+ " result_desc statistic pvalue \n",
+ "0 Testing for {'job_title': 'analyst'}, 4/5ths T... 5.218246 0.022351 \\\n",
+ "\n",
+ " alpha dof tbl_rows tbl_cols tbl \n",
+ "0 0.05 1 [Female, Male] [not_hired, hired] [[10, 1], [15, 18]] \\\n",
+ "\n",
+ " expected_freq tbl_expected_diff \n",
+ "0 [[6.25, 4.75], [18.75, 14.25]] [[3.75, -3.75], [-3.75, 3.75]] "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_result"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "hrailabs_dev",
+ "language": "python",
+ "name": "hrailabs"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/.ipynb_checkpoints/Pay_Gap_Reg-checkpoint.ipynb b/pay_equity/.ipynb_checkpoints/Pay_Gap_Reg-checkpoint.ipynb
similarity index 100%
rename from .ipynb_checkpoints/Pay_Gap_Reg-checkpoint.ipynb
rename to pay_equity/.ipynb_checkpoints/Pay_Gap_Reg-checkpoint.ipynb
diff --git a/Pay_Gap_Reg.ipynb b/pay_equity/Pay_Gap_Reg.ipynb
similarity index 100%
rename from Pay_Gap_Reg.ipynb
rename to pay_equity/Pay_Gap_Reg.ipynb
diff --git a/src/.ipynb_checkpoints/model_classes-checkpoint.py b/src/.ipynb_checkpoints/model_classes-checkpoint.py
index d44109b..3b601b7 100644
--- a/src/.ipynb_checkpoints/model_classes-checkpoint.py
+++ b/src/.ipynb_checkpoints/model_classes-checkpoint.py
@@ -6,8 +6,12 @@
import pandas as pd
import numpy as np
+import os
from scipy.stats import chi2_contingency
+package_dir = os.path.dirname(os.path.abspath(__file__))
+main_dir = os.path.abspath(os.path.join(package_dir, ".."))
+
class Ingest:
"""
@@ -146,28 +150,29 @@ def run_load(
"""
filepath = self.filepath
-
+ csv_fp = os.path.join(main_dir, filepath)
+
try:
- return pd.read_csv(filepath, skiprows=0)
+ return pd.read_csv(csv_fp, skiprows=0)
except FileNotFoundError:
raise FileNotFoundError(
- f"The file at {filepath} was not found. Please check the file path."
+ f"The file at {csv_fp} was not found. Please check the file path."
)
except pd.errors.EmptyDataError:
raise ValueError(
- f"The file at {filepath} is empty and cannot be loaded."
+ f"The file at {csv_fp} is empty and cannot be loaded."
)
except pd.errors.ParserError:
raise ValueError(
- f"The file at {filepath} contains malformed data and could not be parsed as a valid CSV."
+ f"The file at {csv_fp} contains malformed data and could not be parsed as a valid CSV."
)
except PermissionError:
raise PermissionError(
- f"Permission denied when attempting to read the file at {filepath}."
+ f"Permission denied when attempting to read the file at {csv_fp}."
f"Please check the file permissions."
)
diff --git a/src/__pycache__/model_classes.cpython-311.pyc b/src/__pycache__/model_classes.cpython-311.pyc
index 1e700e5..a3fee23 100644
Binary files a/src/__pycache__/model_classes.cpython-311.pyc and b/src/__pycache__/model_classes.cpython-311.pyc differ
diff --git a/src/model_classes.py b/src/model_classes.py
index d44109b..3b601b7 100644
--- a/src/model_classes.py
+++ b/src/model_classes.py
@@ -6,8 +6,12 @@
import pandas as pd
import numpy as np
+import os
from scipy.stats import chi2_contingency
+package_dir = os.path.dirname(os.path.abspath(__file__))
+main_dir = os.path.abspath(os.path.join(package_dir, ".."))
+
class Ingest:
"""
@@ -146,28 +150,29 @@ def run_load(
"""
filepath = self.filepath
-
+ csv_fp = os.path.join(main_dir, filepath)
+
try:
- return pd.read_csv(filepath, skiprows=0)
+ return pd.read_csv(csv_fp, skiprows=0)
except FileNotFoundError:
raise FileNotFoundError(
- f"The file at {filepath} was not found. Please check the file path."
+ f"The file at {csv_fp} was not found. Please check the file path."
)
except pd.errors.EmptyDataError:
raise ValueError(
- f"The file at {filepath} is empty and cannot be loaded."
+ f"The file at {csv_fp} is empty and cannot be loaded."
)
except pd.errors.ParserError:
raise ValueError(
- f"The file at {filepath} contains malformed data and could not be parsed as a valid CSV."
+ f"The file at {csv_fp} contains malformed data and could not be parsed as a valid CSV."
)
except PermissionError:
raise PermissionError(
- f"Permission denied when attempting to read the file at {filepath}."
+ f"Permission denied when attempting to read the file at {csv_fp}."
f"Please check the file permissions."
)
diff --git a/toolkit.ipynb b/toolkit.ipynb
deleted file mode 100644
index aa39ad2..0000000
--- a/toolkit.ipynb
+++ /dev/null
@@ -1,1304 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "ab695a7b-3043-4154-a59b-01e57feaf8f0",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[\"Testing for {'job_title': 'analyst'}, passed with 4/5 test at 1.0. Based on the results of the chi-square test of independence, there is statistically significant result for gender-based hiring discrimination against Female at the chosen significance level of 0.05. The phi correlation coefficient is 0.397, indicating a moderate effect size. The values on the positive diagonal of the 'tbl' indicate the distribution of hiring success across gen categories. Male had a higher proportion of successful outcomes compared to Female. Specifically, 54.5% of Male had success while only 54.5% of Female had success. This significant difference in hiring success rates suggests a potential gen bias, with Male success in hiring at a higher rate than Female.\"]"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "from scipy.stats import chi2_contingency\n",
- "import yaml\n",
- "\n",
- "from pandas import DataFrame\n",
- "from typing import Dict\n",
- "from typing import List\n",
- "from typing import Any\n",
- "from typing import Tuple\n",
- "\n",
- "with open('config.yaml') as f:\n",
- " config = yaml.safe_load(f)\n",
- "\n",
- "class Ingest:\n",
- " \n",
- " \"\"\"\n",
- " Class to ingest dataframe input.\n",
- " \"\"\"\n",
- " \n",
- " def __init__(\n",
- " self,\n",
- " config: Dict[Any, Any]\n",
- " ) -> None:\n",
- " \n",
- " \"\"\"\n",
- " Inits class with the config file\n",
- " and unpacks the config file.\n",
- " \"\"\"\n",
- " \n",
- " self.config = config\n",
- " \n",
- " self.unpack_config()\n",
- "\n",
- " \n",
- " def run(\n",
- " self\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Run function for the class.\n",
- " \n",
- " :param None:\n",
- " :return df:\n",
- " DataFrame, ingested df\n",
- " \"\"\"\n",
- " \n",
- " df = self.run_load()\n",
- " \n",
- " df = self.run_harmonize(df)\n",
- " \n",
- " return df\n",
- " \n",
- " \n",
- " def unpack_config(\n",
- " self\n",
- " ) -> None:\n",
- " \n",
- " \"\"\"\n",
- " Function to unpack config vars.\n",
- " \n",
- " :var filepath:\n",
- " str, the relative filepath \n",
- " :var group_variable:\n",
- " str, the column name for the \n",
- " group variable of interest e.g.\n",
- " gender, which contains the target \n",
- " class and non-target class e.g.\n",
- " females and males.\n",
- " :var group_target_val:\n",
- " str, within the group_variable column,\n",
- " contains the contains the target \n",
- " class value e.g.\n",
- " females.\n",
- " :var group_other_val:\n",
- " str, within the group_variable column,\n",
- " contains the contains the non-target \n",
- " class value e.g. males.\n",
- " :var outcome_variable:\n",
- " str, the column name for the \n",
- " outcome variable of interest e.g.\n",
- " hired, which contains the target \n",
- " class and non-target class e.g.\n",
- " hired and not-hired.\n",
- " :var outcome_target_val:\n",
- " str, within the outcome_variable column,\n",
- " contains the contains the target \n",
- " class value e.g.\n",
- " hired.\n",
- " :var outcome_other_val:\n",
- " str, within the outcome_variable column,\n",
- " contains the contains the non-target \n",
- " class value e.g. not-hired.\n",
- " :var grpers:\n",
- " Dict[str,str], can be any set of filterable\n",
- " columns to slice into particular groups within\n",
- " the broader employee roster. The key is the column,\n",
- " the value is the desired class within the column\n",
- " e.g. job_title: analyst.\n",
- " \"\"\"\n",
- " \n",
- " config = self.config\n",
- " \n",
- " try:\n",
- " self.filepath: str = config[\"Ingest\"][\"filepath\"]\n",
- " self.group_variable: str = config[\"Ingest\"][\"group_variable\"]\n",
- " self.group_target_val: str = config[\"Ingest\"][\"group_target_val\"]\n",
- " self.group_other_val: str = config[\"Ingest\"][\"group_other_val\"]\n",
- " self.outcome_variable: str = config[\"Ingest\"][\"outcome_variable\"]\n",
- " self.outcome_target_val: str = config[\"Ingest\"][\"outcome_target_val\"]\n",
- " self.outcome_other_val: str = config[\"Ingest\"][\"outcome_other_val\"]\n",
- " self.grpers: Dict[str, str] = config[\"Ingest\"][\"grpers\"]\n",
- "\n",
- " # Type validation\n",
- " if not isinstance(self.filepath, str):\n",
- " raise TypeError(\"Expected 'filepath' to be of type 'str'.\")\n",
- " if not isinstance(self.group_variable, str):\n",
- " raise TypeError(\"Expected 'group_variable' to be of type 'str'.\")\n",
- " if not isinstance(self.group_target_val, str):\n",
- " raise TypeError(\"Expected 'group_target_val' to be of type 'str'.\")\n",
- " if not isinstance(self.group_other_val, str):\n",
- " raise TypeError(\"Expected 'group_other_val' to be of type 'str'.\")\n",
- " if not isinstance(self.outcome_variable, str):\n",
- " raise TypeError(\"Expected 'outcome_variable' to be of type 'str'.\")\n",
- " if not isinstance(self.outcome_target_val, str):\n",
- " raise TypeError(\"Expected 'outcome_target_val' to be of type 'str'.\")\n",
- " if not isinstance(self.outcome_other_val, str):\n",
- " raise TypeError(\"Expected 'outcome_other_val' to be of type 'str'.\")\n",
- " if not isinstance(self.grpers, dict):\n",
- " raise TypeError(\"Expected 'grpers' to be of type 'dict'.\")\n",
- "\n",
- " except KeyError as e:\n",
- " raise KeyError(f\"Missing key '{e.args[0]}' in the config file. \"\n",
- " \"Please ensure the config file contains all required keys under the 'Ingest' section: \"\n",
- " \"'filepath', 'group_variable', 'group_target_val', 'group_other_val', \"\n",
- " \"'outcome_variable', 'outcome_target_val', 'outcome_other_val', and 'grpers'.\")\n",
- "\n",
- " except TypeError as e:\n",
- " raise TypeError(f\"Config file error: {e}\")\n",
- " \n",
- " def run_load(\n",
- " self\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Loads csv file. Assumes headers are row 0.\n",
- " \n",
- " :param None:\n",
- " :return DataFrame:\n",
- " \"\"\"\n",
- " \n",
- " filepath = self.filepath\n",
- " \n",
- " try:\n",
- " return pd.read_csv(filepath, skiprows=0)\n",
- "\n",
- " except FileNotFoundError:\n",
- " raise FileNotFoundError(\n",
- " f\"The file at {filepath} was not found. Please check the file path.\"\n",
- " )\n",
- "\n",
- " except pd.errors.EmptyDataError:\n",
- " raise ValueError(\n",
- " f\"The file at {filepath} is empty and cannot be loaded.\"\n",
- " )\n",
- "\n",
- " except pd.errors.ParserError:\n",
- " raise ValueError(\n",
- " f\"The file at {filepath} contains malformed data and could not be parsed as a valid CSV.\"\n",
- " )\n",
- "\n",
- " except PermissionError:\n",
- " raise PermissionError(\n",
- " f\"Permission denied when attempting to read the file at {filepath}.\"\n",
- " f\"Please check the file permissions.\"\n",
- " )\n",
- "\n",
- " except Exception as e:\n",
- " raise Exception(\n",
- " f\"An unexpected error occurred while loading the file: {str(e)}\"\n",
- " )\n",
- " \n",
- " def run_harmonize(\n",
- " self,\n",
- " df: DataFrame\n",
- " ) -> DataFrame: \n",
- "\n",
- " \"\"\"\n",
- " Function to harmonize the dataset.\n",
- " \n",
- " :param df: \n",
- " DataFrame, loaded df\n",
- " :return df:\n",
- " DataFrame, filtered down to target and other group and\n",
- " harmonize the fields\n",
- " \"\"\"\n",
- " \n",
- " group_variable = self.group_variable\n",
- " group_target_val = self.group_target_val\n",
- " group_other_val = self.group_other_val\n",
- " outcome_variable = self.outcome_variable\n",
- " outcome_target_val = self.outcome_target_val\n",
- " outcome_other_val = self.outcome_other_val\n",
- " grpers = self.grpers\n",
- "\n",
- " df = self._apply_filters(\n",
- " df=df,\n",
- " group_variable=group_variable,\n",
- " group_target_val=group_target_val,\n",
- " group_other_val=group_other_val,\n",
- " grpers=grpers\n",
- " )\n",
- " \n",
- " df = self._apply_harmonize(\n",
- " df=df,\n",
- " group_variable=group_variable,\n",
- " group_target_val=group_target_val,\n",
- " group_other_val=group_other_val,\n",
- " outcome_variable=outcome_variable,\n",
- " outcome_target_val=outcome_target_val,\n",
- " outcome_other_val=outcome_other_val\n",
- " )\n",
- " \n",
- " return df\n",
- " \n",
- " def _apply_filters(\n",
- " self,\n",
- " df: DataFrame,\n",
- " group_variable: str,\n",
- " group_target_val: str,\n",
- " group_other_val: str,\n",
- " grpers: Dict[str,str],\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Method to apply filters\n",
- " \n",
- " :param df:\n",
- " DataFrame, target df\n",
- " :param group_variable:\n",
- " str, column name of the\n",
- " target variable\n",
- " :param group_target_val:\n",
- " str, class target value of the group_variable\n",
- " aka the protected class value\n",
- " :param group_other_val:\n",
- " str, class nontarget value of the group_variable\n",
- " aka the nonprotected class value\n",
- " :return df:\n",
- " DataFrame, filtered df\n",
- " \"\"\"\n",
- " \n",
- " df = df.loc[\n",
- " df[group_variable].isin(\n",
- " [\n",
- " group_target_val, \n",
- " group_other_val\n",
- " ]\n",
- " )\n",
- " ]\n",
- " \n",
- " for k, v in grpers.items():\n",
- " \n",
- " df = df.loc[\n",
- " df[k].isin([v])\n",
- " ] \n",
- " \n",
- " return df\n",
- " \n",
- " def _apply_harmonize(\n",
- " self,\n",
- " df: DataFrame,\n",
- " group_variable: str,\n",
- " group_target_val: str,\n",
- " group_other_val: str,\n",
- " outcome_variable: str,\n",
- " outcome_target_val: str,\n",
- " outcome_other_val: str\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Method to harmonize targets.\n",
- " \n",
- " :param df:\n",
- " DataFrame, target df\n",
- " :param group_variable:\n",
- " str, column name of the\n",
- " target variable\n",
- " :param group_target_val:\n",
- " str, class target value of the group_variable\n",
- " aka the protected class value\n",
- " :param group_other_val:\n",
- " str, class nontarget value of the group_variable\n",
- " aka the nonprotected class value \n",
- " :param outcome_variable:\n",
- " str, the column name of the outcome \n",
- " :param outcome_target_val:\n",
- " str, class target value of the outcome_variable\n",
- " aka success\n",
- " :param outcome_other_val:\n",
- " str, class nontarget value of the outcome_variable\n",
- " :return df:\n",
- " DataFrame, target df\n",
- " \"\"\"\n",
- " \n",
- " # harmonize the group target\n",
- " df['group_var_clean'] = np.where(\n",
- " df[group_variable]==group_target_val, \n",
- " 1,\n",
- " np.where(\n",
- " df[group_variable]==group_other_val, \n",
- " 0, \n",
- " -1\n",
- " )\n",
- " )\n",
- " \n",
- " # harmonize the outcome target\n",
- " df['outcome_var_clean'] = np.where(\n",
- " df[outcome_variable]==outcome_target_val, \n",
- " 1, \n",
- " np.where(\n",
- " df[self.outcome_variable]==outcome_other_val,\n",
- " 0, \n",
- " -1\n",
- " )\n",
- " ) \n",
- " \n",
- " return df\n",
- " \n",
- "class Transform:\n",
- " \n",
- " \"\"\"\n",
- " Class to transform dataframe inputs into \n",
- " 2x2 contingency table.\n",
- " \"\"\"\n",
- " \n",
- " def __init__(\n",
- " self, \n",
- " df: DataFrame\n",
- " ) -> None:\n",
- " \n",
- " \"\"\"\n",
- " :param df:\n",
- " DataFrame, input df\n",
- " \"\"\"\n",
- " \n",
- " self.df = df\n",
- " \n",
- " def run_build_cont_table(\n",
- " self\n",
- " ) -> List[int]:\n",
- " \n",
- " \"\"\"\n",
- " Function to generate contingency table format.\n",
- " \n",
- " Places the target group val in the top row and the\n",
- " target group other to the bottom row.\n",
- " \n",
- " Places no-success outcome on the first column and success\n",
- " on the second column.\n",
- " \n",
- " :return tbl:\n",
- " List[int], filtered down to target and other group.\n",
- " \"\"\"\n",
- " \n",
- " df = self.df\n",
- " \n",
- " cols = [\n",
- " 'group_var_clean', \n",
- " 'outcome_var_clean'\n",
- " ]\n",
- " \n",
- " df = df[cols]\n",
- " \n",
- " tbl = (\n",
- " df.pivot_table(\n",
- " index='group_var_clean',\n",
- " columns='outcome_var_clean', \n",
- " aggfunc=len\n",
- " ).\n",
- " sort_index(\n",
- " axis=1, \n",
- " ascending=True\n",
- " ).\n",
- " sort_index(ascending=False). # ensure always [1,0]\n",
- " values.tolist()\n",
- " ) \n",
- " \n",
- " return tbl\n",
- " \n",
- "class StatsTesting2x2Cont:\n",
- " \n",
- " \"\"\"\n",
- " Class to perform 2x2 Contigency Table analysis\n",
- " with Chi2 and Phi Correlation Coefficent Testing.\n",
- "\n",
- " Provides context into potential association between\n",
- " variables and the strength of the association.\n",
- " \"\"\"\n",
- " \n",
- " def __init__(\n",
- " self,\n",
- " config: Dict[Any, Any],\n",
- " tbl: List[int],\n",
- " df: DataFrame\n",
- " ) -> None:\n",
- " \n",
- " \"\"\"\n",
- " Inits the class variables and unpacks the\n",
- " config variables.\n",
- " \n",
- " :param config:\n",
- " Dict[str,Any], loaded config file.\n",
- " :param tbl:\n",
- " List[int], 2x2 cont table.\n",
- " :param df:\n",
- " DataFrame, original input DataFrame.\n",
- " \"\"\"\n",
- " \n",
- " self.config = config\n",
- " self.tbl = tbl\n",
- " self.df = df\n",
- "\n",
- " self.unpack_config()\n",
- "\n",
- " def run_testing(\n",
- " self\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Run function for the class.\n",
- " \n",
- " Runs hypothesis evaluation and builds\n",
- " the output report DataFrame.\n",
- " \n",
- " :param None:\n",
- " :return df_results:\n",
- " DataFrame, with testing results.\n",
- " \"\"\"\n",
- " \n",
- " alpha = self.alpha\n",
- " tbl = self.tbl\n",
- " process = self.process\n",
- " group_variable = self.group_variable\n",
- " group_target_val = self.group_target_val\n",
- " group_other_val = self.group_other_val\n",
- " bin_edges = self.bin_edges\n",
- " bin_labels = self.bin_labels\n",
- " \n",
- " res = self.gen_hypothesis_eval(tbl)\n",
- "\n",
- " df_results = self.run_report_bld(\n",
- " alpha=alpha,\n",
- " res=res,\n",
- " tbl=tbl,\n",
- " process=process,\n",
- " group_variable=group_variable,\n",
- " group_target_val=group_target_val,\n",
- " group_other_val=group_other_val,\n",
- " bin_edges=bin_edges,\n",
- " bin_labels=bin_labels\n",
- " )\n",
- " \n",
- " return df_results\n",
- " \n",
- " def unpack_config(\n",
- " self\n",
- " ) -> None:\n",
- " \n",
- " \"\"\"\n",
- " Function to unpack config variables.\n",
- " \n",
- " :param None:\n",
- " :return None:\n",
- " \"\"\"\n",
- " \n",
- " config = self.config\n",
- "\n",
- " try:\n",
- " self.alpha: float = config[\"StatsTesting2x2Cont\"][\"alpha\"]\n",
- " self.group_variable: str = config[\"Ingest\"][\"group_variable\"]\n",
- " self.group_target_val: str = config[\"Ingest\"][\"group_target_val\"]\n",
- " self.group_other_val: str = config[\"Ingest\"][\"group_other_val\"]\n",
- " self.outcome_variable: str = config[\"Ingest\"][\"outcome_variable\"]\n",
- " self.outcome_target_val: str = config[\"Ingest\"][\"outcome_target_val\"]\n",
- " self.outcome_other_val: str = config[\"Ingest\"][\"outcome_other_val\"]\n",
- " self.grpers: Dict[str, str] = config[\"Ingest\"][\"grpers\"]\n",
- " self.testing: str = config[\"StatsTesting2x2Cont\"][\"testing\"]\n",
- " self.process: str = config[\"StatsTesting2x2Cont\"][\"process\"]\n",
- " self.bin_edges: List[float] = config[\"StatsTesting2x2Cont\"][\"phi_bin_edges\"]\n",
- " self.bin_labels: List[str] = config[\"StatsTesting2x2Cont\"][\"phi_bin_labels\"]\n",
- "\n",
- " if not isinstance(self.alpha, float):\n",
- " raise TypeError(\"Expected 'alpha' to be of type 'float'.\")\n",
- " if not isinstance(self.group_variable, str):\n",
- " raise TypeError(\"Expected 'group_variable' to be of type 'str'.\")\n",
- " if not isinstance(self.group_target_val, str):\n",
- " raise TypeError(\"Expected 'group_target_val' to be of type 'str'.\")\n",
- " if not isinstance(self.group_other_val, str):\n",
- " raise TypeError(\"Expected 'group_other_val' to be of type 'str'.\")\n",
- " if not isinstance(self.outcome_variable, str):\n",
- " raise TypeError(\"Expected 'outcome_variable' to be of type 'str'.\")\n",
- " if not isinstance(self.outcome_target_val, str):\n",
- " raise TypeError(\"Expected 'outcome_target_val' to be of type 'str'.\")\n",
- " if not isinstance(self.outcome_other_val, str):\n",
- " raise TypeError(\"Expected 'outcome_other_val' to be of type 'str'.\")\n",
- " if not isinstance(self.grpers, dict):\n",
- " raise TypeError(\"Expected 'grpers' to be of type 'dict'.\")\n",
- " if not isinstance(self.testing, str):\n",
- " raise TypeError(\"Expected 'testing' to be of type 'str'.\")\n",
- " if not isinstance(self.process, str):\n",
- " raise TypeError(\"Expected 'process' to be of type 'str'.\")\n",
- " if not isinstance(\n",
- " self.bin_edges, list\n",
- " ) or not all(\n",
- " isinstance(\n",
- " i, (int, float)\n",
- " ) for i in self.bin_edges\n",
- " ):\n",
- " raise TypeError(\"Expected 'bin_edges' to be a list of floats.\")\n",
- " if not isinstance(\n",
- " self.bin_labels, list\n",
- " ) or not all(\n",
- " isinstance(i, str) for i in self.bin_labels\n",
- " ):\n",
- " raise TypeError(\"Expected 'bin_labels' to be a list of strings.\")\n",
- " \n",
- " except KeyError as e:\n",
- " raise KeyError(\n",
- " f\"Missing key '{e.args[0]}' in the config file. \"\n",
- " f\"Ensure all required keys are present in the 'Ingest' and 'StatsTesting2x2Cont' sections.\"\n",
- " )\n",
- "\n",
- " except TypeError as e:\n",
- " raise TypeError(f\"Config file error: {e}\")\n",
- "\n",
- " \n",
- " def gen_hypothesis_eval(\n",
- " self,\n",
- " tbl: List[int]\n",
- " ) -> chi2_contingency:\n",
- " \n",
- " \"\"\"\n",
- " Function to generate the chi2_contigency\n",
- " statistic and result.\n",
- " \"\"\"\n",
- " \n",
- " #size = np.shape(tbl)\n",
- " #tbl_len = len(tbl)\n",
- " \n",
- " res = chi2_contingency(\n",
- " tbl\n",
- " )\n",
- " \n",
- " return res\n",
- " \n",
- " def run_report_bld(\n",
- " self,\n",
- " alpha: float,\n",
- " res: chi2_contingency,\n",
- " tbl: List[int],\n",
- " process: str,\n",
- " group_variable: str,\n",
- " group_target_val: str,\n",
- " group_other_val: str,\n",
- " bin_edges: List[float],\n",
- " bin_labels: List[str]\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Runs report for statistical testing\n",
- " chi2_contingency results\n",
- " \n",
- " :param alpha:\n",
- " float, alpha value for significance evaluation.\n",
- " :param res:\n",
- " chi2_contingency, result of the chi2_contingency.\n",
- " :param tbl:\n",
- " List[int], the contingency table.\n",
- " :param process: \n",
- " str, the name of the business process\n",
- " being tested, e.g. 'hiring'.\n",
- " :param group_variable:\n",
- " str, column name of the\n",
- " target variable.\n",
- " :param group_target_val:\n",
- " str, class target value of the group_variable\n",
- " aka the protected class value.\n",
- " :param group_other_val:\n",
- " str, class nontarget value of the group_variable\n",
- " aka the nonprotected class value. \n",
- " :param bin_edges:\n",
- " List[float], edges for phi\n",
- " bins.\n",
- " :param bin_labels:\n",
- " List[str], labels for the phi\n",
- " bins.\n",
- " :return df:\n",
- " DataFrame, target\n",
- " \"\"\"\n",
- " \n",
- " pvalue = res[1]\n",
- " \n",
- " df = pd.DataFrame()\n",
- "\n",
- " df = self._gen_significance_test(\n",
- " df=df,\n",
- " pvalue=pvalue,\n",
- " alpha=alpha\n",
- " )\n",
- " \n",
- " (\n",
- " df,\n",
- " A,\n",
- " B,\n",
- " C,\n",
- " D,\n",
- " total_target_grp,\n",
- " total_non_target_grp,\n",
- " diagonals,\n",
- " percent_target_succ,\n",
- " percent_non_target_succ,\n",
- " phi_numerator,\n",
- " phi_denominator\n",
- " ) = self._gen_table_calcs(\n",
- " df=df,\n",
- " tbl=tbl,\n",
- " )\n",
- " \n",
- " if res[1] <= alpha:\n",
- " df, phi_result = self._gen_phi_coefficient(\n",
- " df=df,\n",
- " tbl=tbl,\n",
- " bin_edges=bin_edges,\n",
- " bin_labels=bin_labels,\n",
- " process=process,\n",
- " group_variable=group_variable,\n",
- " group_target_val=group_target_val,\n",
- " group_other_val=group_other_val,\n",
- " diagonals=diagonals,\n",
- " numerator=phi_numerator,\n",
- " denominator=phi_denominator,\n",
- " percent_target_succ=percent_non_target_succ,\n",
- " percent_non_target_succ=percent_non_target_succ,\n",
- " )\n",
- " \n",
- " else:\n",
- " df['phi_corr_coeff'] = np.nan\n",
- " df['phi_bins'] = np.nan\n",
- " \n",
- " phi_result = \"\"\n",
- " \n",
- " df = self._gen_four_fifths_test(\n",
- " df,\n",
- " percent_target_succ=percent_non_target_succ,\n",
- " percent_non_target_succ=percent_non_target_succ\n",
- " )\n",
- " \n",
- " df = self._gen_outcome_meta(\n",
- " df,\n",
- " round(res[1],3),\n",
- " phi_result\n",
- " )\n",
- " \n",
- " df = self._gen_unpack_stats(\n",
- " df,\n",
- " res\n",
- " )\n",
- " \n",
- " return df\n",
- " \n",
- " def _gen_unpack_stats(\n",
- " self,\n",
- " df: DataFrame,\n",
- " res: chi2_contingency\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Method to unpack test stats from\n",
- " chi2_contingency results.\n",
- " \n",
- " :param df:\n",
- " DataFrame, output df.\n",
- " :param res:\n",
- " chi2_contingency, results array.\n",
- " :return df:\n",
- " DataFrame, output df.\n",
- " \"\"\"\n",
- " \n",
- " group_target_val = self.group_target_val\n",
- " group_other_val = self.group_other_val\n",
- " rows = [group_target_val] + [group_other_val]\n",
- " \n",
- " df['statistic'] = res[0]\n",
- " df['pvalue'] = res[1]\n",
- " df['dof'] = res[2]\n",
- " df['tbl_row'] = [rows]\n",
- " df['tbl'] = [tbl]\n",
- " df['expected_freq'] = [res[3]]\n",
- " df['tbl_expected_diff'] = [tbl - res[3]]\n",
- " \n",
- " return df\n",
- " \n",
- " def _gen_significance_test(\n",
- " self,\n",
- " df: DataFrame,\n",
- " pvalue: float,\n",
- " alpha: float\n",
- " ):\n",
- " \"\"\"\n",
- " Method to report on test significance.\n",
- " \n",
- " :param df:\n",
- " DataFrame, results df.\n",
- " :param pval:\n",
- " int, pvalue.\n",
- " :param alpha:\n",
- " float, the alpha value for testing eval.\n",
- " :return df:\n",
- " DataFrame with metadata added. \n",
- " \"\"\"\n",
- " \n",
- " if pvalue <= alpha:\n",
- " val = 'statistically significant result'\n",
- " \n",
- " else:\n",
- " val = 'no statistically significant result'\n",
- " \n",
- " df['test_result'] = [val]\n",
- " \n",
- " return df\n",
- " \n",
- " def _gen_phi_coefficient(\n",
- " self,\n",
- " df: DataFrame,\n",
- " tbl: List[int],\n",
- " process: str,\n",
- " group_variable: str,\n",
- " group_target_val: str,\n",
- " group_other_val: str,\n",
- " bin_edges: List[float],\n",
- " bin_labels: List[str],\n",
- " diagonals: List[float],\n",
- " numerator: float,\n",
- " denominator: float,\n",
- " percent_target_succ: float,\n",
- " percent_non_target_succ: float,\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Method to generate the phi coefficient.\n",
- " \n",
- " :param df:\n",
- " DataFrame, the results df.\n",
- " :param tbl:\n",
- " List[int], the 2x2 cont table.\n",
- " :param process: \n",
- " str, the name of the business process\n",
- " being tested, e.g. 'hiring'.\n",
- " :param group_variable:\n",
- " str, column name of the\n",
- " target variable.\n",
- " :param group_target_val:\n",
- " str, class target value of the group_variable\n",
- " aka the protected class value.\n",
- " :param group_other_val:\n",
- " str, class nontarget value of the group_variable\n",
- " aka the nonprotected class value. \n",
- " :param bin_edges:\n",
- " List[float], edges for phi\n",
- " bins.\n",
- " :param bin_labels:\n",
- " List[str], lab\n",
- " :return df:\n",
- " DataFrame, output df.\n",
- " \"\"\"\n",
- " phi = numerator / denominator if denominator != 0 else 0\n",
- "\n",
- " df['phi_corr_coeff'] = phi\n",
- " \n",
- " df = self._gen_prep_phi_bins(\n",
- " df=df,\n",
- " bin_edges=bin_edges,\n",
- " bin_labels=bin_labels\n",
- " )\n",
- "\n",
- " df, phi_result = self._gen_prep_diagonals(\n",
- " df=df,\n",
- " diagonals=diagonals,\n",
- " process=process,\n",
- " group_variable=group_variable,\n",
- " group_other_val=group_other_val,\n",
- " group_target_val=group_target_val,\n",
- " percent_non_target_succ=percent_non_target_succ,\n",
- " percent_target_succ=percent_target_succ,\n",
- " )\n",
- " \n",
- " return df, phi_result\n",
- " \n",
- " def _gen_table_calcs(\n",
- " self,\n",
- " df: DataFrame,\n",
- " tbl: List[int]\n",
- " ) -> Tuple[\n",
- " DataFrame, float, float, float, float,\n",
- " float, float, float, float, \n",
- " float, float, float\n",
- " ]:\n",
- " \n",
- " \"\"\"\n",
- " Method to generate phi bins. Provides additional\n",
- " explainability on the magnitude of association, when \n",
- " an association is found.\n",
- " \n",
- " :param df:\n",
- " DataFrame, output df.\n",
- " :param tbl:\n",
- " List[int], 2x2 contingency.\n",
- " :return [\n",
- " df, A, B, C, D, total_target_grp,\n",
- " total_non_target_grp, diagonals,\n",
- " percent_target_succ, percent_non_target_succ,\n",
- " phi_numerator, phi_denominator\n",
- " ]:\n",
- " Tuple[DataFrame, float, float, float, float,\n",
- " float, float, float, float, \n",
- " float, float, float\n",
- " ]\n",
- " \"\"\"\n",
- " \n",
- " # females, males; no succ, succ\n",
- " A, B = tbl[0] \n",
- " C, D = tbl[1]\n",
- " \n",
- " total_target_grp = A + B\n",
- " total_non_target_grp = C + D\n",
- " diagonals = (A + D) > (B + C)\n",
- " percent_target_succ = (B / total_target_grp) * 100\n",
- " percent_non_target_succ = (D / total_non_target_grp) * 100\n",
- " phi_numerator = (A * D) - (B * C)\n",
- " phi_denominator = np.sqrt((A + B) * (C + D) * (A + C) * (B + D)) \n",
- " \n",
- " return (\n",
- " df,\n",
- " A,\n",
- " B,\n",
- " C,\n",
- " D,\n",
- " total_target_grp,\n",
- " total_non_target_grp,\n",
- " diagonals,\n",
- " percent_target_succ,\n",
- " percent_non_target_succ,\n",
- " phi_numerator,\n",
- " phi_denominator\n",
- " )\n",
- " \n",
- " def _gen_prep_phi_bins(\n",
- " self,\n",
- " df: DataFrame,\n",
- " bin_edges: List[float],\n",
- " bin_labels: List[str]\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Method to generate pandas bins for \n",
- " phi coeff.\n",
- " \n",
- " :param df:\n",
- " DataFrame, output df.\n",
- " :param bin_edges:\n",
- " List[float], edges for phi\n",
- " bins.\n",
- " :param bin_labels:\n",
- " List[str], labels for the phi\n",
- " bins.\n",
- " :return df:\n",
- " DataFrame, output df.\n",
- " \"\"\"\n",
- " \n",
- " df['phi_bins'] = pd.cut(\n",
- " df['phi_corr_coeff'], \n",
- " bins=bin_edges, \n",
- " labels=bin_labels, \n",
- " include_lowest=True\n",
- " )\n",
- " \n",
- " return df\n",
- " \n",
- " def _gen_four_fifths_test(\n",
- " self,\n",
- " df: DataFrame,\n",
- " percent_target_succ: float,\n",
- " percent_non_target_succ: float\n",
- " ) -> DataFrame:\n",
- " \n",
- " ratio = percent_target_succ / percent_non_target_succ\n",
- " \n",
- " if ratio < .8:\n",
- " ratio_desc = f'failed with 4/5 test at {round(ratio,3)}'\n",
- " elif ratio >= .8:\n",
- " ratio_desc = f'passed with 4/5 test at {round(ratio,3)}'\n",
- " else:\n",
- " ratio_desc = 'error calculating 4/5 test'\n",
- " \n",
- " df['four_fifths_test'] = ratio_desc\n",
- " return df\n",
- " \n",
- " def _gen_prep_diagonals(\n",
- " self,\n",
- " df: DataFrame,\n",
- " diagonals: bool,\n",
- " process: str,\n",
- " group_variable: str,\n",
- " group_other_val: str,\n",
- " group_target_val: str,\n",
- " percent_non_target_succ: float,\n",
- " percent_target_succ: float,\n",
- " ) -> Tuple[DataFrame, str]:\n",
- " \n",
- " \"\"\"\n",
- " Method to generate the magnitude of the\n",
- " assocation using phi coefficient analysis.\n",
- " \n",
- " :param df:\n",
- " DataFrame, output df.\n",
- " :param diagonals:\n",
- " bool,\n",
- " :param process: \n",
- " str, the name of the business process\n",
- " being tested, e.g. 'hiring'.\n",
- " :param group_variable:\n",
- " str, column name of the\n",
- " target variable.\n",
- " :param group_target_val:\n",
- " str, class target value of the group_variable\n",
- " aka the protected class value.\n",
- " :param group_other_val:\n",
- " str, class nontarget value of the group_variable\n",
- " aka the nonprotected class value. \n",
- " :param percent_non_target_succ:\n",
- " float, the success percentage attained\n",
- " for the the non-target group.\n",
- " :param percent_target_succ:\n",
- " float, the success percentage attained for the\n",
- " target class.\n",
- " :return (df, phi_col):\n",
- " Tuple[df, phi_col]\n",
- " \"\"\"\n",
- " \n",
- " phi_bin = df['phi_bins'].values[0] \n",
- " phi_corr_coeff = df['phi_corr_coeff'].values[0] \n",
- "\n",
- " if diagonals:\n",
- " diagonal_msg = (\n",
- " f\"The values on the positive diagonal of the 'tbl' indicate the distribution of {process} success across {group_variable} categories.\"\n",
- " f\" {group_other_val} had a higher proportion of successful outcomes compared to {group_target_val}.\"\n",
- " f\" Specifically, {percent_non_target_succ:.1f}% of {group_other_val} had success while only {percent_target_succ:.1f}%\"\n",
- " f\" of {group_target_val} had success.\"\n",
- " f\" This significant difference in {process} success rates suggests a potential {group_variable} bias, with {group_other_val} success in {process}\"\n",
- " f\" at a higher rate than {group_target_val}.\"\n",
- " )\n",
- " \n",
- " else:\n",
- " diagonal_msg = \"the diagonal values are not substantially higher, suggesting the relationship might be more nuanced.\"\n",
- " \n",
- " phi_col = f\"The phi correlation coefficient is {phi_corr_coeff:.3f}, indicating a {phi_bin} effect size. {diagonal_msg}\"\n",
- " \n",
- " return df, phi_col\n",
- " \n",
- " def _gen_outcome_meta(\n",
- " self,\n",
- " df: DataFrame,\n",
- " pval: float,\n",
- " phi_result: str\n",
- " ) -> DataFrame:\n",
- " \n",
- " \"\"\"\n",
- " Method to generate meta data for \n",
- " reporting dataframe\n",
- " \n",
- " :param df:\n",
- " DataFrame, results df\n",
- " :param pval:\n",
- " int, pvalue\n",
- " :param phi_result:\n",
- " str, result of phi testing.\n",
- " :return df:\n",
- " DataFrame with metadata added\n",
- " \"\"\"\n",
- " \n",
- " grpers = self.grpers\n",
- " result = df['test_result'].values[0]\n",
- " phi_col = df['phi_corr_coeff'].values[0]\n",
- " testing = self.testing\n",
- " process = self.process\n",
- " group_target_val = self.group_target_val\n",
- " alpha = self.alpha\n",
- " four_fifths = df['four_fifths_test'].values[0]\n",
- " \n",
- " col = f\"Testing for {grpers}, {four_fifths}. Based on the results of the chi-square test of independence, there is {result} for {testing}-based {process} discrimination against {group_target_val} at the chosen significance level of {alpha}.\"\n",
- "\n",
- " if result == \"statistically significant result\":\n",
- " col = f\"{col} {phi_result}\"\n",
- " \n",
- " df['result_desc'] = col\n",
- " \n",
- " return df\n",
- " \n",
- "# pipeline\n",
- "\n",
- "ingestObj = Ingest(config)\n",
- "df = ingestObj.run()\n",
- "\n",
- "transObj = Transform(\n",
- " df.copy()\n",
- ")\n",
- "tbl = transObj.run_build_cont_table()\n",
- "\n",
- "statsObj = StatsTesting2x2Cont(\n",
- " config,\n",
- " tbl,\n",
- " df.copy() # need to add some more context in plain text\n",
- ")\n",
- "df_result = statsObj.run_testing()\n",
- "\n",
- "df_result['result_desc'].tolist()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "d303e1d1-c69a-4b59-9489-14574000bd55",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "import yaml\n",
- "import model\n",
- "\n",
- "with open('config.yaml') as f:\n",
- " config = yaml.safe_load(f)\n",
- " \n",
- "model = model.Model(config)\n",
- "\n",
- "df_prep, tbl = model.prep()\n",
- "\n",
- "df_result = model.analysis(df_prep.copy(), tbl)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "cebbe841-c185-443d-ac02-7ccd8c50e005",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " test_result | \n",
- " phi_corr_coeff | \n",
- " phi_bins | \n",
- " four_fifths_test | \n",
- " result_desc | \n",
- " statistic | \n",
- " pvalue | \n",
- " dof | \n",
- " tbl_rows | \n",
- " tbl_cols | \n",
- " tbl | \n",
- " expected_freq | \n",
- " tbl_expected_diff | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " statistically significant result | \n",
- " 0.39736 | \n",
- " moderate | \n",
- " passed with 4/5 test at 1.0 | \n",
- " Testing for {'job_title': 'analyst'}, passed w... | \n",
- " 5.218246 | \n",
- " 0.022351 | \n",
- " 1 | \n",
- " [Female, Male] | \n",
- " [hired, not_hired] | \n",
- " [[10, 1], [15, 18]] | \n",
- " [[6.25, 4.75], [18.75, 14.25]] | \n",
- " [[3.75, -3.75], [-3.75, 3.75]] | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " test_result phi_corr_coeff phi_bins \n",
- "0 statistically significant result 0.39736 moderate \\\n",
- "\n",
- " four_fifths_test \n",
- "0 passed with 4/5 test at 1.0 \\\n",
- "\n",
- " result_desc statistic pvalue \n",
- "0 Testing for {'job_title': 'analyst'}, passed w... 5.218246 0.022351 \\\n",
- "\n",
- " dof tbl_rows tbl_cols tbl \n",
- "0 1 [Female, Male] [hired, not_hired] [[10, 1], [15, 18]] \\\n",
- "\n",
- " expected_freq tbl_expected_diff \n",
- "0 [[6.25, 4.75], [18.75, 14.25]] [[3.75, -3.75], [-3.75, 3.75]] "
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_result"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "01672735-2ad5-42ae-9488-1962f3d0e63e",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " test_result | \n",
- " phi_corr_coeff | \n",
- " phi_bins | \n",
- " four_fifths_test | \n",
- " result_desc | \n",
- " statistic | \n",
- " pvalue | \n",
- " dof | \n",
- " tbl_row | \n",
- " tbl | \n",
- " expected_freq | \n",
- " tbl_expected_diff | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " statistically significant result | \n",
- " 0.39736 | \n",
- " moderate | \n",
- " failed 4/5 test at 0.167 | \n",
- " Testing for {'job_title': 'analyst'}, based on... | \n",
- " 5.218246 | \n",
- " 0.022351 | \n",
- " 1 | \n",
- " [Female, Male] | \n",
- " [[10, 1], [15, 18]] | \n",
- " [[6.25, 4.75], [18.75, 14.25]] | \n",
- " [[3.75, -3.75], [-3.75, 3.75]] | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " test_result phi_corr_coeff phi_bins \n",
- "0 statistically significant result 0.39736 moderate \\\n",
- "\n",
- " four_fifths_test \n",
- "0 failed 4/5 test at 0.167 \\\n",
- "\n",
- " result_desc statistic pvalue \n",
- "0 Testing for {'job_title': 'analyst'}, based on... 5.218246 0.022351 \\\n",
- "\n",
- " dof tbl_row tbl expected_freq \n",
- "0 1 [Female, Male] [[10, 1], [15, 18]] [[6.25, 4.75], [18.75, 14.25]] \\\n",
- "\n",
- " tbl_expected_diff \n",
- "0 [[3.75, -3.75], [-3.75, 3.75]] "
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_result"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "94114f77-caa8-41a3-900a-44317c84f4b7",
- "metadata": {},
- "source": [
- "to do:\n",
- " \n",
- "implement these tests\n",
- "\n",
- "\n",
- "https://en.wikipedia.org/wiki/Disparate_impact\n",
- "\n",
- "Add handler for filtered size of group must be ...\n",
- "\n",
- "# need to check this size\n",
- "# https://online.stat.psu.edu/stat500/lesson/8/8.2#:~:text=That%20equates%20to%20the%20Chi,count%20of%20at%20least%205.\n",
- "\n",
- "# make sure at least 5 in each slice, then at least 50"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1d975b96-c8af-4e46-9521-0c7fbe442ff1",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "hrailabs_dev",
- "language": "python",
- "name": "hrailabs"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}