diff --git a/.ipynb_checkpoints/env-checkpoint.yaml b/.ipynb_checkpoints/env-checkpoint.yaml new file mode 100644 index 0000000..133e4f0 --- /dev/null +++ b/.ipynb_checkpoints/env-checkpoint.yaml @@ -0,0 +1,130 @@ +name: equity_toolkit +channels: + - defaults +dependencies: + - bzip2=1.0.8=he774522_0 + - ca-certificates=2023.01.10=haa95532_0 + - certifi=2022.12.7=py311haa95532_0 + - libffi=3.4.2=hd77b12b_6 + - openssl=1.1.1t=h2bbff1b_0 + - pip=23.0.1=py311haa95532_0 + - python=3.11.0=h966fe2a_3 + - setuptools=65.6.3=py311haa95532_0 + - sqlite=3.41.1=h2bbff1b_0 + - tk=8.6.12=h2bbff1b_0 + - vc=14.2=h21ff451_1 + - vs2015_runtime=14.27.29016=h5e58377_2 + - wheel=0.38.4=py311haa95532_0 + - wincertstore=0.2=py311haa95532_0 + - xz=5.2.10=h8cc25b3_1 + - zlib=1.2.13=h8cc25b3_0 + - pip: + - aiofiles==22.1.0 + - aiosqlite==0.18.0 + - anyio==3.6.2 + - argon2-cffi==21.3.0 + - argon2-cffi-bindings==21.2.0 + - arrow==1.2.3 + - asttokens==2.2.1 + - attrs==22.2.0 + - babel==2.12.1 + - backcall==0.2.0 + - beautifulsoup4==4.12.2 + - bleach==6.0.0 + - cffi==1.15.1 + - charset-normalizer==3.1.0 + - click==8.1.3 + - colorama==0.4.6 + - comm==0.1.3 + - dash==2.9.2 + - dash-bootstrap-components==1.6.0 + - dash-core-components==2.0.0 + - dash-html-components==2.0.0 + - dash-table==5.0.0 + - debugpy==1.6.7 + - decorator==5.1.1 + - defusedxml==0.7.1 + - executing==1.2.0 + - fastjsonschema==2.16.3 + - flask==2.2.3 + - fqdn==1.5.1 + - idna==3.4 + - ipykernel==6.22.0 + - ipython==8.12.0 + - ipython-genutils==0.2.0 + - isoduration==20.11.0 + - itsdangerous==2.1.2 + - jedi==0.18.2 + - jinja2==3.1.2 + - json5==0.9.11 + - jsonpointer==2.3 + - jsonschema==4.17.3 + - jupyter-client==8.1.0 + - jupyter-core==5.3.0 + - jupyter-events==0.6.3 + - jupyter-server==2.5.0 + - jupyter-server-fileid==0.8.0 + - jupyter-server-terminals==0.4.4 + - jupyter-server-ydoc==0.8.0 + - jupyter-ydoc==0.2.3 + - jupyterlab==3.6.3 + - jupyterlab-pygments==0.2.2 + - jupyterlab-server==2.22.0 + - markupsafe==2.1.2 + - mistune==2.0.5 + - mypy==1.11.2 + - mypy-extensions==1.0.0 + - nbclassic==0.5.5 + - nbclient==0.7.3 + - nbconvert==7.3.0 + - nbformat==5.8.0 + - nest-asyncio==1.5.6 + - notebook==6.5.4 + - notebook-shim==0.2.2 + - numpy==1.24.2 + - packaging==23.0 + - pandas==2.0.0 + - pandocfilters==1.5.0 + - parso==0.8.3 + - pickleshare==0.7.5 + - platformdirs==3.2.0 + - plotly==5.14.1 + - prometheus-client==0.16.0 + - prompt-toolkit==3.0.38 + - psutil==5.9.4 + - pure-eval==0.2.2 + - pycparser==2.21 + - pygments==2.14.0 + - pyrsistent==0.19.3 + - python-dateutil==2.8.2 + - python-json-logger==2.0.7 + - pytz==2023.3 + - pywin32==306 + - pywinpty==2.0.10 + - pyyaml==6.0 + - pyzmq==25.0.2 + - requests==2.28.2 + - rfc3339-validator==0.1.4 + - rfc3986-validator==0.1.1 + - scipy==1.10.1 + - send2trash==1.8.0 + - six==1.16.0 + - sniffio==1.3.0 + - soupsieve==2.4 + - stack-data==0.6.2 + - tenacity==8.2.2 + - terminado==0.17.1 + - tinycss2==1.2.1 + - tornado==6.2 + - traitlets==5.9.0 + - typing-extensions==4.12.2 + - tzdata==2023.3 + - uri-template==1.2.0 + - urllib3==1.26.15 + - wcwidth==0.2.6 + - webcolors==1.13 + - webencodings==0.5.1 + - websocket-client==1.5.1 + - werkzeug==2.2.3 + - y-py==0.5.9 + - ypy-websocket==0.8.2 \ No newline at end of file diff --git a/.ipynb_checkpoints/model-checkpoint.py b/.ipynb_checkpoints/model-checkpoint.py index 2302456..3956f79 100644 --- a/.ipynb_checkpoints/model-checkpoint.py +++ b/.ipynb_checkpoints/model-checkpoint.py @@ -4,11 +4,15 @@ from typing import List from typing import Tuple +import os import yaml import src.model_classes as mc -with open('config.yaml') as f: +package_dir = os.path.dirname(os.path.abspath(__file__)) +config_fp = os.path.join(package_dir, "config.yaml") + +with open(config_fp) as f: config = yaml.safe_load(f) class Model: diff --git a/.ipynb_checkpoints/toolkit-checkpoint.ipynb b/.ipynb_checkpoints/toolkit-checkpoint.ipynb deleted file mode 100644 index b307bda..0000000 --- a/.ipynb_checkpoints/toolkit-checkpoint.ipynb +++ /dev/null @@ -1,1181 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 14, - "id": "ab695a7b-3043-4154-a59b-01e57feaf8f0", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "ename": "IndentationError", - "evalue": "unexpected indent (3060257492.py, line 620)", - "output_type": "error", - "traceback": [ - "\u001b[1;36m Cell \u001b[1;32mIn[14], line 620\u001b[1;36m\u001b[0m\n\u001b[1;33m self.A = A\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mIndentationError\u001b[0m\u001b[1;31m:\u001b[0m unexpected indent\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "from scipy.stats import chi2_contingency\n", - "import yaml\n", - "\n", - "from pandas import DataFrame\n", - "from typing import Dict\n", - "from typing import List\n", - "from typing import Any\n", - "from typing import Tuple\n", - "\n", - "with open('config.yaml') as f:\n", - " config = yaml.safe_load(f)\n", - "\n", - "class Ingest:\n", - " \n", - " \"\"\"\n", - " Class to ingest dataframe input.\n", - " \"\"\"\n", - " \n", - " def __init__(\n", - " self,\n", - " config: Dict[Any, Any]\n", - " ) -> None:\n", - " \n", - " \"\"\"\n", - " Inits class with the config file\n", - " and unpacks the config file.\n", - " \"\"\"\n", - " \n", - " self.config = config\n", - " \n", - " self.unpack_config()\n", - "\n", - " \n", - " def run(\n", - " self\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Run function for the class.\n", - " \n", - " :param None:\n", - " :return df:\n", - " DataFrame, ingested df\n", - " \"\"\"\n", - " \n", - " df = self.run_load()\n", - " \n", - " df = self.run_harmonize(df)\n", - " \n", - " return df\n", - " \n", - " \n", - " def unpack_config(\n", - " self\n", - " ) -> None:\n", - " \n", - " \"\"\"\n", - " Function to unpack config vars.\n", - " \n", - " :var filepath:\n", - " str, the relative filepath \n", - " :var group_variable:\n", - " str, the column name for the \n", - " group variable of interest e.g.\n", - " gender, which contains the target \n", - " class and non-target class e.g.\n", - " females and males.\n", - " :var group_target_val:\n", - " str, within the group_variable column,\n", - " contains the contains the target \n", - " class value e.g.\n", - " females.\n", - " :var group_other_val:\n", - " str, within the group_variable column,\n", - " contains the contains the non-target \n", - " class value e.g. males.\n", - " :var outcome_variable:\n", - " str, the column name for the \n", - " outcome variable of interest e.g.\n", - " hired, which contains the target \n", - " class and non-target class e.g.\n", - " hired and not-hired.\n", - " :var outcome_target_val:\n", - " str, within the outcome_variable column,\n", - " contains the contains the target \n", - " class value e.g.\n", - " hired.\n", - " :var outcome_other_val:\n", - " str, within the outcome_variable column,\n", - " contains the contains the non-target \n", - " class value e.g. not-hired.\n", - " :var grpers:\n", - " Dict[str,str], can be any set of filterable\n", - " columns to slice into particular groups within\n", - " the broader employee roster. The key is the column,\n", - " the value is the desired class within the column\n", - " e.g. job_title: analyst.\n", - " \"\"\"\n", - " \n", - " config = self.config\n", - " \n", - " try:\n", - " self.filepath: str = config[\"Ingest\"][\"filepath\"]\n", - " self.group_variable: str = config[\"Ingest\"][\"group_variable\"]\n", - " self.group_target_val: str = config[\"Ingest\"][\"group_target_val\"]\n", - " self.group_other_val: str = config[\"Ingest\"][\"group_other_val\"]\n", - " self.outcome_variable: str = config[\"Ingest\"][\"outcome_variable\"]\n", - " self.outcome_target_val: str = config[\"Ingest\"][\"outcome_target_val\"]\n", - " self.outcome_other_val: str = config[\"Ingest\"][\"outcome_other_val\"]\n", - " self.grpers: Dict[str, str] = config[\"Ingest\"][\"grpers\"]\n", - "\n", - " # Type validation\n", - " if not isinstance(self.filepath, str):\n", - " raise TypeError(\"Expected 'filepath' to be of type 'str'.\")\n", - " if not isinstance(self.group_variable, str):\n", - " raise TypeError(\"Expected 'group_variable' to be of type 'str'.\")\n", - " if not isinstance(self.group_target_val, str):\n", - " raise TypeError(\"Expected 'group_target_val' to be of type 'str'.\")\n", - " if not isinstance(self.group_other_val, str):\n", - " raise TypeError(\"Expected 'group_other_val' to be of type 'str'.\")\n", - " if not isinstance(self.outcome_variable, str):\n", - " raise TypeError(\"Expected 'outcome_variable' to be of type 'str'.\")\n", - " if not isinstance(self.outcome_target_val, str):\n", - " raise TypeError(\"Expected 'outcome_target_val' to be of type 'str'.\")\n", - " if not isinstance(self.outcome_other_val, str):\n", - " raise TypeError(\"Expected 'outcome_other_val' to be of type 'str'.\")\n", - " if not isinstance(self.grpers, dict):\n", - " raise TypeError(\"Expected 'grpers' to be of type 'dict'.\")\n", - "\n", - " except KeyError as e:\n", - " raise KeyError(f\"Missing key '{e.args[0]}' in the config file. \"\n", - " \"Please ensure the config file contains all required keys under the 'Ingest' section: \"\n", - " \"'filepath', 'group_variable', 'group_target_val', 'group_other_val', \"\n", - " \"'outcome_variable', 'outcome_target_val', 'outcome_other_val', and 'grpers'.\")\n", - "\n", - " except TypeError as e:\n", - " raise TypeError(f\"Config file error: {e}\")\n", - " \n", - " def run_load(\n", - " self\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Loads csv file. Assumes headers are row 0.\n", - " \n", - " :param None:\n", - " :return DataFrame:\n", - " \"\"\"\n", - " \n", - " filepath = self.filepath\n", - " \n", - " try:\n", - " return pd.read_csv(filepath, skiprows=0)\n", - "\n", - " except FileNotFoundError:\n", - " raise FileNotFoundError(\n", - " f\"The file at {filepath} was not found. Please check the file path.\"\n", - " )\n", - "\n", - " except pd.errors.EmptyDataError:\n", - " raise ValueError(\n", - " f\"The file at {filepath} is empty and cannot be loaded.\"\n", - " )\n", - "\n", - " except pd.errors.ParserError:\n", - " raise ValueError(\n", - " f\"The file at {filepath} contains malformed data and could not be parsed as a valid CSV.\"\n", - " )\n", - "\n", - " except PermissionError:\n", - " raise PermissionError(\n", - " f\"Permission denied when attempting to read the file at {filepath}.\"\n", - " f\"Please check the file permissions.\"\n", - " )\n", - "\n", - " except Exception as e:\n", - " raise Exception(\n", - " f\"An unexpected error occurred while loading the file: {str(e)}\"\n", - " )\n", - " \n", - " def run_harmonize(\n", - " self,\n", - " df: DataFrame\n", - " ) -> DataFrame: \n", - "\n", - " \"\"\"\n", - " Function to harmonize the dataset.\n", - " \n", - " :param df: \n", - " DataFrame, loaded df\n", - " :return df:\n", - " DataFrame, filtered down to target and other group and\n", - " harmonize the fields\n", - " \"\"\"\n", - " \n", - " group_variable = self.group_variable\n", - " group_target_val = self.group_target_val\n", - " group_other_val = self.group_other_val\n", - " outcome_variable = self.outcome_variable\n", - " outcome_target_val = self.outcome_target_val\n", - " outcome_other_val = self.outcome_other_val\n", - " grpers = self.grpers\n", - "\n", - " df = self._apply_filters(\n", - " df=df,\n", - " group_variable=group_variable,\n", - " group_target_val=group_target_val,\n", - " group_other_val=group_other_val,\n", - " grpers=grpers\n", - " )\n", - " \n", - " df = self._apply_harmonize(\n", - " df=df,\n", - " group_variable=group_variable,\n", - " group_target_val=group_target_val,\n", - " group_other_val=group_other_val,\n", - " outcome_variable=outcome_variable,\n", - " outcome_target_val=outcome_target_val,\n", - " outcome_other_val=outcome_other_val\n", - " )\n", - " \n", - " return df\n", - " \n", - " def _apply_filters(\n", - " self,\n", - " df: DataFrame,\n", - " group_variable: str,\n", - " group_target_val: str,\n", - " group_other_val: str,\n", - " grpers: Dict[str,str],\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Method to apply filters\n", - " \n", - " :param df:\n", - " DataFrame, target df\n", - " :param group_variable:\n", - " str, column name of the\n", - " target variable\n", - " :param group_target_val:\n", - " str, class target value of the group_variable\n", - " aka the protected class value\n", - " :param group_other_val:\n", - " str, class nontarget value of the group_variable\n", - " aka the nonprotected class value\n", - " :return df:\n", - " DataFrame, filtered df\n", - " \"\"\"\n", - " \n", - " df = df.loc[\n", - " df[group_variable].isin(\n", - " [\n", - " group_target_val, \n", - " group_other_val\n", - " ]\n", - " )\n", - " ]\n", - " \n", - " for k, v in grpers.items():\n", - " \n", - " df = df.loc[\n", - " df[k].isin([v])\n", - " ] \n", - " \n", - " return df\n", - " \n", - " def _apply_harmonize(\n", - " self,\n", - " df: DataFrame,\n", - " group_variable: str,\n", - " group_target_val: str,\n", - " group_other_val: str,\n", - " outcome_variable: str,\n", - " outcome_target_val: str,\n", - " outcome_other_val: str\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Method to harmonize targets.\n", - " \n", - " :param df:\n", - " DataFrame, target df\n", - " :param group_variable:\n", - " str, column name of the\n", - " target variable\n", - " :param group_target_val:\n", - " str, class target value of the group_variable\n", - " aka the protected class value\n", - " :param group_other_val:\n", - " str, class nontarget value of the group_variable\n", - " aka the nonprotected class value \n", - " :param outcome_variable:\n", - " str, the column name of the outcome \n", - " :param outcome_target_val:\n", - " str, class target value of the outcome_variable\n", - " aka success\n", - " :param outcome_other_val:\n", - " str, class nontarget value of the outcome_variable\n", - " :return df:\n", - " DataFrame, target df\n", - " \"\"\"\n", - " \n", - " # harmonize the group target\n", - " df['group_var_clean'] = np.where(\n", - " df[group_variable]==group_target_val, \n", - " 1,\n", - " np.where(\n", - " df[group_variable]==group_other_val, \n", - " 0, \n", - " -1\n", - " )\n", - " )\n", - " \n", - " # harmonize the outcome target\n", - " df['outcome_var_clean'] = np.where(\n", - " df[outcome_variable]==outcome_target_val, \n", - " 1, \n", - " np.where(\n", - " df[self.outcome_variable]==outcome_other_val,\n", - " 0, \n", - " -1\n", - " )\n", - " ) \n", - " \n", - " return df\n", - " \n", - "class Transform:\n", - " \n", - " \"\"\"\n", - " Class to transform dataframe inputs into \n", - " 2x2 contingency table.\n", - " \"\"\"\n", - " \n", - " def __init__(\n", - " self, \n", - " df: DataFrame\n", - " ) -> None:\n", - " \n", - " \"\"\"\n", - " :param df:\n", - " DataFrame, input df\n", - " \"\"\"\n", - " \n", - " self.df = df\n", - " \n", - " def run_build_cont_table(\n", - " self\n", - " ) -> List[int]:\n", - " \n", - " \"\"\"\n", - " Function to generate contingency table format.\n", - " \n", - " Places the target group val in the top row and the\n", - " target group other to the bottom row.\n", - " \n", - " Places no-success outcome on the first column and success\n", - " on the second column.\n", - " \n", - " :return tbl:\n", - " List[int], filtered down to target and other group.\n", - " \"\"\"\n", - " \n", - " df = self.df\n", - " \n", - " cols = [\n", - " 'group_var_clean', \n", - " 'outcome_var_clean'\n", - " ]\n", - " \n", - " df = df[cols]\n", - " \n", - " tbl = (\n", - " df.pivot_table(\n", - " index='group_var_clean',\n", - " columns='outcome_var_clean', \n", - " aggfunc=len\n", - " ).\n", - " sort_index(\n", - " axis=1, \n", - " ascending=True\n", - " ).\n", - " sort_index(ascending=False). # ensure always [1,0]\n", - " values.tolist()\n", - " ) \n", - " \n", - " return tbl\n", - " \n", - "class StatsTesting2x2Cont:\n", - " \n", - " \"\"\"\n", - " Class to perform 2x2 Contigency Table analysis\n", - " with Chi2 and Phi Correlation Coefficent Testing.\n", - "\n", - " Provides context into potential association between\n", - " variables and the strength of the association.\n", - " \"\"\"\n", - " \n", - " def __init__(\n", - " self,\n", - " config: Dict[Any, Any],\n", - " tbl: List[int],\n", - " df: DataFrame\n", - " ) -> None:\n", - " \n", - " \"\"\"\n", - " Inits the class variables and unpacks the\n", - " config variables.\n", - " \n", - " :param config:\n", - " Dict[str,Any], loaded config file.\n", - " :param tbl:\n", - " List[int], 2x2 cont table.\n", - " :param df:\n", - " DataFrame, original input DataFrame.\n", - " \"\"\"\n", - " \n", - " self.config = config\n", - " self.tbl = tbl\n", - " self.df = df\n", - "\n", - " self.unpack_config()\n", - "\n", - " def run_testing(\n", - " self\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Run function for the class.\n", - " \n", - " Runs hypothesis evaluation and builds\n", - " the output report DataFrame.\n", - " \n", - " :param None:\n", - " :return df_results:\n", - " DataFrame, with testing results.\n", - " \"\"\"\n", - " \n", - " alpha = self.alpha\n", - " tbl = self.tbl\n", - " process = self.process\n", - " group_variable = self.group_variable\n", - " group_target_val = self.group_target_val\n", - " group_other_val = self.group_other_val\n", - " bin_edges = self.bin_edges\n", - " bin_labels = self.bin_labels\n", - " \n", - " res = self.gen_hypothesis_eval(tbl)\n", - "\n", - " df_results = self.run_report_bld(\n", - " alpha=alpha,\n", - " res=res,\n", - " tbl=tbl,\n", - " process=process,\n", - " group_variable=group_variable,\n", - " group_target_val=group_target_val,\n", - " group_other_val=group_other_val,\n", - " bin_edges=bin_edges,\n", - " bin_labels=bin_labels\n", - " )\n", - " \n", - " return df_results\n", - " \n", - " def unpack_config(\n", - " self\n", - " ) -> None:\n", - " \n", - " \"\"\"\n", - " Function to unpack config variables.\n", - " \n", - " :param None:\n", - " :return None:\n", - " \"\"\"\n", - " \n", - " config = self.config\n", - "\n", - " try:\n", - " self.alpha: float = config[\"StatsTesting2x2Cont\"][\"alpha\"]\n", - " self.group_variable: str = config[\"Ingest\"][\"group_variable\"]\n", - " self.group_target_val: str = config[\"Ingest\"][\"group_target_val\"]\n", - " self.group_other_val: str = config[\"Ingest\"][\"group_other_val\"]\n", - " self.outcome_variable: str = config[\"Ingest\"][\"outcome_variable\"]\n", - " self.outcome_target_val: str = config[\"Ingest\"][\"outcome_target_val\"]\n", - " self.outcome_other_val: str = config[\"Ingest\"][\"outcome_other_val\"]\n", - " self.grpers: Dict[str, str] = config[\"Ingest\"][\"grpers\"]\n", - " self.testing: str = config[\"StatsTesting2x2Cont\"][\"testing\"]\n", - " self.process: str = config[\"StatsTesting2x2Cont\"][\"process\"]\n", - " self.bin_edges: List[float] = config[\"StatsTesting2x2Cont\"][\"phi_bin_edges\"]\n", - " self.bin_labels: List[str] = config[\"StatsTesting2x2Cont\"][\"phi_bin_labels\"]\n", - "\n", - " if not isinstance(self.alpha, float):\n", - " raise TypeError(\"Expected 'alpha' to be of type 'float'.\")\n", - " if not isinstance(self.group_variable, str):\n", - " raise TypeError(\"Expected 'group_variable' to be of type 'str'.\")\n", - " if not isinstance(self.group_target_val, str):\n", - " raise TypeError(\"Expected 'group_target_val' to be of type 'str'.\")\n", - " if not isinstance(self.group_other_val, str):\n", - " raise TypeError(\"Expected 'group_other_val' to be of type 'str'.\")\n", - " if not isinstance(self.outcome_variable, str):\n", - " raise TypeError(\"Expected 'outcome_variable' to be of type 'str'.\")\n", - " if not isinstance(self.outcome_target_val, str):\n", - " raise TypeError(\"Expected 'outcome_target_val' to be of type 'str'.\")\n", - " if not isinstance(self.outcome_other_val, str):\n", - " raise TypeError(\"Expected 'outcome_other_val' to be of type 'str'.\")\n", - " if not isinstance(self.grpers, dict):\n", - " raise TypeError(\"Expected 'grpers' to be of type 'dict'.\")\n", - " if not isinstance(self.testing, str):\n", - " raise TypeError(\"Expected 'testing' to be of type 'str'.\")\n", - " if not isinstance(self.process, str):\n", - " raise TypeError(\"Expected 'process' to be of type 'str'.\")\n", - " if not isinstance(\n", - " self.bin_edges, list\n", - " ) or not all(\n", - " isinstance(\n", - " i, (int, float)\n", - " ) for i in self.bin_edges\n", - " ):\n", - " raise TypeError(\"Expected 'bin_edges' to be a list of floats.\")\n", - " if not isinstance(\n", - " self.bin_labels, list\n", - " ) or not all(\n", - " isinstance(i, str) for i in self.bin_labels\n", - " ):\n", - " raise TypeError(\"Expected 'bin_labels' to be a list of strings.\")\n", - " \n", - " except KeyError as e:\n", - " raise KeyError(\n", - " f\"Missing key '{e.args[0]}' in the config file. \"\n", - " f\"Ensure all required keys are present in the 'Ingest' and 'StatsTesting2x2Cont' sections.\"\n", - " )\n", - "\n", - " except TypeError as e:\n", - " raise TypeError(f\"Config file error: {e}\")\n", - "\n", - " \n", - " def gen_hypothesis_eval(\n", - " self,\n", - " tbl: List[int]\n", - " ) -> chi2_contingency:\n", - " \n", - " \"\"\"\n", - " Function to generate the chi2_contigency\n", - " statistic and result.\n", - " \"\"\"\n", - " \n", - " #size = np.shape(tbl)\n", - " #tbl_len = len(tbl)\n", - " \n", - " res = chi2_contingency(\n", - " tbl\n", - " )\n", - " \n", - " return res\n", - " \n", - " def run_report_bld(\n", - " self,\n", - " alpha: float,\n", - " res: chi2_contingency,\n", - " tbl: List[int],\n", - " process: str,\n", - " group_variable: str,\n", - " group_target_val: str,\n", - " group_other_val: str,\n", - " bin_edges: List[float],\n", - " bin_labels: List[str]\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Runs report for statistical testing\n", - " chi2_contingency results\n", - " \n", - " :param alpha:\n", - " float, alpha value for significance evaluation.\n", - " :param res:\n", - " chi2_contingency, result of the chi2_contingency.\n", - " :param tbl:\n", - " List[int], the contingency table.\n", - " :param process: \n", - " str, the name of the business process\n", - " being tested, e.g. 'hiring'.\n", - " :param group_variable:\n", - " str, column name of the\n", - " target variable.\n", - " :param group_target_val:\n", - " str, class target value of the group_variable\n", - " aka the protected class value.\n", - " :param group_other_val:\n", - " str, class nontarget value of the group_variable\n", - " aka the nonprotected class value. \n", - " :param bin_edges:\n", - " List[float], edges for phi\n", - " bins.\n", - " :param bin_labels:\n", - " List[str], labels for the phi\n", - " bins.\n", - " :return df:\n", - " DataFrame, target\n", - " \"\"\"\n", - " \n", - " pvalue = res[1]\n", - " \n", - " df = pd.DataFrame()\n", - "\n", - " df = self._gen_significance_test(\n", - " df=df,\n", - " pvalue=pvalue,\n", - " alpha=alpha\n", - " )\n", - " \n", - " (\n", - " df,\n", - " A,\n", - " B,\n", - " C,\n", - " D,\n", - " total_target_grp,\n", - " total_non_target_grp,\n", - " diagonals,\n", - " percent_target_succ,\n", - " percent_non_target_succ,\n", - " phi_numerator,\n", - " phi_denominator\n", - " ) = self._gen_table_calcs(\n", - " df=df,\n", - " tbl=tbl,\n", - " )\n", - " \n", - " if res[1] <= alpha:\n", - " df, phi_result = self._gen_phi_coefficient(\n", - " df=df,\n", - " tbl=tbl,\n", - " bin_edges=bin_edges,\n", - " bin_labels=bin_labels,\n", - " process=process,\n", - " group_variable=group_variable,\n", - " group_target_val=group_target_val,\n", - " group_other_val=group_other_val\n", - " )\n", - " \n", - " else:\n", - " df['phi_corr_coeff'] = np.nan\n", - " df['phi_bins'] = np.nan\n", - " \n", - " phi_result = \"\"\n", - " \n", - " df = self._gen_four_fifths_test(df)\n", - " \n", - " df = self._gen_outcome_meta(\n", - " df,\n", - " round(res[1],3),\n", - " phi_result\n", - " )\n", - " \n", - " df = self._gen_unpack_stats(\n", - " df,\n", - " res\n", - " )\n", - " \n", - " return df\n", - " \n", - " def _gen_unpack_stats(\n", - " self,\n", - " df: DataFrame,\n", - " res: chi2_contingency\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Method to unpack test stats from\n", - " chi2_contingency results.\n", - " \n", - " :param df:\n", - " DataFrame, output df.\n", - " :param res:\n", - " chi2_contingency, results array.\n", - " :return df:\n", - " DataFrame, output df.\n", - " \"\"\"\n", - " \n", - " group_target_val = self.group_target_val\n", - " group_other_val = self.group_other_val\n", - " rows = [group_target_val] + [group_other_val]\n", - " \n", - " df['statistic'] = res[0]\n", - " df['pvalue'] = res[1]\n", - " df['dof'] = res[2]\n", - " df['tbl_row'] = [rows]\n", - " df['tbl'] = [tbl]\n", - " df['expected_freq'] = [res[3]]\n", - " df['tbl_expected_diff'] = [tbl - res[3]]\n", - " \n", - " return df\n", - " \n", - " def _gen_significance_test(\n", - " self,\n", - " df: DataFrame,\n", - " pvalue: float,\n", - " alpha: float\n", - " ):\n", - " \"\"\"\n", - " Method to report on test significance.\n", - " \n", - " :param df:\n", - " DataFrame, results df.\n", - " :param pval:\n", - " int, pvalue.\n", - " :param alpha:\n", - " float, the alpha value for testing eval.\n", - " :return df:\n", - " DataFrame with metadata added. \n", - " \"\"\"\n", - " \n", - " if pvalue <= alpha:\n", - " val = 'statistically significant result'\n", - " \n", - " else:\n", - " val = 'no statistically significant result'\n", - " \n", - " df['test_result'] = [val]\n", - " \n", - " return df\n", - " \n", - " def _gen_phi_coefficient(\n", - " self,\n", - " df: DataFrame,\n", - " tbl: List[int],\n", - " process: str,\n", - " group_variable: str,\n", - " group_target_val: str,\n", - " group_other_val: str,\n", - " bin_edges: List[float],\n", - " bin_labels: List[str]\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Method to generate the phi coefficient.\n", - " \n", - " :param df:\n", - " DataFrame, the results df.\n", - " :param tbl:\n", - " List[int], the 2x2 cont table.\n", - " :param process: \n", - " str, the name of the business process\n", - " being tested, e.g. 'hiring'.\n", - " :param group_variable:\n", - " str, column name of the\n", - " target variable.\n", - " :param group_target_val:\n", - " str, class target value of the group_variable\n", - " aka the protected class value.\n", - " :param group_other_val:\n", - " str, class nontarget value of the group_variable\n", - " aka the nonprotected class value. \n", - " :param bin_edges:\n", - " List[float], edges for phi\n", - " bins.\n", - " :param bin_labels:\n", - " List[str], lab\n", - " :return df:\n", - " DataFrame, output df.\n", - " \"\"\"\n", - " \n", - " diagonals = self.diagonals\n", - " numerator = self.phi_numerator\n", - " denominator = self.phi_denominator\n", - "\n", - " phi = numerator / denominator if denominator != 0 else 0\n", - "\n", - " df['phi_corr_coeff'] = phi\n", - " \n", - " df = self._gen_prep_phi_bins(\n", - " df=df,\n", - " bin_edges=bin_edges,\n", - " bin_labels=bin_labels\n", - " )\n", - "\n", - " df, phi_result = self._gen_prep_diagonals(\n", - " df=df,\n", - " diagonals=diagonals,\n", - " process=process,\n", - " group_variable=group_variable,\n", - " group_other_val=group_other_val,\n", - " group_target_val=group_target_val,\n", - " percent_non_target_succ=self.percent_non_target_succ,\n", - " percent_target_succ=self.percent_target_succ,\n", - " )\n", - " \n", - " return df, phi_result\n", - " \n", - " def _gen_table_calcs(\n", - " self,\n", - " df: DataFrame,\n", - " tbl: List[int]\n", - " ) -> Tuple[\n", - " DataFrame, float, float, float, float,\n", - " float, float, float, float, \n", - " float, float, float\n", - " ]:\n", - " \n", - " \"\"\"\n", - " Method to generate phi bins. Provides additional\n", - " explainability on the magnitude of association, when \n", - " an association is found.\n", - " \n", - " :param df:\n", - " DataFrame, output df.\n", - " :param tbl:\n", - " List[int], 2x2 contingency.\n", - " :return [\n", - " df, A, B, C, D, total_target_grp,\n", - " total_non_target_grp, diagonals,\n", - " percent_target_succ, percent_non_target_succ,\n", - " phi_numerator, phi_denominator\n", - " ]:\n", - " Tuple[DataFrame, float, float, float, float,\n", - " float, float, float, float, \n", - " float, float, float\n", - " ]\n", - " \"\"\"\n", - " \n", - " # females, males; no succ, succ\n", - " A, B = tbl[0] \n", - " C, D = tbl[1]\n", - " \n", - " total_target_grp = A + B\n", - " total_non_target_grp = C + D\n", - " diagonals = (A + D) > (B + C)\n", - " percent_target_succ = (B / total_target_grp) * 100\n", - " percent_non_target_succ = (D / total_non_target_grp) * 100\n", - " phi_numerator = (A * D) - (B * C)\n", - " phi_denominator = np.sqrt((A + B) * (C + D) * (A + C) * (B + D)) \n", - " \n", - " return (\n", - " df,\n", - " A,\n", - " B,\n", - " C,\n", - " D,\n", - " total_target_grp,\n", - " total_non_target_grp,\n", - " diagonals,\n", - " percent_target_succ,\n", - " percent_non_target_succ,\n", - " phi_numerator,\n", - " phi_denominator\n", - " )\n", - " \n", - " def _gen_prep_phi_bins(\n", - " self,\n", - " df: DataFrame,\n", - " bin_edges: List[float],\n", - " bin_labels: List[str]\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Method to generate pandas bins for \n", - " phi coeff.\n", - " \n", - " :param df:\n", - " DataFrame, output df.\n", - " :param bin_edges:\n", - " List[float], edges for phi\n", - " bins.\n", - " :param bin_labels:\n", - " List[str], labels for the phi\n", - " bins.\n", - " :return df:\n", - " DataFrame, output df.\n", - " \"\"\"\n", - " \n", - " df['phi_bins'] = pd.cut(\n", - " df['phi_corr_coeff'], \n", - " bins=bin_edges, \n", - " labels=bin_labels, \n", - " include_lowest=True\n", - " )\n", - " \n", - " return df\n", - " \n", - " def _gen_four_fifths_test(\n", - " self,\n", - " df: DataFrame\n", - " ) -> DataFrame:\n", - " \n", - " percent_target_succ = self.percent_target_succ\n", - " percent_non_target_succ = self.percent_non_target_succ \n", - " \n", - " ratio = percent_target_succ / percent_non_target_succ\n", - " \n", - " if ratio < .8:\n", - " ratio_desc = f'failed with 4/5 test at {round(ratio,3)}'\n", - " elif ratio >= .8:\n", - " ratio_desc = f'passed with 4/5 test at {round(ratio,3)}'\n", - " else:\n", - " ratio_desc = 'error calculating 4/5 test'\n", - " \n", - " df['four_fifths_test'] = ratio_desc\n", - " return df\n", - " \n", - " def _gen_prep_diagonals(\n", - " self,\n", - " df: DataFrame,\n", - " diagonals: bool,\n", - " process: str,\n", - " group_variable: str,\n", - " group_other_val: str,\n", - " group_target_val: str,\n", - " percent_non_target_succ: float,\n", - " percent_target_succ: float,\n", - " ) -> Tuple[DataFrame, str]:\n", - " \n", - " \"\"\"\n", - " Method to generate the magnitude of the\n", - " assocation using phi coefficient analysis.\n", - " \n", - " :param df:\n", - " DataFrame, output df.\n", - " :param diagonals:\n", - " bool,\n", - " :param process: \n", - " str, the name of the business process\n", - " being tested, e.g. 'hiring'.\n", - " :param group_variable:\n", - " str, column name of the\n", - " target variable.\n", - " :param group_target_val:\n", - " str, class target value of the group_variable\n", - " aka the protected class value.\n", - " :param group_other_val:\n", - " str, class nontarget value of the group_variable\n", - " aka the nonprotected class value. \n", - " :param percent_non_target_succ:\n", - " float, the success percentage attained\n", - " for the the non-target group.\n", - " :param percent_target_succ:\n", - " float, the success percentage attained for the\n", - " target class.\n", - " :return (df, phi_col):\n", - " Tuple[df, phi_col]\n", - " \"\"\"\n", - " \n", - " phi_bin = df['phi_bins'].values[0] \n", - " phi_corr_coeff = df['phi_corr_coeff'].values[0] \n", - "\n", - " if diagonals:\n", - " diagonal_msg = (\n", - " f\"The values on the positive diagonal of the 'tbl' indicate the distribution of {process} success across {group_variable} categories.\"\n", - " f\" {group_other_val} had a higher proportion of successful outcomes compared to {group_target_val}.\"\n", - " f\" Specifically, {percent_non_target_succ:.1f}% of {group_other_val} had success while only {percent_target_succ:.1f}%\"\n", - " f\" of {group_target_val} had success.\"\n", - " f\" This significant difference in {process} success rates suggests a potential {group_variable} bias, with {group_other_val} success in {process}\"\n", - " f\" at a higher rate than {group_target_val}.\"\n", - " )\n", - " \n", - " else:\n", - " diagonal_msg = \"the diagonal values are not substantially higher, suggesting the relationship might be more nuanced.\"\n", - " \n", - " phi_col = f\"The phi correlation coefficient is {phi_corr_coeff:.3f}, indicating a {phi_bin} effect size. {diagonal_msg}\"\n", - " \n", - " return df, phi_col\n", - " \n", - " def _gen_outcome_meta(\n", - " self,\n", - " df: DataFrame,\n", - " pval: float,\n", - " phi_result: str\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Method to generate meta data for \n", - " reporting dataframe\n", - " \n", - " :param df:\n", - " DataFrame, results df\n", - " :param pval:\n", - " int, pvalue\n", - " :param phi_result:\n", - " str, result of phi testing.\n", - " :return df:\n", - " DataFrame with metadata added\n", - " \"\"\"\n", - " \n", - " grpers = self.grpers\n", - " result = df['test_result'].values[0]\n", - " phi_col = df['phi_corr_coeff'].values[0]\n", - " testing = self.testing\n", - " process = self.process\n", - " group_target_val = self.group_target_val\n", - " alpha = self.alpha\n", - " four_fifths = df['four_fifths_test'].values[0]\n", - " \n", - " col = f\"Testing for {grpers}, {four_fifths}. Based on the results of the chi-square test of independence, there is {result} for {testing}-based {process} discrimination against {group_target_val} at the chosen significance level of {alpha}.\"\n", - "\n", - " if result == \"statistically significant result\":\n", - " col = f\"{col} {phi_result}\"\n", - " \n", - " df['result_desc'] = col\n", - " \n", - " return df\n", - " \n", - "# pipeline\n", - "\n", - "ingestObj = Ingest(config)\n", - "df = ingestObj.run()\n", - "\n", - "transObj = Transform(\n", - " df.copy()\n", - ")\n", - "tbl = transObj.run_build_cont_table()\n", - "\n", - "statsObj = StatsTesting2x2Cont(\n", - " config,\n", - " tbl,\n", - " df.copy() # need to add some more context in plain text\n", - ")\n", - "df_result = statsObj.run_testing()\n", - "\n", - "df_result['result_desc'].tolist()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "01672735-2ad5-42ae-9488-1962f3d0e63e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
test_resultphi_corr_coeffphi_binsfour_fifths_testresult_descstatisticpvaluedoftbl_rowtblexpected_freqtbl_expected_diff
0statistically significant result0.39736moderatefailed 4/5 test at 0.167Testing for {'job_title': 'analyst'}, based on...5.2182460.0223511[Female, Male][[10, 1], [15, 18]][[6.25, 4.75], [18.75, 14.25]][[3.75, -3.75], [-3.75, 3.75]]
\n", - "
" - ], - "text/plain": [ - " test_result phi_corr_coeff phi_bins \n", - "0 statistically significant result 0.39736 moderate \\\n", - "\n", - " four_fifths_test \n", - "0 failed 4/5 test at 0.167 \\\n", - "\n", - " result_desc statistic pvalue \n", - "0 Testing for {'job_title': 'analyst'}, based on... 5.218246 0.022351 \\\n", - "\n", - " dof tbl_row tbl expected_freq \n", - "0 1 [Female, Male] [[10, 1], [15, 18]] [[6.25, 4.75], [18.75, 14.25]] \\\n", - "\n", - " tbl_expected_diff \n", - "0 [[3.75, -3.75], [-3.75, 3.75]] " - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_result" - ] - }, - { - "cell_type": "markdown", - "id": "94114f77-caa8-41a3-900a-44317c84f4b7", - "metadata": {}, - "source": [ - "to do:\n", - " \n", - "implement these tests\n", - "\n", - "\n", - "https://en.wikipedia.org/wiki/Disparate_impact\n", - "\n", - "Add handler for filtered size of group must be ...\n", - "\n", - "# need to check this size\n", - "# https://online.stat.psu.edu/stat500/lesson/8/8.2#:~:text=That%20equates%20to%20the%20Chi,count%20of%20at%20least%205.\n", - "\n", - "# make sure at least 5 in each slice, then at least 50" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1d975b96-c8af-4e46-9521-0c7fbe442ff1", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "hrailabs_dev", - "language": "python", - "name": "hrailabs" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/__pycache__/__init__.cpython-311.pyc b/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..27c8d58 Binary files /dev/null and b/__pycache__/__init__.cpython-311.pyc differ diff --git a/__pycache__/model.cpython-311.pyc b/__pycache__/model.cpython-311.pyc index 42d4609..ef93758 100644 Binary files a/__pycache__/model.cpython-311.pyc and b/__pycache__/model.cpython-311.pyc differ diff --git a/env.yaml b/env.yaml new file mode 100644 index 0000000..133e4f0 --- /dev/null +++ b/env.yaml @@ -0,0 +1,130 @@ +name: equity_toolkit +channels: + - defaults +dependencies: + - bzip2=1.0.8=he774522_0 + - ca-certificates=2023.01.10=haa95532_0 + - certifi=2022.12.7=py311haa95532_0 + - libffi=3.4.2=hd77b12b_6 + - openssl=1.1.1t=h2bbff1b_0 + - pip=23.0.1=py311haa95532_0 + - python=3.11.0=h966fe2a_3 + - setuptools=65.6.3=py311haa95532_0 + - sqlite=3.41.1=h2bbff1b_0 + - tk=8.6.12=h2bbff1b_0 + - vc=14.2=h21ff451_1 + - vs2015_runtime=14.27.29016=h5e58377_2 + - wheel=0.38.4=py311haa95532_0 + - wincertstore=0.2=py311haa95532_0 + - xz=5.2.10=h8cc25b3_1 + - zlib=1.2.13=h8cc25b3_0 + - pip: + - aiofiles==22.1.0 + - aiosqlite==0.18.0 + - anyio==3.6.2 + - argon2-cffi==21.3.0 + - argon2-cffi-bindings==21.2.0 + - arrow==1.2.3 + - asttokens==2.2.1 + - attrs==22.2.0 + - babel==2.12.1 + - backcall==0.2.0 + - beautifulsoup4==4.12.2 + - bleach==6.0.0 + - cffi==1.15.1 + - charset-normalizer==3.1.0 + - click==8.1.3 + - colorama==0.4.6 + - comm==0.1.3 + - dash==2.9.2 + - dash-bootstrap-components==1.6.0 + - dash-core-components==2.0.0 + - dash-html-components==2.0.0 + - dash-table==5.0.0 + - debugpy==1.6.7 + - decorator==5.1.1 + - defusedxml==0.7.1 + - executing==1.2.0 + - fastjsonschema==2.16.3 + - flask==2.2.3 + - fqdn==1.5.1 + - idna==3.4 + - ipykernel==6.22.0 + - ipython==8.12.0 + - ipython-genutils==0.2.0 + - isoduration==20.11.0 + - itsdangerous==2.1.2 + - jedi==0.18.2 + - jinja2==3.1.2 + - json5==0.9.11 + - jsonpointer==2.3 + - jsonschema==4.17.3 + - jupyter-client==8.1.0 + - jupyter-core==5.3.0 + - jupyter-events==0.6.3 + - jupyter-server==2.5.0 + - jupyter-server-fileid==0.8.0 + - jupyter-server-terminals==0.4.4 + - jupyter-server-ydoc==0.8.0 + - jupyter-ydoc==0.2.3 + - jupyterlab==3.6.3 + - jupyterlab-pygments==0.2.2 + - jupyterlab-server==2.22.0 + - markupsafe==2.1.2 + - mistune==2.0.5 + - mypy==1.11.2 + - mypy-extensions==1.0.0 + - nbclassic==0.5.5 + - nbclient==0.7.3 + - nbconvert==7.3.0 + - nbformat==5.8.0 + - nest-asyncio==1.5.6 + - notebook==6.5.4 + - notebook-shim==0.2.2 + - numpy==1.24.2 + - packaging==23.0 + - pandas==2.0.0 + - pandocfilters==1.5.0 + - parso==0.8.3 + - pickleshare==0.7.5 + - platformdirs==3.2.0 + - plotly==5.14.1 + - prometheus-client==0.16.0 + - prompt-toolkit==3.0.38 + - psutil==5.9.4 + - pure-eval==0.2.2 + - pycparser==2.21 + - pygments==2.14.0 + - pyrsistent==0.19.3 + - python-dateutil==2.8.2 + - python-json-logger==2.0.7 + - pytz==2023.3 + - pywin32==306 + - pywinpty==2.0.10 + - pyyaml==6.0 + - pyzmq==25.0.2 + - requests==2.28.2 + - rfc3339-validator==0.1.4 + - rfc3986-validator==0.1.1 + - scipy==1.10.1 + - send2trash==1.8.0 + - six==1.16.0 + - sniffio==1.3.0 + - soupsieve==2.4 + - stack-data==0.6.2 + - tenacity==8.2.2 + - terminado==0.17.1 + - tinycss2==1.2.1 + - tornado==6.2 + - traitlets==5.9.0 + - typing-extensions==4.12.2 + - tzdata==2023.3 + - uri-template==1.2.0 + - urllib3==1.26.15 + - wcwidth==0.2.6 + - webcolors==1.13 + - webencodings==0.5.1 + - websocket-client==1.5.1 + - werkzeug==2.2.3 + - y-py==0.5.9 + - ypy-websocket==0.8.2 \ No newline at end of file diff --git a/model.py b/model.py index 2302456..3956f79 100644 --- a/model.py +++ b/model.py @@ -4,11 +4,15 @@ from typing import List from typing import Tuple +import os import yaml import src.model_classes as mc -with open('config.yaml') as f: +package_dir = os.path.dirname(os.path.abspath(__file__)) +config_fp = os.path.join(package_dir, "config.yaml") + +with open(config_fp) as f: config = yaml.safe_load(f) class Model: diff --git a/notebooks/.ipynb_checkpoints/toolkit-checkpoint.ipynb b/notebooks/.ipynb_checkpoints/toolkit-checkpoint.ipynb new file mode 100644 index 0000000..d6ae82b --- /dev/null +++ b/notebooks/.ipynb_checkpoints/toolkit-checkpoint.ipynb @@ -0,0 +1,146 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "f9c027ac-4352-4da6-a6d7-394eade3031c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import yaml\n", + "import sys\n", + "import os\n", + "\n", + "project_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))\n", + "sys.path.append(project_dir)\n", + "import model as model\n", + "\n", + "config_fp = os.path.join(project_dir, \"config.yaml\")\n", + "with open(config_fp) as f:\n", + " config = yaml.safe_load(f)\n", + " \n", + "mod = model.Model(config)\n", + "\n", + "df_prep, tbl = mod.prep()\n", + "\n", + "df_result = mod.analysis(df_prep.copy(), tbl)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8c15a313-bfc9-44a9-9891-387ed56564be", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
test_resultphi_corr_coeffphi_binsfour_fifths_testresult_descstatisticpvaluealphadoftbl_rowstbl_colstblexpected_freqtbl_expected_diff
0Statistically significant result0.39736moderate4/5ths Test passed at a ratio of: 1.0.Testing for {'job_title': 'analyst'}, 4/5ths T...5.2182460.0223510.051[Female, Male][not_hired, hired][[10, 1], [15, 18]][[6.25, 4.75], [18.75, 14.25]][[3.75, -3.75], [-3.75, 3.75]]
\n", + "
" + ], + "text/plain": [ + " test_result phi_corr_coeff phi_bins \n", + "0 Statistically significant result 0.39736 moderate \\\n", + "\n", + " four_fifths_test \n", + "0 4/5ths Test passed at a ratio of: 1.0. \\\n", + "\n", + " result_desc statistic pvalue \n", + "0 Testing for {'job_title': 'analyst'}, 4/5ths T... 5.218246 0.022351 \\\n", + "\n", + " alpha dof tbl_rows tbl_cols tbl \n", + "0 0.05 1 [Female, Male] [not_hired, hired] [[10, 1], [15, 18]] \\\n", + "\n", + " expected_freq tbl_expected_diff \n", + "0 [[6.25, 4.75], [18.75, 14.25]] [[3.75, -3.75], [-3.75, 3.75]] " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_result" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "hrailabs_dev", + "language": "python", + "name": "hrailabs" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/toolkit.ipynb b/notebooks/toolkit.ipynb new file mode 100644 index 0000000..d6ae82b --- /dev/null +++ b/notebooks/toolkit.ipynb @@ -0,0 +1,146 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "f9c027ac-4352-4da6-a6d7-394eade3031c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import yaml\n", + "import sys\n", + "import os\n", + "\n", + "project_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))\n", + "sys.path.append(project_dir)\n", + "import model as model\n", + "\n", + "config_fp = os.path.join(project_dir, \"config.yaml\")\n", + "with open(config_fp) as f:\n", + " config = yaml.safe_load(f)\n", + " \n", + "mod = model.Model(config)\n", + "\n", + "df_prep, tbl = mod.prep()\n", + "\n", + "df_result = mod.analysis(df_prep.copy(), tbl)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8c15a313-bfc9-44a9-9891-387ed56564be", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
test_resultphi_corr_coeffphi_binsfour_fifths_testresult_descstatisticpvaluealphadoftbl_rowstbl_colstblexpected_freqtbl_expected_diff
0Statistically significant result0.39736moderate4/5ths Test passed at a ratio of: 1.0.Testing for {'job_title': 'analyst'}, 4/5ths T...5.2182460.0223510.051[Female, Male][not_hired, hired][[10, 1], [15, 18]][[6.25, 4.75], [18.75, 14.25]][[3.75, -3.75], [-3.75, 3.75]]
\n", + "
" + ], + "text/plain": [ + " test_result phi_corr_coeff phi_bins \n", + "0 Statistically significant result 0.39736 moderate \\\n", + "\n", + " four_fifths_test \n", + "0 4/5ths Test passed at a ratio of: 1.0. \\\n", + "\n", + " result_desc statistic pvalue \n", + "0 Testing for {'job_title': 'analyst'}, 4/5ths T... 5.218246 0.022351 \\\n", + "\n", + " alpha dof tbl_rows tbl_cols tbl \n", + "0 0.05 1 [Female, Male] [not_hired, hired] [[10, 1], [15, 18]] \\\n", + "\n", + " expected_freq tbl_expected_diff \n", + "0 [[6.25, 4.75], [18.75, 14.25]] [[3.75, -3.75], [-3.75, 3.75]] " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_result" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "hrailabs_dev", + "language": "python", + "name": "hrailabs" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/.ipynb_checkpoints/Pay_Gap_Reg-checkpoint.ipynb b/pay_equity/.ipynb_checkpoints/Pay_Gap_Reg-checkpoint.ipynb similarity index 100% rename from .ipynb_checkpoints/Pay_Gap_Reg-checkpoint.ipynb rename to pay_equity/.ipynb_checkpoints/Pay_Gap_Reg-checkpoint.ipynb diff --git a/Pay_Gap_Reg.ipynb b/pay_equity/Pay_Gap_Reg.ipynb similarity index 100% rename from Pay_Gap_Reg.ipynb rename to pay_equity/Pay_Gap_Reg.ipynb diff --git a/src/.ipynb_checkpoints/model_classes-checkpoint.py b/src/.ipynb_checkpoints/model_classes-checkpoint.py index d44109b..3b601b7 100644 --- a/src/.ipynb_checkpoints/model_classes-checkpoint.py +++ b/src/.ipynb_checkpoints/model_classes-checkpoint.py @@ -6,8 +6,12 @@ import pandas as pd import numpy as np +import os from scipy.stats import chi2_contingency +package_dir = os.path.dirname(os.path.abspath(__file__)) +main_dir = os.path.abspath(os.path.join(package_dir, "..")) + class Ingest: """ @@ -146,28 +150,29 @@ def run_load( """ filepath = self.filepath - + csv_fp = os.path.join(main_dir, filepath) + try: - return pd.read_csv(filepath, skiprows=0) + return pd.read_csv(csv_fp, skiprows=0) except FileNotFoundError: raise FileNotFoundError( - f"The file at {filepath} was not found. Please check the file path." + f"The file at {csv_fp} was not found. Please check the file path." ) except pd.errors.EmptyDataError: raise ValueError( - f"The file at {filepath} is empty and cannot be loaded." + f"The file at {csv_fp} is empty and cannot be loaded." ) except pd.errors.ParserError: raise ValueError( - f"The file at {filepath} contains malformed data and could not be parsed as a valid CSV." + f"The file at {csv_fp} contains malformed data and could not be parsed as a valid CSV." ) except PermissionError: raise PermissionError( - f"Permission denied when attempting to read the file at {filepath}." + f"Permission denied when attempting to read the file at {csv_fp}." f"Please check the file permissions." ) diff --git a/src/__pycache__/model_classes.cpython-311.pyc b/src/__pycache__/model_classes.cpython-311.pyc index 1e700e5..a3fee23 100644 Binary files a/src/__pycache__/model_classes.cpython-311.pyc and b/src/__pycache__/model_classes.cpython-311.pyc differ diff --git a/src/model_classes.py b/src/model_classes.py index d44109b..3b601b7 100644 --- a/src/model_classes.py +++ b/src/model_classes.py @@ -6,8 +6,12 @@ import pandas as pd import numpy as np +import os from scipy.stats import chi2_contingency +package_dir = os.path.dirname(os.path.abspath(__file__)) +main_dir = os.path.abspath(os.path.join(package_dir, "..")) + class Ingest: """ @@ -146,28 +150,29 @@ def run_load( """ filepath = self.filepath - + csv_fp = os.path.join(main_dir, filepath) + try: - return pd.read_csv(filepath, skiprows=0) + return pd.read_csv(csv_fp, skiprows=0) except FileNotFoundError: raise FileNotFoundError( - f"The file at {filepath} was not found. Please check the file path." + f"The file at {csv_fp} was not found. Please check the file path." ) except pd.errors.EmptyDataError: raise ValueError( - f"The file at {filepath} is empty and cannot be loaded." + f"The file at {csv_fp} is empty and cannot be loaded." ) except pd.errors.ParserError: raise ValueError( - f"The file at {filepath} contains malformed data and could not be parsed as a valid CSV." + f"The file at {csv_fp} contains malformed data and could not be parsed as a valid CSV." ) except PermissionError: raise PermissionError( - f"Permission denied when attempting to read the file at {filepath}." + f"Permission denied when attempting to read the file at {csv_fp}." f"Please check the file permissions." ) diff --git a/toolkit.ipynb b/toolkit.ipynb deleted file mode 100644 index aa39ad2..0000000 --- a/toolkit.ipynb +++ /dev/null @@ -1,1304 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 17, - "id": "ab695a7b-3043-4154-a59b-01e57feaf8f0", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[\"Testing for {'job_title': 'analyst'}, passed with 4/5 test at 1.0. Based on the results of the chi-square test of independence, there is statistically significant result for gender-based hiring discrimination against Female at the chosen significance level of 0.05. The phi correlation coefficient is 0.397, indicating a moderate effect size. The values on the positive diagonal of the 'tbl' indicate the distribution of hiring success across gen categories. Male had a higher proportion of successful outcomes compared to Female. Specifically, 54.5% of Male had success while only 54.5% of Female had success. This significant difference in hiring success rates suggests a potential gen bias, with Male success in hiring at a higher rate than Female.\"]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "from scipy.stats import chi2_contingency\n", - "import yaml\n", - "\n", - "from pandas import DataFrame\n", - "from typing import Dict\n", - "from typing import List\n", - "from typing import Any\n", - "from typing import Tuple\n", - "\n", - "with open('config.yaml') as f:\n", - " config = yaml.safe_load(f)\n", - "\n", - "class Ingest:\n", - " \n", - " \"\"\"\n", - " Class to ingest dataframe input.\n", - " \"\"\"\n", - " \n", - " def __init__(\n", - " self,\n", - " config: Dict[Any, Any]\n", - " ) -> None:\n", - " \n", - " \"\"\"\n", - " Inits class with the config file\n", - " and unpacks the config file.\n", - " \"\"\"\n", - " \n", - " self.config = config\n", - " \n", - " self.unpack_config()\n", - "\n", - " \n", - " def run(\n", - " self\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Run function for the class.\n", - " \n", - " :param None:\n", - " :return df:\n", - " DataFrame, ingested df\n", - " \"\"\"\n", - " \n", - " df = self.run_load()\n", - " \n", - " df = self.run_harmonize(df)\n", - " \n", - " return df\n", - " \n", - " \n", - " def unpack_config(\n", - " self\n", - " ) -> None:\n", - " \n", - " \"\"\"\n", - " Function to unpack config vars.\n", - " \n", - " :var filepath:\n", - " str, the relative filepath \n", - " :var group_variable:\n", - " str, the column name for the \n", - " group variable of interest e.g.\n", - " gender, which contains the target \n", - " class and non-target class e.g.\n", - " females and males.\n", - " :var group_target_val:\n", - " str, within the group_variable column,\n", - " contains the contains the target \n", - " class value e.g.\n", - " females.\n", - " :var group_other_val:\n", - " str, within the group_variable column,\n", - " contains the contains the non-target \n", - " class value e.g. males.\n", - " :var outcome_variable:\n", - " str, the column name for the \n", - " outcome variable of interest e.g.\n", - " hired, which contains the target \n", - " class and non-target class e.g.\n", - " hired and not-hired.\n", - " :var outcome_target_val:\n", - " str, within the outcome_variable column,\n", - " contains the contains the target \n", - " class value e.g.\n", - " hired.\n", - " :var outcome_other_val:\n", - " str, within the outcome_variable column,\n", - " contains the contains the non-target \n", - " class value e.g. not-hired.\n", - " :var grpers:\n", - " Dict[str,str], can be any set of filterable\n", - " columns to slice into particular groups within\n", - " the broader employee roster. The key is the column,\n", - " the value is the desired class within the column\n", - " e.g. job_title: analyst.\n", - " \"\"\"\n", - " \n", - " config = self.config\n", - " \n", - " try:\n", - " self.filepath: str = config[\"Ingest\"][\"filepath\"]\n", - " self.group_variable: str = config[\"Ingest\"][\"group_variable\"]\n", - " self.group_target_val: str = config[\"Ingest\"][\"group_target_val\"]\n", - " self.group_other_val: str = config[\"Ingest\"][\"group_other_val\"]\n", - " self.outcome_variable: str = config[\"Ingest\"][\"outcome_variable\"]\n", - " self.outcome_target_val: str = config[\"Ingest\"][\"outcome_target_val\"]\n", - " self.outcome_other_val: str = config[\"Ingest\"][\"outcome_other_val\"]\n", - " self.grpers: Dict[str, str] = config[\"Ingest\"][\"grpers\"]\n", - "\n", - " # Type validation\n", - " if not isinstance(self.filepath, str):\n", - " raise TypeError(\"Expected 'filepath' to be of type 'str'.\")\n", - " if not isinstance(self.group_variable, str):\n", - " raise TypeError(\"Expected 'group_variable' to be of type 'str'.\")\n", - " if not isinstance(self.group_target_val, str):\n", - " raise TypeError(\"Expected 'group_target_val' to be of type 'str'.\")\n", - " if not isinstance(self.group_other_val, str):\n", - " raise TypeError(\"Expected 'group_other_val' to be of type 'str'.\")\n", - " if not isinstance(self.outcome_variable, str):\n", - " raise TypeError(\"Expected 'outcome_variable' to be of type 'str'.\")\n", - " if not isinstance(self.outcome_target_val, str):\n", - " raise TypeError(\"Expected 'outcome_target_val' to be of type 'str'.\")\n", - " if not isinstance(self.outcome_other_val, str):\n", - " raise TypeError(\"Expected 'outcome_other_val' to be of type 'str'.\")\n", - " if not isinstance(self.grpers, dict):\n", - " raise TypeError(\"Expected 'grpers' to be of type 'dict'.\")\n", - "\n", - " except KeyError as e:\n", - " raise KeyError(f\"Missing key '{e.args[0]}' in the config file. \"\n", - " \"Please ensure the config file contains all required keys under the 'Ingest' section: \"\n", - " \"'filepath', 'group_variable', 'group_target_val', 'group_other_val', \"\n", - " \"'outcome_variable', 'outcome_target_val', 'outcome_other_val', and 'grpers'.\")\n", - "\n", - " except TypeError as e:\n", - " raise TypeError(f\"Config file error: {e}\")\n", - " \n", - " def run_load(\n", - " self\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Loads csv file. Assumes headers are row 0.\n", - " \n", - " :param None:\n", - " :return DataFrame:\n", - " \"\"\"\n", - " \n", - " filepath = self.filepath\n", - " \n", - " try:\n", - " return pd.read_csv(filepath, skiprows=0)\n", - "\n", - " except FileNotFoundError:\n", - " raise FileNotFoundError(\n", - " f\"The file at {filepath} was not found. Please check the file path.\"\n", - " )\n", - "\n", - " except pd.errors.EmptyDataError:\n", - " raise ValueError(\n", - " f\"The file at {filepath} is empty and cannot be loaded.\"\n", - " )\n", - "\n", - " except pd.errors.ParserError:\n", - " raise ValueError(\n", - " f\"The file at {filepath} contains malformed data and could not be parsed as a valid CSV.\"\n", - " )\n", - "\n", - " except PermissionError:\n", - " raise PermissionError(\n", - " f\"Permission denied when attempting to read the file at {filepath}.\"\n", - " f\"Please check the file permissions.\"\n", - " )\n", - "\n", - " except Exception as e:\n", - " raise Exception(\n", - " f\"An unexpected error occurred while loading the file: {str(e)}\"\n", - " )\n", - " \n", - " def run_harmonize(\n", - " self,\n", - " df: DataFrame\n", - " ) -> DataFrame: \n", - "\n", - " \"\"\"\n", - " Function to harmonize the dataset.\n", - " \n", - " :param df: \n", - " DataFrame, loaded df\n", - " :return df:\n", - " DataFrame, filtered down to target and other group and\n", - " harmonize the fields\n", - " \"\"\"\n", - " \n", - " group_variable = self.group_variable\n", - " group_target_val = self.group_target_val\n", - " group_other_val = self.group_other_val\n", - " outcome_variable = self.outcome_variable\n", - " outcome_target_val = self.outcome_target_val\n", - " outcome_other_val = self.outcome_other_val\n", - " grpers = self.grpers\n", - "\n", - " df = self._apply_filters(\n", - " df=df,\n", - " group_variable=group_variable,\n", - " group_target_val=group_target_val,\n", - " group_other_val=group_other_val,\n", - " grpers=grpers\n", - " )\n", - " \n", - " df = self._apply_harmonize(\n", - " df=df,\n", - " group_variable=group_variable,\n", - " group_target_val=group_target_val,\n", - " group_other_val=group_other_val,\n", - " outcome_variable=outcome_variable,\n", - " outcome_target_val=outcome_target_val,\n", - " outcome_other_val=outcome_other_val\n", - " )\n", - " \n", - " return df\n", - " \n", - " def _apply_filters(\n", - " self,\n", - " df: DataFrame,\n", - " group_variable: str,\n", - " group_target_val: str,\n", - " group_other_val: str,\n", - " grpers: Dict[str,str],\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Method to apply filters\n", - " \n", - " :param df:\n", - " DataFrame, target df\n", - " :param group_variable:\n", - " str, column name of the\n", - " target variable\n", - " :param group_target_val:\n", - " str, class target value of the group_variable\n", - " aka the protected class value\n", - " :param group_other_val:\n", - " str, class nontarget value of the group_variable\n", - " aka the nonprotected class value\n", - " :return df:\n", - " DataFrame, filtered df\n", - " \"\"\"\n", - " \n", - " df = df.loc[\n", - " df[group_variable].isin(\n", - " [\n", - " group_target_val, \n", - " group_other_val\n", - " ]\n", - " )\n", - " ]\n", - " \n", - " for k, v in grpers.items():\n", - " \n", - " df = df.loc[\n", - " df[k].isin([v])\n", - " ] \n", - " \n", - " return df\n", - " \n", - " def _apply_harmonize(\n", - " self,\n", - " df: DataFrame,\n", - " group_variable: str,\n", - " group_target_val: str,\n", - " group_other_val: str,\n", - " outcome_variable: str,\n", - " outcome_target_val: str,\n", - " outcome_other_val: str\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Method to harmonize targets.\n", - " \n", - " :param df:\n", - " DataFrame, target df\n", - " :param group_variable:\n", - " str, column name of the\n", - " target variable\n", - " :param group_target_val:\n", - " str, class target value of the group_variable\n", - " aka the protected class value\n", - " :param group_other_val:\n", - " str, class nontarget value of the group_variable\n", - " aka the nonprotected class value \n", - " :param outcome_variable:\n", - " str, the column name of the outcome \n", - " :param outcome_target_val:\n", - " str, class target value of the outcome_variable\n", - " aka success\n", - " :param outcome_other_val:\n", - " str, class nontarget value of the outcome_variable\n", - " :return df:\n", - " DataFrame, target df\n", - " \"\"\"\n", - " \n", - " # harmonize the group target\n", - " df['group_var_clean'] = np.where(\n", - " df[group_variable]==group_target_val, \n", - " 1,\n", - " np.where(\n", - " df[group_variable]==group_other_val, \n", - " 0, \n", - " -1\n", - " )\n", - " )\n", - " \n", - " # harmonize the outcome target\n", - " df['outcome_var_clean'] = np.where(\n", - " df[outcome_variable]==outcome_target_val, \n", - " 1, \n", - " np.where(\n", - " df[self.outcome_variable]==outcome_other_val,\n", - " 0, \n", - " -1\n", - " )\n", - " ) \n", - " \n", - " return df\n", - " \n", - "class Transform:\n", - " \n", - " \"\"\"\n", - " Class to transform dataframe inputs into \n", - " 2x2 contingency table.\n", - " \"\"\"\n", - " \n", - " def __init__(\n", - " self, \n", - " df: DataFrame\n", - " ) -> None:\n", - " \n", - " \"\"\"\n", - " :param df:\n", - " DataFrame, input df\n", - " \"\"\"\n", - " \n", - " self.df = df\n", - " \n", - " def run_build_cont_table(\n", - " self\n", - " ) -> List[int]:\n", - " \n", - " \"\"\"\n", - " Function to generate contingency table format.\n", - " \n", - " Places the target group val in the top row and the\n", - " target group other to the bottom row.\n", - " \n", - " Places no-success outcome on the first column and success\n", - " on the second column.\n", - " \n", - " :return tbl:\n", - " List[int], filtered down to target and other group.\n", - " \"\"\"\n", - " \n", - " df = self.df\n", - " \n", - " cols = [\n", - " 'group_var_clean', \n", - " 'outcome_var_clean'\n", - " ]\n", - " \n", - " df = df[cols]\n", - " \n", - " tbl = (\n", - " df.pivot_table(\n", - " index='group_var_clean',\n", - " columns='outcome_var_clean', \n", - " aggfunc=len\n", - " ).\n", - " sort_index(\n", - " axis=1, \n", - " ascending=True\n", - " ).\n", - " sort_index(ascending=False). # ensure always [1,0]\n", - " values.tolist()\n", - " ) \n", - " \n", - " return tbl\n", - " \n", - "class StatsTesting2x2Cont:\n", - " \n", - " \"\"\"\n", - " Class to perform 2x2 Contigency Table analysis\n", - " with Chi2 and Phi Correlation Coefficent Testing.\n", - "\n", - " Provides context into potential association between\n", - " variables and the strength of the association.\n", - " \"\"\"\n", - " \n", - " def __init__(\n", - " self,\n", - " config: Dict[Any, Any],\n", - " tbl: List[int],\n", - " df: DataFrame\n", - " ) -> None:\n", - " \n", - " \"\"\"\n", - " Inits the class variables and unpacks the\n", - " config variables.\n", - " \n", - " :param config:\n", - " Dict[str,Any], loaded config file.\n", - " :param tbl:\n", - " List[int], 2x2 cont table.\n", - " :param df:\n", - " DataFrame, original input DataFrame.\n", - " \"\"\"\n", - " \n", - " self.config = config\n", - " self.tbl = tbl\n", - " self.df = df\n", - "\n", - " self.unpack_config()\n", - "\n", - " def run_testing(\n", - " self\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Run function for the class.\n", - " \n", - " Runs hypothesis evaluation and builds\n", - " the output report DataFrame.\n", - " \n", - " :param None:\n", - " :return df_results:\n", - " DataFrame, with testing results.\n", - " \"\"\"\n", - " \n", - " alpha = self.alpha\n", - " tbl = self.tbl\n", - " process = self.process\n", - " group_variable = self.group_variable\n", - " group_target_val = self.group_target_val\n", - " group_other_val = self.group_other_val\n", - " bin_edges = self.bin_edges\n", - " bin_labels = self.bin_labels\n", - " \n", - " res = self.gen_hypothesis_eval(tbl)\n", - "\n", - " df_results = self.run_report_bld(\n", - " alpha=alpha,\n", - " res=res,\n", - " tbl=tbl,\n", - " process=process,\n", - " group_variable=group_variable,\n", - " group_target_val=group_target_val,\n", - " group_other_val=group_other_val,\n", - " bin_edges=bin_edges,\n", - " bin_labels=bin_labels\n", - " )\n", - " \n", - " return df_results\n", - " \n", - " def unpack_config(\n", - " self\n", - " ) -> None:\n", - " \n", - " \"\"\"\n", - " Function to unpack config variables.\n", - " \n", - " :param None:\n", - " :return None:\n", - " \"\"\"\n", - " \n", - " config = self.config\n", - "\n", - " try:\n", - " self.alpha: float = config[\"StatsTesting2x2Cont\"][\"alpha\"]\n", - " self.group_variable: str = config[\"Ingest\"][\"group_variable\"]\n", - " self.group_target_val: str = config[\"Ingest\"][\"group_target_val\"]\n", - " self.group_other_val: str = config[\"Ingest\"][\"group_other_val\"]\n", - " self.outcome_variable: str = config[\"Ingest\"][\"outcome_variable\"]\n", - " self.outcome_target_val: str = config[\"Ingest\"][\"outcome_target_val\"]\n", - " self.outcome_other_val: str = config[\"Ingest\"][\"outcome_other_val\"]\n", - " self.grpers: Dict[str, str] = config[\"Ingest\"][\"grpers\"]\n", - " self.testing: str = config[\"StatsTesting2x2Cont\"][\"testing\"]\n", - " self.process: str = config[\"StatsTesting2x2Cont\"][\"process\"]\n", - " self.bin_edges: List[float] = config[\"StatsTesting2x2Cont\"][\"phi_bin_edges\"]\n", - " self.bin_labels: List[str] = config[\"StatsTesting2x2Cont\"][\"phi_bin_labels\"]\n", - "\n", - " if not isinstance(self.alpha, float):\n", - " raise TypeError(\"Expected 'alpha' to be of type 'float'.\")\n", - " if not isinstance(self.group_variable, str):\n", - " raise TypeError(\"Expected 'group_variable' to be of type 'str'.\")\n", - " if not isinstance(self.group_target_val, str):\n", - " raise TypeError(\"Expected 'group_target_val' to be of type 'str'.\")\n", - " if not isinstance(self.group_other_val, str):\n", - " raise TypeError(\"Expected 'group_other_val' to be of type 'str'.\")\n", - " if not isinstance(self.outcome_variable, str):\n", - " raise TypeError(\"Expected 'outcome_variable' to be of type 'str'.\")\n", - " if not isinstance(self.outcome_target_val, str):\n", - " raise TypeError(\"Expected 'outcome_target_val' to be of type 'str'.\")\n", - " if not isinstance(self.outcome_other_val, str):\n", - " raise TypeError(\"Expected 'outcome_other_val' to be of type 'str'.\")\n", - " if not isinstance(self.grpers, dict):\n", - " raise TypeError(\"Expected 'grpers' to be of type 'dict'.\")\n", - " if not isinstance(self.testing, str):\n", - " raise TypeError(\"Expected 'testing' to be of type 'str'.\")\n", - " if not isinstance(self.process, str):\n", - " raise TypeError(\"Expected 'process' to be of type 'str'.\")\n", - " if not isinstance(\n", - " self.bin_edges, list\n", - " ) or not all(\n", - " isinstance(\n", - " i, (int, float)\n", - " ) for i in self.bin_edges\n", - " ):\n", - " raise TypeError(\"Expected 'bin_edges' to be a list of floats.\")\n", - " if not isinstance(\n", - " self.bin_labels, list\n", - " ) or not all(\n", - " isinstance(i, str) for i in self.bin_labels\n", - " ):\n", - " raise TypeError(\"Expected 'bin_labels' to be a list of strings.\")\n", - " \n", - " except KeyError as e:\n", - " raise KeyError(\n", - " f\"Missing key '{e.args[0]}' in the config file. \"\n", - " f\"Ensure all required keys are present in the 'Ingest' and 'StatsTesting2x2Cont' sections.\"\n", - " )\n", - "\n", - " except TypeError as e:\n", - " raise TypeError(f\"Config file error: {e}\")\n", - "\n", - " \n", - " def gen_hypothesis_eval(\n", - " self,\n", - " tbl: List[int]\n", - " ) -> chi2_contingency:\n", - " \n", - " \"\"\"\n", - " Function to generate the chi2_contigency\n", - " statistic and result.\n", - " \"\"\"\n", - " \n", - " #size = np.shape(tbl)\n", - " #tbl_len = len(tbl)\n", - " \n", - " res = chi2_contingency(\n", - " tbl\n", - " )\n", - " \n", - " return res\n", - " \n", - " def run_report_bld(\n", - " self,\n", - " alpha: float,\n", - " res: chi2_contingency,\n", - " tbl: List[int],\n", - " process: str,\n", - " group_variable: str,\n", - " group_target_val: str,\n", - " group_other_val: str,\n", - " bin_edges: List[float],\n", - " bin_labels: List[str]\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Runs report for statistical testing\n", - " chi2_contingency results\n", - " \n", - " :param alpha:\n", - " float, alpha value for significance evaluation.\n", - " :param res:\n", - " chi2_contingency, result of the chi2_contingency.\n", - " :param tbl:\n", - " List[int], the contingency table.\n", - " :param process: \n", - " str, the name of the business process\n", - " being tested, e.g. 'hiring'.\n", - " :param group_variable:\n", - " str, column name of the\n", - " target variable.\n", - " :param group_target_val:\n", - " str, class target value of the group_variable\n", - " aka the protected class value.\n", - " :param group_other_val:\n", - " str, class nontarget value of the group_variable\n", - " aka the nonprotected class value. \n", - " :param bin_edges:\n", - " List[float], edges for phi\n", - " bins.\n", - " :param bin_labels:\n", - " List[str], labels for the phi\n", - " bins.\n", - " :return df:\n", - " DataFrame, target\n", - " \"\"\"\n", - " \n", - " pvalue = res[1]\n", - " \n", - " df = pd.DataFrame()\n", - "\n", - " df = self._gen_significance_test(\n", - " df=df,\n", - " pvalue=pvalue,\n", - " alpha=alpha\n", - " )\n", - " \n", - " (\n", - " df,\n", - " A,\n", - " B,\n", - " C,\n", - " D,\n", - " total_target_grp,\n", - " total_non_target_grp,\n", - " diagonals,\n", - " percent_target_succ,\n", - " percent_non_target_succ,\n", - " phi_numerator,\n", - " phi_denominator\n", - " ) = self._gen_table_calcs(\n", - " df=df,\n", - " tbl=tbl,\n", - " )\n", - " \n", - " if res[1] <= alpha:\n", - " df, phi_result = self._gen_phi_coefficient(\n", - " df=df,\n", - " tbl=tbl,\n", - " bin_edges=bin_edges,\n", - " bin_labels=bin_labels,\n", - " process=process,\n", - " group_variable=group_variable,\n", - " group_target_val=group_target_val,\n", - " group_other_val=group_other_val,\n", - " diagonals=diagonals,\n", - " numerator=phi_numerator,\n", - " denominator=phi_denominator,\n", - " percent_target_succ=percent_non_target_succ,\n", - " percent_non_target_succ=percent_non_target_succ,\n", - " )\n", - " \n", - " else:\n", - " df['phi_corr_coeff'] = np.nan\n", - " df['phi_bins'] = np.nan\n", - " \n", - " phi_result = \"\"\n", - " \n", - " df = self._gen_four_fifths_test(\n", - " df,\n", - " percent_target_succ=percent_non_target_succ,\n", - " percent_non_target_succ=percent_non_target_succ\n", - " )\n", - " \n", - " df = self._gen_outcome_meta(\n", - " df,\n", - " round(res[1],3),\n", - " phi_result\n", - " )\n", - " \n", - " df = self._gen_unpack_stats(\n", - " df,\n", - " res\n", - " )\n", - " \n", - " return df\n", - " \n", - " def _gen_unpack_stats(\n", - " self,\n", - " df: DataFrame,\n", - " res: chi2_contingency\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Method to unpack test stats from\n", - " chi2_contingency results.\n", - " \n", - " :param df:\n", - " DataFrame, output df.\n", - " :param res:\n", - " chi2_contingency, results array.\n", - " :return df:\n", - " DataFrame, output df.\n", - " \"\"\"\n", - " \n", - " group_target_val = self.group_target_val\n", - " group_other_val = self.group_other_val\n", - " rows = [group_target_val] + [group_other_val]\n", - " \n", - " df['statistic'] = res[0]\n", - " df['pvalue'] = res[1]\n", - " df['dof'] = res[2]\n", - " df['tbl_row'] = [rows]\n", - " df['tbl'] = [tbl]\n", - " df['expected_freq'] = [res[3]]\n", - " df['tbl_expected_diff'] = [tbl - res[3]]\n", - " \n", - " return df\n", - " \n", - " def _gen_significance_test(\n", - " self,\n", - " df: DataFrame,\n", - " pvalue: float,\n", - " alpha: float\n", - " ):\n", - " \"\"\"\n", - " Method to report on test significance.\n", - " \n", - " :param df:\n", - " DataFrame, results df.\n", - " :param pval:\n", - " int, pvalue.\n", - " :param alpha:\n", - " float, the alpha value for testing eval.\n", - " :return df:\n", - " DataFrame with metadata added. \n", - " \"\"\"\n", - " \n", - " if pvalue <= alpha:\n", - " val = 'statistically significant result'\n", - " \n", - " else:\n", - " val = 'no statistically significant result'\n", - " \n", - " df['test_result'] = [val]\n", - " \n", - " return df\n", - " \n", - " def _gen_phi_coefficient(\n", - " self,\n", - " df: DataFrame,\n", - " tbl: List[int],\n", - " process: str,\n", - " group_variable: str,\n", - " group_target_val: str,\n", - " group_other_val: str,\n", - " bin_edges: List[float],\n", - " bin_labels: List[str],\n", - " diagonals: List[float],\n", - " numerator: float,\n", - " denominator: float,\n", - " percent_target_succ: float,\n", - " percent_non_target_succ: float,\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Method to generate the phi coefficient.\n", - " \n", - " :param df:\n", - " DataFrame, the results df.\n", - " :param tbl:\n", - " List[int], the 2x2 cont table.\n", - " :param process: \n", - " str, the name of the business process\n", - " being tested, e.g. 'hiring'.\n", - " :param group_variable:\n", - " str, column name of the\n", - " target variable.\n", - " :param group_target_val:\n", - " str, class target value of the group_variable\n", - " aka the protected class value.\n", - " :param group_other_val:\n", - " str, class nontarget value of the group_variable\n", - " aka the nonprotected class value. \n", - " :param bin_edges:\n", - " List[float], edges for phi\n", - " bins.\n", - " :param bin_labels:\n", - " List[str], lab\n", - " :return df:\n", - " DataFrame, output df.\n", - " \"\"\"\n", - " phi = numerator / denominator if denominator != 0 else 0\n", - "\n", - " df['phi_corr_coeff'] = phi\n", - " \n", - " df = self._gen_prep_phi_bins(\n", - " df=df,\n", - " bin_edges=bin_edges,\n", - " bin_labels=bin_labels\n", - " )\n", - "\n", - " df, phi_result = self._gen_prep_diagonals(\n", - " df=df,\n", - " diagonals=diagonals,\n", - " process=process,\n", - " group_variable=group_variable,\n", - " group_other_val=group_other_val,\n", - " group_target_val=group_target_val,\n", - " percent_non_target_succ=percent_non_target_succ,\n", - " percent_target_succ=percent_target_succ,\n", - " )\n", - " \n", - " return df, phi_result\n", - " \n", - " def _gen_table_calcs(\n", - " self,\n", - " df: DataFrame,\n", - " tbl: List[int]\n", - " ) -> Tuple[\n", - " DataFrame, float, float, float, float,\n", - " float, float, float, float, \n", - " float, float, float\n", - " ]:\n", - " \n", - " \"\"\"\n", - " Method to generate phi bins. Provides additional\n", - " explainability on the magnitude of association, when \n", - " an association is found.\n", - " \n", - " :param df:\n", - " DataFrame, output df.\n", - " :param tbl:\n", - " List[int], 2x2 contingency.\n", - " :return [\n", - " df, A, B, C, D, total_target_grp,\n", - " total_non_target_grp, diagonals,\n", - " percent_target_succ, percent_non_target_succ,\n", - " phi_numerator, phi_denominator\n", - " ]:\n", - " Tuple[DataFrame, float, float, float, float,\n", - " float, float, float, float, \n", - " float, float, float\n", - " ]\n", - " \"\"\"\n", - " \n", - " # females, males; no succ, succ\n", - " A, B = tbl[0] \n", - " C, D = tbl[1]\n", - " \n", - " total_target_grp = A + B\n", - " total_non_target_grp = C + D\n", - " diagonals = (A + D) > (B + C)\n", - " percent_target_succ = (B / total_target_grp) * 100\n", - " percent_non_target_succ = (D / total_non_target_grp) * 100\n", - " phi_numerator = (A * D) - (B * C)\n", - " phi_denominator = np.sqrt((A + B) * (C + D) * (A + C) * (B + D)) \n", - " \n", - " return (\n", - " df,\n", - " A,\n", - " B,\n", - " C,\n", - " D,\n", - " total_target_grp,\n", - " total_non_target_grp,\n", - " diagonals,\n", - " percent_target_succ,\n", - " percent_non_target_succ,\n", - " phi_numerator,\n", - " phi_denominator\n", - " )\n", - " \n", - " def _gen_prep_phi_bins(\n", - " self,\n", - " df: DataFrame,\n", - " bin_edges: List[float],\n", - " bin_labels: List[str]\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Method to generate pandas bins for \n", - " phi coeff.\n", - " \n", - " :param df:\n", - " DataFrame, output df.\n", - " :param bin_edges:\n", - " List[float], edges for phi\n", - " bins.\n", - " :param bin_labels:\n", - " List[str], labels for the phi\n", - " bins.\n", - " :return df:\n", - " DataFrame, output df.\n", - " \"\"\"\n", - " \n", - " df['phi_bins'] = pd.cut(\n", - " df['phi_corr_coeff'], \n", - " bins=bin_edges, \n", - " labels=bin_labels, \n", - " include_lowest=True\n", - " )\n", - " \n", - " return df\n", - " \n", - " def _gen_four_fifths_test(\n", - " self,\n", - " df: DataFrame,\n", - " percent_target_succ: float,\n", - " percent_non_target_succ: float\n", - " ) -> DataFrame:\n", - " \n", - " ratio = percent_target_succ / percent_non_target_succ\n", - " \n", - " if ratio < .8:\n", - " ratio_desc = f'failed with 4/5 test at {round(ratio,3)}'\n", - " elif ratio >= .8:\n", - " ratio_desc = f'passed with 4/5 test at {round(ratio,3)}'\n", - " else:\n", - " ratio_desc = 'error calculating 4/5 test'\n", - " \n", - " df['four_fifths_test'] = ratio_desc\n", - " return df\n", - " \n", - " def _gen_prep_diagonals(\n", - " self,\n", - " df: DataFrame,\n", - " diagonals: bool,\n", - " process: str,\n", - " group_variable: str,\n", - " group_other_val: str,\n", - " group_target_val: str,\n", - " percent_non_target_succ: float,\n", - " percent_target_succ: float,\n", - " ) -> Tuple[DataFrame, str]:\n", - " \n", - " \"\"\"\n", - " Method to generate the magnitude of the\n", - " assocation using phi coefficient analysis.\n", - " \n", - " :param df:\n", - " DataFrame, output df.\n", - " :param diagonals:\n", - " bool,\n", - " :param process: \n", - " str, the name of the business process\n", - " being tested, e.g. 'hiring'.\n", - " :param group_variable:\n", - " str, column name of the\n", - " target variable.\n", - " :param group_target_val:\n", - " str, class target value of the group_variable\n", - " aka the protected class value.\n", - " :param group_other_val:\n", - " str, class nontarget value of the group_variable\n", - " aka the nonprotected class value. \n", - " :param percent_non_target_succ:\n", - " float, the success percentage attained\n", - " for the the non-target group.\n", - " :param percent_target_succ:\n", - " float, the success percentage attained for the\n", - " target class.\n", - " :return (df, phi_col):\n", - " Tuple[df, phi_col]\n", - " \"\"\"\n", - " \n", - " phi_bin = df['phi_bins'].values[0] \n", - " phi_corr_coeff = df['phi_corr_coeff'].values[0] \n", - "\n", - " if diagonals:\n", - " diagonal_msg = (\n", - " f\"The values on the positive diagonal of the 'tbl' indicate the distribution of {process} success across {group_variable} categories.\"\n", - " f\" {group_other_val} had a higher proportion of successful outcomes compared to {group_target_val}.\"\n", - " f\" Specifically, {percent_non_target_succ:.1f}% of {group_other_val} had success while only {percent_target_succ:.1f}%\"\n", - " f\" of {group_target_val} had success.\"\n", - " f\" This significant difference in {process} success rates suggests a potential {group_variable} bias, with {group_other_val} success in {process}\"\n", - " f\" at a higher rate than {group_target_val}.\"\n", - " )\n", - " \n", - " else:\n", - " diagonal_msg = \"the diagonal values are not substantially higher, suggesting the relationship might be more nuanced.\"\n", - " \n", - " phi_col = f\"The phi correlation coefficient is {phi_corr_coeff:.3f}, indicating a {phi_bin} effect size. {diagonal_msg}\"\n", - " \n", - " return df, phi_col\n", - " \n", - " def _gen_outcome_meta(\n", - " self,\n", - " df: DataFrame,\n", - " pval: float,\n", - " phi_result: str\n", - " ) -> DataFrame:\n", - " \n", - " \"\"\"\n", - " Method to generate meta data for \n", - " reporting dataframe\n", - " \n", - " :param df:\n", - " DataFrame, results df\n", - " :param pval:\n", - " int, pvalue\n", - " :param phi_result:\n", - " str, result of phi testing.\n", - " :return df:\n", - " DataFrame with metadata added\n", - " \"\"\"\n", - " \n", - " grpers = self.grpers\n", - " result = df['test_result'].values[0]\n", - " phi_col = df['phi_corr_coeff'].values[0]\n", - " testing = self.testing\n", - " process = self.process\n", - " group_target_val = self.group_target_val\n", - " alpha = self.alpha\n", - " four_fifths = df['four_fifths_test'].values[0]\n", - " \n", - " col = f\"Testing for {grpers}, {four_fifths}. Based on the results of the chi-square test of independence, there is {result} for {testing}-based {process} discrimination against {group_target_val} at the chosen significance level of {alpha}.\"\n", - "\n", - " if result == \"statistically significant result\":\n", - " col = f\"{col} {phi_result}\"\n", - " \n", - " df['result_desc'] = col\n", - " \n", - " return df\n", - " \n", - "# pipeline\n", - "\n", - "ingestObj = Ingest(config)\n", - "df = ingestObj.run()\n", - "\n", - "transObj = Transform(\n", - " df.copy()\n", - ")\n", - "tbl = transObj.run_build_cont_table()\n", - "\n", - "statsObj = StatsTesting2x2Cont(\n", - " config,\n", - " tbl,\n", - " df.copy() # need to add some more context in plain text\n", - ")\n", - "df_result = statsObj.run_testing()\n", - "\n", - "df_result['result_desc'].tolist()" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "d303e1d1-c69a-4b59-9489-14574000bd55", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import yaml\n", - "import model\n", - "\n", - "with open('config.yaml') as f:\n", - " config = yaml.safe_load(f)\n", - " \n", - "model = model.Model(config)\n", - "\n", - "df_prep, tbl = model.prep()\n", - "\n", - "df_result = model.analysis(df_prep.copy(), tbl)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "cebbe841-c185-443d-ac02-7ccd8c50e005", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
test_resultphi_corr_coeffphi_binsfour_fifths_testresult_descstatisticpvaluedoftbl_rowstbl_colstblexpected_freqtbl_expected_diff
0statistically significant result0.39736moderatepassed with 4/5 test at 1.0Testing for {'job_title': 'analyst'}, passed w...5.2182460.0223511[Female, Male][hired, not_hired][[10, 1], [15, 18]][[6.25, 4.75], [18.75, 14.25]][[3.75, -3.75], [-3.75, 3.75]]
\n", - "
" - ], - "text/plain": [ - " test_result phi_corr_coeff phi_bins \n", - "0 statistically significant result 0.39736 moderate \\\n", - "\n", - " four_fifths_test \n", - "0 passed with 4/5 test at 1.0 \\\n", - "\n", - " result_desc statistic pvalue \n", - "0 Testing for {'job_title': 'analyst'}, passed w... 5.218246 0.022351 \\\n", - "\n", - " dof tbl_rows tbl_cols tbl \n", - "0 1 [Female, Male] [hired, not_hired] [[10, 1], [15, 18]] \\\n", - "\n", - " expected_freq tbl_expected_diff \n", - "0 [[6.25, 4.75], [18.75, 14.25]] [[3.75, -3.75], [-3.75, 3.75]] " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_result" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "01672735-2ad5-42ae-9488-1962f3d0e63e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
test_resultphi_corr_coeffphi_binsfour_fifths_testresult_descstatisticpvaluedoftbl_rowtblexpected_freqtbl_expected_diff
0statistically significant result0.39736moderatefailed 4/5 test at 0.167Testing for {'job_title': 'analyst'}, based on...5.2182460.0223511[Female, Male][[10, 1], [15, 18]][[6.25, 4.75], [18.75, 14.25]][[3.75, -3.75], [-3.75, 3.75]]
\n", - "
" - ], - "text/plain": [ - " test_result phi_corr_coeff phi_bins \n", - "0 statistically significant result 0.39736 moderate \\\n", - "\n", - " four_fifths_test \n", - "0 failed 4/5 test at 0.167 \\\n", - "\n", - " result_desc statistic pvalue \n", - "0 Testing for {'job_title': 'analyst'}, based on... 5.218246 0.022351 \\\n", - "\n", - " dof tbl_row tbl expected_freq \n", - "0 1 [Female, Male] [[10, 1], [15, 18]] [[6.25, 4.75], [18.75, 14.25]] \\\n", - "\n", - " tbl_expected_diff \n", - "0 [[3.75, -3.75], [-3.75, 3.75]] " - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_result" - ] - }, - { - "cell_type": "markdown", - "id": "94114f77-caa8-41a3-900a-44317c84f4b7", - "metadata": {}, - "source": [ - "to do:\n", - " \n", - "implement these tests\n", - "\n", - "\n", - "https://en.wikipedia.org/wiki/Disparate_impact\n", - "\n", - "Add handler for filtered size of group must be ...\n", - "\n", - "# need to check this size\n", - "# https://online.stat.psu.edu/stat500/lesson/8/8.2#:~:text=That%20equates%20to%20the%20Chi,count%20of%20at%20least%205.\n", - "\n", - "# make sure at least 5 in each slice, then at least 50" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1d975b96-c8af-4e46-9521-0c7fbe442ff1", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "hrailabs_dev", - "language": "python", - "name": "hrailabs" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}