diff --git a/Apply_Rate.ipynb b/Apply_Rate.ipynb new file mode 100644 index 0000000..a81f4dd --- /dev/null +++ b/Apply_Rate.ipynb @@ -0,0 +1,2003 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Apply_Rate.ipynb", + "version": "0.3.2", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "accelerator": "TPU" + }, + "cells": [ + { + "metadata": { + "colab_type": "code", + "id": "y_s5rbXeIduB", + "colab": {} + }, + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support, accuracy_score, roc_auc_score" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "ISco9swtpr0G", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "code", + "id": "Nj1l0nzcIsV4", + "outputId": "f2e04170-ff43-4fa6-b88a-7c385026b504", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "cell_type": "code", + "source": [ + "df = pd.read_csv('./Apply_Rate_2019.csv')\n", + "df.shape" + ], + "execution_count": 2, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(1200890, 10)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 2 + } + ] + }, + { + "metadata": { + "id": "FP85NcoV742h", + "colab_type": "code", + "outputId": "e0a8f02b-b9c7-43f1-e29f-e788aa946adf", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 85 + } + }, + "cell_type": "code", + "source": [ + "df.columns" + ], + "execution_count": 3, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Index(['title_proximity_tfidf', 
'description_proximity_tfidf',\n", + " 'main_query_tfidf', 'query_jl_score', 'query_title_score', 'city_match',\n", + " 'job_age_days', 'apply', 'search_date_pacific', 'class_id'],\n", + " dtype='object')" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 3 + } + ] + }, + { + "metadata": { + "id": "75VbG78-79Ro", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "not_cat_cols = ['title_proximity_tfidf', 'description_proximity_tfidf', 'main_query_tfidf', 'query_jl_score', 'query_title_score', 'job_age_days']" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "code", + "id": "MSCrtPkiNSmL", + "outputId": "6b8da50e-68a8-44a6-daeb-26f9d3ac40a3", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 359 + } + }, + "cell_type": "code", + "source": [ + "df.head().T" + ], + "execution_count": 5, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " 0 1 \\\n", + "title_proximity_tfidf 0 11.874 \n", + "description_proximity_tfidf 0 0.043637 \n", + "main_query_tfidf 0 2.52708 \n", + "query_jl_score 0.00482431 0.0119353 \n", + "query_title_score 0.00662807 0.011208 \n", + "city_match 0 0 \n", + "job_age_days 5 5 \n", + "apply 0 0 \n", + "search_date_pacific 2018-01-21 2018-01-21 \n", + "class_id -7613806991329176388 -1361819269400927213 \n", + "\n", + " 2 3 \\\n", + "title_proximity_tfidf 11.874 0 \n", + "description_proximity_tfidf 0.043637 0 \n", + "main_query_tfidf 2.52708 3.69581 \n", + "query_jl_score 0.0119353 0.0130464 \n", + "query_title_score 0.011208 0.0333403 \n", + "city_match 0 0 \n", + "job_age_days 5 5 \n", + "apply 1 0 \n", + "search_date_pacific 2018-01-21 2018-01-23 \n", + "class_id -1361819269400927213 -3623880078591607667 \n", + "\n", + " 4 \n", + "title_proximity_tfidf NaN \n", + "description_proximity_tfidf NaN \n", + "main_query_tfidf 7.8689 \n", + "query_jl_score 0.00806452 \n", + "query_title_score 0.00877193 \n", + 
"city_match 0 \n", + "job_age_days 7 \n", + "apply 0 \n", + "search_date_pacific 2018-01-24 \n", + "class_id -7054510112423797411 " + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01234
title_proximity_tfidf011.87411.8740NaN
description_proximity_tfidf00.0436370.0436370NaN
main_query_tfidf02.527082.527083.695817.8689
query_jl_score0.004824310.01193530.01193530.01304640.00806452
query_title_score0.006628070.0112080.0112080.03334030.00877193
city_match00000
job_age_days55557
apply00100
search_date_pacific2018-01-212018-01-212018-01-212018-01-232018-01-24
class_id-7613806991329176388-1361819269400927213-1361819269400927213-3623880078591607667-7054510112423797411
\n", + "
" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 5 + } + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "fZNz52RpL1fh", + "outputId": "1288bf15-1fd8-4be6-a948-a0c0df4863c2", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 297 + } + }, + "cell_type": "code", + "source": [ + "df.describe()" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " title_proximity_tfidf description_proximity_tfidf main_query_tfidf \\\n", + "count 948319.000000 948319.000000 1.200890e+06 \n", + "mean 5.889800 0.096119 1.770301e+00 \n", + "std 9.749672 0.284249 2.237259e+00 \n", + "min 0.000000 0.000000 0.000000e+00 \n", + "25% 0.000000 0.000000 0.000000e+00 \n", + "50% 0.000000 0.000000 2.509947e-01 \n", + "75% 11.642839 0.063633 3.423265e+00 \n", + "max 93.516720 7.301161 1.774189e+01 \n", + "\n", + " query_jl_score query_title_score city_match job_age_days \\\n", + "count 1.200890e+06 1.200890e+06 944335.000000 1.200890e+06 \n", + "mean 1.711024e-02 3.580456e-02 0.563676 1.010130e+01 \n", + "std 1.220871e-02 4.886690e-02 0.495929 1.738912e+01 \n", + "min 1.000000e-06 9.998021e-07 0.000000 0.000000e+00 \n", + "25% 1.063830e-02 1.256780e-02 0.000000 2.000000e+00 \n", + "50% 1.578868e-02 2.066189e-02 1.000000 6.000000e+00 \n", + "75% 2.043710e-02 3.760018e-02 1.000000 1.400000e+01 \n", + "max 2.556893e-01 1.294088e+00 1.000000 1.340000e+03 \n", + "\n", + " apply class_id \n", + "count 1.200890e+06 1.200890e+06 \n", + "mean 8.993580e-02 -3.956263e+17 \n", + "std 2.860899e-01 5.664017e+18 \n", + "min 0.000000e+00 -9.049388e+18 \n", + "25% 0.000000e+00 -4.593390e+18 \n", + "50% 0.000000e+00 -1.614083e+18 \n", + "75% 0.000000e+00 4.315422e+18 \n", + "max 1.000000e+00 9.204124e+18 " + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
title_proximity_tfidfdescription_proximity_tfidfmain_query_tfidfquery_jl_scorequery_title_scorecity_matchjob_age_daysapplyclass_id
count948319.000000948319.0000001.200890e+061.200890e+061.200890e+06944335.0000001.200890e+061.200890e+061.200890e+06
mean5.8898000.0961191.770301e+001.711024e-023.580456e-020.5636761.010130e+018.993580e-02-3.956263e+17
std9.7496720.2842492.237259e+001.220871e-024.886690e-020.4959291.738912e+012.860899e-015.664017e+18
min0.0000000.0000000.000000e+001.000000e-069.998021e-070.0000000.000000e+000.000000e+00-9.049388e+18
25%0.0000000.0000000.000000e+001.063830e-021.256780e-020.0000002.000000e+000.000000e+00-4.593390e+18
50%0.0000000.0000002.509947e-011.578868e-022.066189e-021.0000006.000000e+000.000000e+00-1.614083e+18
75%11.6428390.0636333.423265e+002.043710e-023.760018e-021.0000001.400000e+010.000000e+004.315422e+18
max93.5167207.3011611.774189e+012.556893e-011.294088e+001.0000001.340000e+031.000000e+009.204124e+18
\n", + "
" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 6 + } + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "-03F32bhK3sM", + "outputId": "de1356d3-7f18-4db9-e2fa-400cc2782fb8", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + } + }, + "cell_type": "code", + "source": [ + "# Number of unique entries\n", + "df.astype(object).describe(include='all').loc['unique', :]" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "title_proximity_tfidf 225538\n", + "description_proximity_tfidf 342918\n", + "main_query_tfidf 517282\n", + "query_jl_score 181318\n", + "query_title_score 343323\n", + "city_match 2\n", + "job_age_days 491\n", + "apply 2\n", + "search_date_pacific 7\n", + "class_id 157\n", + "Name: unique, dtype: object" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 7 + } + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "jkXL6FOEIsVP", + "outputId": "1fd059ba-6e2a-41ff-c9b0-888190287617", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + } + }, + "cell_type": "code", + "source": [ + "# Number of Null Entries\n", + "df.isnull().sum(axis=0)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "title_proximity_tfidf 252571\n", + "description_proximity_tfidf 252571\n", + "main_query_tfidf 0\n", + "query_jl_score 0\n", + "query_title_score 0\n", + "city_match 256555\n", + "job_age_days 0\n", + "apply 0\n", + "search_date_pacific 0\n", + "class_id 0\n", + "dtype: int64" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 8 + } + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "Hdpth4B3K3u9", + "outputId": "1960176d-9950-4922-de9c-e755f2331827", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 229 + } + }, + "cell_type": "code", + "source": [ + "# Correlation Matrix\n", + "df.corr().style.background_gradient()" + 
], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
title_proximity_tfidf description_proximity_tfidf main_query_tfidf query_jl_score query_title_score city_match job_age_days apply class_id
title_proximity_tfidf10.1078180.7045790.2081320.371528-0.06069270.04313870.04824170.01719
description_proximity_tfidf0.10781810.02266450.09599290.00358764-0.01020310.0114363-0.003290660.00566131
main_query_tfidf0.7045790.022664510.01073820.209315-0.07738820.08754670.03905010.0185585
query_jl_score0.2081320.09599290.010738210.387003-0.01813720.01713820.05744310.0778891
query_title_score0.3715280.003587640.2093150.3870031-0.04222430.01698770.06383220.0618706
city_match-0.0606927-0.0102031-0.0773882-0.0181372-0.04222431-0.000613780.00308262-0.00458481
job_age_days0.04313870.01143630.08754670.01713820.0169877-0.000613781-0.01182860.00395271
apply0.0482417-0.003290660.03905010.05744310.06383220.00308262-0.011828610.0116747
class_id0.017190.005661310.01855850.07788910.0618706-0.004584810.003952710.01167471
" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 9 + } + ] + }, + { + "metadata": { + "id": "mLXEQ_VMpr0u", + "colab_type": "code", + "outputId": "13a6fa0c-0a71-4e46-af41-2ac58f479210", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1321 + } + }, + "cell_type": "code", + "source": [ + "df.hist(figsize=(20,20))" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([[,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ],\n", + " [,\n", + " ,\n", + " ]],\n", + " dtype=object)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 10 + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "metadata": { + "id": "gjZp5GCFpr00", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "avzys-1vpr03", + "colab_type": "code", + "outputId": "e6d9eeac-dcf9-4bfa-9d35-28553ba6f71f", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 102 + } + }, + "cell_type": "code", + "source": [ + "print(df['apply'].value_counts())\n", + "print(df['apply'].value_counts()[0] / len(df) * 100)\n", + "print(df['apply'].value_counts()[1] / len(df) * 100)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "0 1092887\n", + "1 108003\n", + "Name: apply, dtype: int64\n", + "91.00642023832324\n", + "8.993579761676756\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "VwxCMX4ppr07", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "df.fillna(0, inplace=True)\n", + "df['class_id'] = df['class_id'].astype('category').cat.codes" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "code", + "id": "PtnbSobGK3yA", + "colab": {} + }, + "cell_type": "code", + "source": [ + "from sklearn.preprocessing import StandardScaler\n", + "# Preprocessing\n", + "def preprocess(to_remove, normalize=False):\n", + " #Normalization\n", + " df[not_cat_cols] = (df[not_cat_cols] - df[not_cat_cols].mean()) / (df[not_cat_cols].max() - df[not_cat_cols].min())\n", + " \n", + " df_train = df[df.search_date_pacific != '2018-01-27']\n", + " df_test = df[df.search_date_pacific == '2018-01-27']\n", + " \n", + " global X_train\n", + " X_train = df_train.drop(columns=to_remove).values\n", + " global y_train \n", + " y_train = df_train['apply'].values\n", + " global X_test \n", + " X_test = df_test.drop(columns=to_remove).values\n", + " global y_test \n", + " y_test 
= df_test['apply'].values\n", + " \n", + " #Normalization\n", + "# if normalize: \n", + "# scaler = StandardScaler()\n", + "# scaler.fit(X_train)\n", + "# X_train = scaler.transform(X_train)\n", + "# X_test = scaler.transform(X_test)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "code", + "id": "u1bgS9EBZTvx", + "colab": {} + }, + "cell_type": "code", + "source": [ + "def classify(clf):\n", + " clf.fit(X_train, y_train)\n", + " y_pred = clf.predict_proba(X_test)\n", + " print(clf)\n", + " print(\"ROC AUC:\", roc_auc_score(y_test, y_pred[:,1]))\n", + " y_pred = clf.predict(X_test)\n", + " print(classification_report(y_test, y_pred))\n", + " print(\"Detailed confusion matrix:\")\n", + " print(confusion_matrix(y_test, y_pred))\n", + " print(\"Accuracy Score:\", accuracy_score(y_test, y_pred))\n", + " \n", + " \n", + "from imblearn.over_sampling import SMOTE\n", + "\n", + "def classify_with_smote(clf):\n", + " smote = SMOTE(random_state=0, n_jobs=-1, sampling_strategy=4/6)\n", + " global X_train, y_train \n", + " X_train, y_train = smote.fit_resample(X_train, y_train)\n", + " classify(clf)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "code", + "id": "UoK60GcGZT61", + "colab": {} + }, + "cell_type": "code", + "source": [ + "to_remove = ['apply', 'search_date_pacific', 'class_id']\n", + "preprocess(to_remove)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "ply2Gwi-DNZX", + "colab_type": "code", + "outputId": "6aa4d57a-e73c-4dd0-fdfd-045fe4fd993b", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 391 + } + }, + "cell_type": "code", + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "clf = LogisticRegression(random_state=0, class_weight='balanced', n_jobs=-1)\n", + "classify(clf)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + 
"c:\\python36\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", + " FutureWarning)\n", + "c:\\python36\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:1300: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 4.\n", + " \" = {}.\".format(effective_n_jobs(self.n_jobs)))\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "LogisticRegression(C=1.0, class_weight='balanced', dual=False,\n", + " fit_intercept=True, intercept_scaling=1, max_iter=100,\n", + " multi_class='warn', n_jobs=-1, penalty='l2', random_state=0,\n", + " solver='warn', tol=0.0001, verbose=0, warm_start=False)\n", + "ROC AUC: 0.5883860726501579\n", + " precision recall f1-score support\n", + "\n", + " 0 0.93 0.66 0.77 105940\n", + " 1 0.12 0.47 0.19 10586\n", + "\n", + " micro avg 0.64 0.64 0.64 116526\n", + " macro avg 0.52 0.56 0.48 116526\n", + "weighted avg 0.85 0.64 0.72 116526\n", + "\n", + "Detailed confusion matrix:\n", + "[[69921 36019]\n", + " [ 5658 4928]]\n", + "Accuracy Score: 0.6423373324408287\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "_v-XzpYeDSMh", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "dtty4IbnDR8s", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "code", + "id": "ysIqx5ELKIlV", + "colab": {} + }, + "cell_type": "code", + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "iR1UKr8vpr1J", + "colab_type": "code", + "outputId": "6c9927e4-9c19-4004-d15c-e3aaf4f50fef", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 357 + } + }, + "cell_type": "code", + 
"source": [ + "from xgboost import XGBClassifier\n", + "\n", + "clf = XGBClassifier(random_state=0, max_depth=8, n_jobs=-1, scale_pos_weight=6)\n", + "classify(clf)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", + " colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n", + " max_depth=8, min_child_weight=1, missing=None, n_estimators=100,\n", + " n_jobs=-1, nthread=None, objective='binary:logistic',\n", + " random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=6,\n", + " seed=None, silent=True, subsample=1)\n", + "ROC AUC: 0.605199221682646\n", + " precision recall f1-score support\n", + "\n", + " 0 0.92 0.95 0.93 105940\n", + " 1 0.20 0.13 0.16 10586\n", + "\n", + " micro avg 0.87 0.87 0.87 116526\n", + " macro avg 0.56 0.54 0.54 116526\n", + "weighted avg 0.85 0.87 0.86 116526\n", + "\n", + "Detailed confusion matrix:\n", + "[[100476 5464]\n", + " [ 9217 1369]]\n", + "Accuracy Score: 0.8740109503458455\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "n86ejPSp5lcQ", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "xtQ3D4CGpr1M", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "p54c6Dk6pr1Q", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "to_remove = ['apply', 'search_date_pacific', 'class_id']\n", + "preprocess(to_remove, normalize=True)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "dT2lBa4mpr1T", + "colab_type": "code", + "outputId": "106e4d82-bd76-4391-9e43-a8b39be06bd8", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 391 + } + }, + "cell_type": "code", + "source": [ + "clf = 
XGBClassifier(random_state=0, max_depth=8, n_jobs=-1)\n", + "classify(clf)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", + " colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n", + " max_depth=8, min_child_weight=1, missing=None, n_estimators=100,\n", + " n_jobs=-1, nthread=None, objective='binary:logistic',\n", + " random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,\n", + " seed=None, silent=True, subsample=1)\n", + "ROC AUC: 0.6068051184895856\n", + " precision recall f1-score support\n", + "\n", + " 0 0.91 1.00 0.95 105940\n", + " 1 0.00 0.00 0.00 10586\n", + "\n", + " micro avg 0.91 0.91 0.91 116526\n", + " macro avg 0.45 0.50 0.48 116526\n", + "weighted avg 0.83 0.91 0.87 116526\n", + "\n", + "Detailed confusion matrix:\n", + "[[105940 0]\n", + " [ 10586 0]]\n", + "Accuracy Score: 0.9091533220053893\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "c:\\python36\\lib\\site-packages\\sklearn\\metrics\\classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", + " 'precision', 'predicted', average, warn_for)\n" + ], + "name": "stderr" + } + ] + }, + { + "metadata": { + "id": "b4XwOKjEpr1X", + "colab_type": "code", + "outputId": "59747b3d-a31f-46ec-88f8-f57286319bc0", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 357 + } + }, + "cell_type": "code", + "source": [ + "clf = XGBClassifier(random_state=0, max_depth=8, n_jobs=-1, scale_pos_weight=6)\n", + "classify(clf)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", + " colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n", + " max_depth=8, min_child_weight=1, missing=None, n_estimators=100,\n", + " n_jobs=-1, 
nthread=None, objective='binary:logistic',\n", + " random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=6,\n", + " seed=None, silent=True, subsample=1)\n", + "ROC AUC: 0.605199221682646\n", + " precision recall f1-score support\n", + "\n", + " 0 0.92 0.95 0.93 105940\n", + " 1 0.20 0.13 0.16 10586\n", + "\n", + " micro avg 0.87 0.87 0.87 116526\n", + " macro avg 0.56 0.54 0.54 116526\n", + "weighted avg 0.85 0.87 0.86 116526\n", + "\n", + "Detailed confusion matrix:\n", + "[[100476 5464]\n", + " [ 9217 1369]]\n", + "Accuracy Score: 0.8740109503458455\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "tt2x67Z9upui", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "wcVOSenYuprC", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 391 + }, + "outputId": "781cf3ca-c275-4ff3-b818-1365f59ede8d" + }, + "cell_type": "code", + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "clf = LogisticRegression(random_state=0, class_weight='balanced', n_jobs=-1)\n", + "classify_with_smote(clf)" + ], + "execution_count": 14, + "outputs": [ + { + "output_type": "stream", + "text": [ + "c:\\python36\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", + " FutureWarning)\n", + "c:\\python36\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:1300: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. 
Got 'n_jobs' = 4.\n", + " \" = {}.\".format(effective_n_jobs(self.n_jobs)))\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "LogisticRegression(C=1.0, class_weight='balanced', dual=False,\n", + " fit_intercept=True, intercept_scaling=1, max_iter=100,\n", + " multi_class='warn', n_jobs=-1, penalty='l2', random_state=0,\n", + " solver='warn', tol=0.0001, verbose=0, warm_start=False)\n", + "ROC AUC: 0.5886791449776352\n", + " precision recall f1-score support\n", + "\n", + " 0 0.93 0.65 0.76 105940\n", + " 1 0.12 0.48 0.19 10586\n", + "\n", + " micro avg 0.64 0.64 0.64 116526\n", + " macro avg 0.52 0.56 0.48 116526\n", + "weighted avg 0.85 0.64 0.71 116526\n", + "\n", + "Detailed confusion matrix:\n", + "[[68991 36949]\n", + " [ 5555 5031]]\n", + "Accuracy Score: 0.6352402039029916\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "O1Ri7Qrppr1a", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 357 + }, + "outputId": "5713e2a4-c311-47f0-d663-fb02455397db" + }, + "cell_type": "code", + "source": [ + "from xgboost import XGBClassifier\n", + "\n", + "clf = XGBClassifier(random_state=0, max_depth=8, n_jobs=-1)\n", + "classify_with_smote(clf)" + ], + "execution_count": 15, + "outputs": [ + { + "output_type": "stream", + "text": [ + "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", + " colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n", + " max_depth=8, min_child_weight=1, missing=None, n_estimators=100,\n", + " n_jobs=-1, nthread=None, objective='binary:logistic',\n", + " random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,\n", + " seed=None, silent=True, subsample=1)\n", + "ROC AUC: 0.5579752325505624\n", + " precision recall f1-score support\n", + "\n", + " 0 0.91 1.00 0.95 105940\n", + " 1 0.21 0.00 0.01 10586\n", + "\n", + " micro avg 0.91 0.91 0.91 116526\n", + " macro avg 0.56 0.50 0.48 116526\n", + "weighted avg 0.85 0.91 0.87 
116526\n", + "\n", + "Detailed confusion matrix:\n", + "[[105835 105]\n", + " [ 10558 28]]\n", + "Accuracy Score: 0.9084925252733296\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "_Wtlbu4cpr1e", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "qGpFL24Dpr1g", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "code", + "id": "54TQ-BPdCnAO", + "colab": {} + }, + "cell_type": "code", + "source": [ + "to_remove = ['apply', 'search_date_pacific']\n", + "preprocess(to_remove)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "code", + "id": "fUd-4KTGCm6D", + "outputId": "12f27aaa-8de7-4b63-b748-3c7acb7bfbe6", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 391 + } + }, + "cell_type": "code", + "source": [ + "clf = XGBClassifier(random_state=0, max_depth=8, n_jobs=-1)\n", + "classify(clf)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", + " colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n", + " max_depth=8, min_child_weight=1, missing=None, n_estimators=100,\n", + " n_jobs=-1, nthread=None, objective='binary:logistic',\n", + " random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,\n", + " seed=None, silent=True, subsample=1)\n", + "ROC AUC: 0.622422824896411\n", + " precision recall f1-score support\n", + "\n", + " 0 0.91 1.00 0.95 105940\n", + " 1 0.00 0.00 0.00 10586\n", + "\n", + " micro avg 0.91 0.91 0.91 116526\n", + " macro avg 0.45 0.50 0.48 116526\n", + "weighted avg 0.83 0.91 0.87 116526\n", + "\n", + "Detailed confusion matrix:\n", + "[[105940 0]\n", + " [ 10586 0]]\n", + "Accuracy Score: 0.9091533220053893\n" + ], + 
"name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "c:\\python36\\lib\\site-packages\\sklearn\\metrics\\classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", + " 'precision', 'predicted', average, warn_for)\n" + ], + "name": "stderr" + } + ] + }, + { + "metadata": { + "id": "7NQSHkahpr1u", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "Q0FRcPUHpr1x", + "colab_type": "code", + "outputId": "aa544c0b-2b2a-4bdf-cd38-e31f3da2607c", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 391 + } + }, + "cell_type": "code", + "source": [ + "clf = LogisticRegression(random_state=0, class_weight='balanced', n_jobs=-1)\n", + "classify(clf)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "c:\\python36\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", + " FutureWarning)\n", + "c:\\python36\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:1300: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. 
Got 'n_jobs' = 4.\n", + " \" = {}.\".format(effective_n_jobs(self.n_jobs)))\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "LogisticRegression(C=1.0, class_weight='balanced', dual=False,\n", + " fit_intercept=True, intercept_scaling=1, max_iter=100,\n", + " multi_class='warn', n_jobs=-1, penalty='l2', random_state=0,\n", + " solver='warn', tol=0.0001, verbose=0, warm_start=False)\n", + "ROC AUC: 0.5880638495794542\n", + " precision recall f1-score support\n", + "\n", + " 0 0.93 0.66 0.77 105940\n", + " 1 0.12 0.47 0.19 10586\n", + "\n", + " micro avg 0.64 0.64 0.64 116526\n", + " macro avg 0.52 0.56 0.48 116526\n", + "weighted avg 0.85 0.64 0.72 116526\n", + "\n", + "Detailed confusion matrix:\n", + "[[69888 36052]\n", + " [ 5657 4929]]\n", + "Accuracy Score: 0.6420627156171155\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "eMhUBEHFpr1z", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "QPukJmg1pr15", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "to_remove = ['apply', 'search_date_pacific']\n", + "preprocess(to_remove, normalize=True)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "R8kBQxyQpr19", + "colab_type": "code", + "outputId": "69aefcf7-df86-4a24-df94-1aa9f458bf5c", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 391 + } + }, + "cell_type": "code", + "source": [ + "clf = XGBClassifier(random_state=0, max_depth=8, n_jobs=-1, reg_lambda=6)\n", + "classify(clf)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", + " colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n", + " max_depth=8, min_child_weight=1, missing=None, n_estimators=100,\n", + " n_jobs=-1, nthread=None, 
objective='binary:logistic',\n", + " random_state=0, reg_alpha=0, reg_lambda=6, scale_pos_weight=1,\n", + " seed=None, silent=True, subsample=1)\n", + "ROC AUC: 0.6235080462899393\n", + " precision recall f1-score support\n", + "\n", + " 0 0.91 1.00 0.95 105940\n", + " 1 0.00 0.00 0.00 10586\n", + "\n", + " micro avg 0.91 0.91 0.91 116526\n", + " macro avg 0.45 0.50 0.48 116526\n", + "weighted avg 0.83 0.91 0.87 116526\n", + "\n", + "Detailed confusion matrix:\n", + "[[105940 0]\n", + " [ 10586 0]]\n", + "Accuracy Score: 0.9091533220053893\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "c:\\python36\\lib\\site-packages\\sklearn\\metrics\\classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", + " 'precision', 'predicted', average, warn_for)\n" + ], + "name": "stderr" + } + ] + }, + { + "metadata": { + "id": "GrVxrMi9pr2B", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "NZbA-F7Dpr2D", + "colab_type": "code", + "outputId": "23484e5f-f88d-4ac5-a0e2-0bfb85831aac", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 391 + } + }, + "cell_type": "code", + "source": [ + "clf = LogisticRegression(C= 0.001, random_state=0, class_weight='balanced', n_jobs=-1)\n", + "classify(clf)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "c:\\python36\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", + " FutureWarning)\n", + "c:\\python36\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:1300: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. 
Got 'n_jobs' = 4.\n", + " \" = {}.\".format(effective_n_jobs(self.n_jobs)))\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "LogisticRegression(C=0.001, class_weight='balanced', dual=False,\n", + " fit_intercept=True, intercept_scaling=1, max_iter=100,\n", + " multi_class='warn', n_jobs=-1, penalty='l2', random_state=0,\n", + " solver='warn', tol=0.0001, verbose=0, warm_start=False)\n", + "ROC AUC: 0.5797081134261731\n", + " precision recall f1-score support\n", + "\n", + " 0 0.92 0.69 0.79 105940\n", + " 1 0.12 0.43 0.19 10586\n", + "\n", + " micro avg 0.66 0.66 0.66 116526\n", + " macro avg 0.52 0.56 0.49 116526\n", + "weighted avg 0.85 0.66 0.73 116526\n", + "\n", + "Detailed confusion matrix:\n", + "[[72615 33325]\n", + " [ 6034 4552]]\n", + "Accuracy Score: 0.6622298886085509\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "8fmipsQ8pr2I", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "5wl0o0wYpr2K", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "#Took forever to run\n", + "\n", + "# from sklearn.svm import SVC\n", + "\n", + "# clf = SVC(probability=True, C=0.01)\n", + "# classify(clf)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "NnCEPM1upr2M", + "colab_type": "code", + "outputId": "d0de354f-be06-4e67-f428-471749bb6121", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 357 + } + }, + "cell_type": "code", + "source": [ + "clf = XGBClassifier(random_state=0, max_depth=8, n_jobs=-1, reg_lambda=6)\n", + "classify_with_smote(clf)\n" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", + " colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n", + " max_depth=8, min_child_weight=1, missing=None, 
n_estimators=100,\n", + " n_jobs=-1, nthread=None, objective='binary:logistic',\n", + " random_state=0, reg_alpha=0, reg_lambda=6, scale_pos_weight=1,\n", + " seed=None, silent=True, subsample=1)\n", + "ROC AUC: 0.5835380348539883\n", + " precision recall f1-score support\n", + "\n", + " 0 0.91 1.00 0.95 105940\n", + " 1 0.25 0.00 0.01 10586\n", + "\n", + " micro avg 0.91 0.91 0.91 116526\n", + " macro avg 0.58 0.50 0.48 116526\n", + "weighted avg 0.85 0.91 0.87 116526\n", + "\n", + "Detailed confusion matrix:\n", + "[[105827 113]\n", + " [ 10548 38]]\n", + "Accuracy Score: 0.9085096888248116\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "WsVIUdsXpr2R", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "XWuApdaVOeN9", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "uWAE7UW2OeJt", + "colab_type": "code", + "outputId": "77142bf7-7ce2-406d-cdb5-f4276959319e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 391 + } + }, + "cell_type": "code", + "source": [ + "clf = LogisticRegression(random_state=0, class_weight='balanced', n_jobs=-1)\n", + "classify_with_smote(clf)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "c:\\python36\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", + " FutureWarning)\n", + "c:\\python36\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:1300: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. 
Got 'n_jobs' = 4.\n", + " \" = {}.\".format(effective_n_jobs(self.n_jobs)))\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "LogisticRegression(C=1.0, class_weight='balanced', dual=False,\n", + " fit_intercept=True, intercept_scaling=1, max_iter=100,\n", + " multi_class='warn', n_jobs=-1, penalty='l2', random_state=0,\n", + " solver='warn', tol=0.0001, verbose=0, warm_start=False)\n", + "ROC AUC: 0.5876126158339006\n", + " precision recall f1-score support\n", + "\n", + " 0 0.93 0.63 0.75 105940\n", + " 1 0.12 0.49 0.19 10586\n", + "\n", + " micro avg 0.62 0.62 0.62 116526\n", + " macro avg 0.52 0.56 0.47 116526\n", + "weighted avg 0.85 0.62 0.70 116526\n", + "\n", + "Detailed confusion matrix:\n", + "[[66972 38968]\n", + " [ 5376 5210]]\n", + "Accuracy Score: 0.6194497365394848\n" + ], + "name": "stdout" + } + ] + } + ] +} \ No newline at end of file diff --git a/Report.pdf b/Report.pdf new file mode 100644 index 0000000..81d94dc Binary files /dev/null and b/Report.pdf differ diff --git a/apply_rate.py b/apply_rate.py new file mode 100644 index 0000000..d9ae7a6 --- /dev/null +++ b/apply_rate.py @@ -0,0 +1,182 @@ +# -*- coding: utf-8 -*- +"""Apply_Rate.ipynb + +Automatically generated by Colaboratory. 
def preprocess(to_remove, normalize=False, *, data=None, numeric_cols=None,
               test_date='2018-01-27'):
    """Split the data by search date and publish scaled train/test arrays.

    Rows whose ``search_date_pacific`` equals ``test_date`` form the test
    set; every other row is training data.  The continuous columns are
    mean-normalised as ``(x - mean) / (max - min)``.  The statistics come
    from the training rows only, so nothing about the held-out day leaks
    into the features, and the source frame itself is never modified, so
    repeated calls are independent of each other.

    Args:
        to_remove: Column labels dropped from the feature matrices
            (typically the target and the date column).
        normalize: When true, additionally standardise every remaining
            feature column to zero mean and unit variance, again using
            training-set statistics only.
        data: Frame to split.  Defaults to the module-level ``df``.
        numeric_cols: Continuous columns to mean-normalise.  Defaults to
            the module-level ``not_cat_cols``.
        test_date: ``search_date_pacific`` value that marks the test rows.

    Returns:
        ``(X_train, y_train, X_test, y_test)``.  The same four arrays are
        also stored in the module globals of those names, which
        ``classify`` reads by default.
    """
    global X_train, y_train, X_test, y_test

    frame = df if data is None else data
    cols = list(not_cat_cols if numeric_cols is None else numeric_cols)

    held_out = frame['search_date_pacific'] == test_date
    train = frame.loc[~held_out].copy()
    test = frame.loc[held_out].copy()

    if cols:
        center = train[cols].mean()
        spread = train[cols].max() - train[cols].min()
        # A constant column has zero range; dividing by 1 keeps it at 0
        # instead of turning it into NaN/inf.
        spread = spread.replace(0, 1)
        train[cols] = (train[cols] - center) / spread
        test[cols] = (test[cols] - center) / spread

    X_train = train.drop(columns=to_remove).values
    y_train = train['apply'].values
    X_test = test.drop(columns=to_remove).values
    y_test = test['apply'].values

    if normalize:
        X_train = X_train.astype(float)
        X_test = X_test.astype(float)
        mu = X_train.mean(axis=0)
        sigma = X_train.std(axis=0)
        sigma[sigma == 0] = 1.0  # leave zero-variance features unscaled
        X_train = (X_train - mu) / sigma
        X_test = (X_test - mu) / sigma

    return X_train, y_train, X_test, y_test


def classify(clf, X=None, y=None):
    """Fit ``clf`` and print its evaluation on the held-out test split.

    Prints the estimator, ROC AUC (from the positive-class probability),
    the classification report, the confusion matrix and the accuracy.

    Args:
        clf: Estimator exposing ``fit``, ``predict_proba`` and ``predict``.
        X: Training features.  Defaults to the global ``X_train``.
        y: Training labels.  Defaults to the global ``y_train``.
    """
    train_X = X_train if X is None else X
    train_y = y_train if y is None else y

    clf.fit(train_X, train_y)
    positive_scores = clf.predict_proba(X_test)[:, 1]
    print(clf)
    print("ROC AUC:", roc_auc_score(y_test, positive_scores))

    predicted = clf.predict(X_test)
    print(classification_report(y_test, predicted))
    print("Detailed confusion matrix:")
    print(confusion_matrix(y_test, predicted))
    print("Accuracy Score:", accuracy_score(y_test, predicted))


def classify_with_smote(clf):
    """Oversample the training split with SMOTE, then fit and evaluate ``clf``.

    The resampled arrays are handed straight to ``classify``; the global
    ``X_train`` / ``y_train`` are left untouched, so later calls to
    ``classify`` or to this function still start from the real training
    data rather than from previously synthesised rows.
    """
    from imblearn.over_sampling import SMOTE

    smote = SMOTE(random_state=0, n_jobs=-1, sampling_strategy=4/6)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    classify(clf, X_resampled, y_resampled)
+ + +#Took forever to run + +# from sklearn.svm import SVC + +# clf = SVC(probability=True, C=0.01) +# classify(clf) + +clf = XGBClassifier(random_state=0, max_depth=8, n_jobs=-1, reg_lambda=6) +classify_with_smote(clf) + + + + + +clf = LogisticRegression(random_state=0, class_weight='balanced', n_jobs=-1) +classify_with_smote(clf) \ No newline at end of file