Word2Vec Classification (#24)

Word2Vec Embeddings, Dimensionality Reduction, and Classification
s2t2 · Nov 19, 2023 · d4ef2eb · d4ef2eb
1 parent e52e04e
commit d4ef2eb
Show file tree

Hide file tree

Showing 181 changed files with 34,562 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -53,7 +53,7 @@ BUCKET_NAME="my-bucket"
 
 ## Usage
 
-### OpenAI Embeddings
+### OpenAI Service
 
 Fetch some example embeddings from OpenAI API:
 
@@ -74,11 +74,18 @@ python -m app.dataset
 
 Perform machine learning and other analyses on the data:
 
+OpenAI Embeddings:
+
   + [Dimensionality Reduction](app/reduction/README.md)
   + [Clustering](app/clustering/README.md)
   + [Classification](app/classification/README.md)
   + [Reduced Classification](app/reduced_classification/README.md)
 
+Word2Vec Embeddings:
+
+  + [Dimensionality Reduction](app/word2vec_embeddings/README.md)
+  + [Classification](app/word2vec_classification/README.md)
+
 
 ## Testing
 

diff --git a/app/classification/logistic_regression.py b/app/classification/logistic_regression.py
@@ -10,8 +10,8 @@
 
 class LogisticRegressionPipeline(ClassificationPipeline):
 
-    def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None):
-        super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath)
+    def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None, will_upload=False):
+        super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath, will_upload=will_upload)
 
         self.model = LogisticRegression(random_state=99) #multi_class="auto"
         self.model_dirname = "logistic_regression"

diff --git a/app/classification/pipeline.py b/app/classification/pipeline.py
@@ -33,7 +33,7 @@
 class ClassificationPipeline(ABC):
     """Supports binary and multiclass classification."""
 
-    def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_folds=K_FOLDS, results_dirpath=None):
+    def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_folds=K_FOLDS, results_dirpath=None, will_upload=True):
 
         self.ds = ds or Dataset()
         self.x_scale = x_scale
@@ -69,6 +69,8 @@ def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_fo
         self.k_folds = k_folds
         self._results_dirpath = results_dirpath
 
+        self.will_upload = bool(will_upload)
+
         # values set after training:
         self.gs = None
         self.results = None
@@ -80,6 +82,7 @@ def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_fo
         self.model_dirname = None
         self.param_grid = param_grid or {}
 
+
     @property
     def model_type(self):
         return self.model.__class__.__name__
@@ -97,12 +100,12 @@ def perform(self):
             self.plot_roc_curve()
 
         # upload to cloud storage :-D
-        self.storage = ModelStorage(local_dirpath=self.results_dirpath)
-        self.storage.save_and_upload_model(self.model)
+        if self.will_upload:
+            self.storage = ModelStorage(local_dirpath=self.results_dirpath)
+            self.storage.save_and_upload_model(self.model)
 
 
     def train_eval(self):
-
         self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x, self.y, shuffle=True, test_size=0.2, random_state=99)
         print("X TRAIN:", self.x_train.shape)
         print("Y TRAIN:", self.y_train.shape)

diff --git a/app/classification/random_forest.py b/app/classification/random_forest.py
@@ -7,8 +7,8 @@
 
 class RandomForestPipeline(ClassificationPipeline):
 
-    def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None):
-        super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath)
+    def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None, will_upload=False):
+        super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath, will_upload=will_upload)
 
         self.model = RandomForestClassifier(random_state=99)
         self.model_dirname = "random_forest"

diff --git a/app/classification/xgboost.py b/app/classification/xgboost.py
@@ -21,8 +21,8 @@
 
 class XGBoostPipeline(ClassificationPipeline):
 
-    def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None):
-        super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath)
+    def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None, will_upload=False):
+        super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath, will_upload=will_upload)
 
         # UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release.
         # To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].

diff --git a/app/word2vec_classification/README.md b/app/word2vec_classification/README.md
@@ -0,0 +1,13 @@
+## Word2Vec Classification
+
+Compile the word2vec embeddings dataset (document vectors merged with the original user labels) and save it:
+
+```sh
+python -m app.word2vec_classification.dataset
+```
+
+Perform classification using the word2vec embeddings dataset:
+
+```sh
+FIG_SAVE=true FIG_SHOW=false python -m app.word2vec_classification.job
+```
diff --git a/app/word2vec_classification/dataset.py b/app/word2vec_classification/dataset.py
@@ -0,0 +1,72 @@
+
+import os
+from functools import cached_property
+from pandas import read_csv
+
+from app import DATA_DIRPATH
+from app.dataset import Dataset
+from app.word2vec_embeddings.pipeline import WORD2VEC_RESULTS_DIRPATH
+
+
+WORD2VEC_EMBEDDINGS_CSV_FILEPATH = os.path.join(WORD2VEC_RESULTS_DIRPATH, "document_vectors.csv")
+
+WORD2VEC_DATASET_PATH = os.path.join(DATA_DIRPATH, "word2vec", "botometer_sample_word2vec_embeddings_20230825.csv.gz")
+
+class Word2VecDataset():
+
+    def __init__(self, force_recompile=False):
+
+        self.csv_filepath = WORD2VEC_DATASET_PATH
+
+        #super().__init__(csv_filepath=WORD2VEC_DATASET_PATH)
+
+        self.force_recompile = force_recompile
+
+        #self.title = f"Word2Vec Embeddings"
+
+        #breakpoint()
+        #self.feature_cols = "TODO:" # feature_colnames(self.reducer_name, self.n_components)
+
+
+    @cached_property
+    def df(self):
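+        """Load the dataset from the cached CSV file if present (and not force_recompile); otherwise merge the user labels with the word2vec document vectors on "user_id", write the result to the CSV file, and return it."""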
+        if os.path.isfile(self.csv_filepath) and not self.force_recompile:
+            print("LOADING EXISTING DATASET FROM FILE...")
+            return read_csv(self.csv_filepath)
+        else:
+            print("COMPILING DATASET FROM RESULTS FILES...")
+            ds = Dataset()
+            labels_df = ds.labels #[colname for colname in  df.columns if not colname.isnumeric()]
+            embeddings_df = read_csv(WORD2VEC_EMBEDDINGS_CSV_FILEPATH)
+            df = labels_df.merge(embeddings_df, left_on="user_id", right_on="user_id")
+
+            # write dataset (for faster loading later):
+            df.to_csv(self.csv_filepath, index=False)
+            return df
+
+
+    @cached_property
+    def x(self):
+        """Feature matrix: a copy of the columns of df named by feature_cols."""
+        return self.df[self.feature_cols].copy()
+
+    @property
+    def feature_cols(self):
+        """Names of all columns in df whose name is numeric (the word2vec embedding dimensions, e.g. "0" through "99")."""
+        return [colname for colname in  self.df.columns if colname.isnumeric()]
+
+
+    #@property
+    #def label_cols(self):
+    #    return [colname for colname in  self.df.columns if not colname.isnumeric()]
+
+
+
+if __name__ == "__main__":
+
+
+
+    ds = Word2VecDataset()
+
+    print(ds.df.head())
diff --git a/app/word2vec_classification/job.py b/app/word2vec_classification/job.py
@@ -0,0 +1,35 @@
+import os
+
+from app import RESULTS_DIRPATH
+from app.classification import Y_COLS, Y_COLS_BINARY, Y_COLS_MULTICLASS
+from app.classification.logistic_regression import LogisticRegressionPipeline
+from app.classification.random_forest import RandomForestPipeline
+from app.classification.xgboost import XGBoostPipeline
+
+from app.word2vec_embeddings.pipeline import WORD2VEC_RESULTS_DIRPATH
+from app.word2vec_classification.dataset import Word2VecDataset
+
+
+CLASSIFICATION_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, "word2vec_classification")
+
+
+if __name__ == "__main__":
+
+    ds = Word2VecDataset()
+
+    will_upload = True
+    for y_col in Y_COLS:
+        results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, "logistic_regression")
+        pipeline = LogisticRegressionPipeline(ds=ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload)
+        pipeline.perform()
+
+        #continue
+
+        results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, "xgboost")
+        pipeline = XGBoostPipeline(ds=ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload)
+        pipeline.perform()
+
+        # the slowest can go last:
+        results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, "random_forest")
+        pipeline = RandomForestPipeline(ds=ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload)
+        pipeline.perform()
diff --git a/app/word2vec_embeddings/README.md b/app/word2vec_embeddings/README.md
@@ -11,6 +11,8 @@ python -m app.word2vec_embeddings.pipeline
 # FIG_SAVE=true FIG_SHOW=true python -m app.word2vec_embeddings.pipeline
 ```
 
+### Dimensionality Reduction
+
 Perform dimensionality reduction on the resulting word and document embeddings, respectively:
 
 ```sh

diff --git a/app/word2vec_embeddings/pipeline.py b/app/word2vec_embeddings/pipeline.py
@@ -168,6 +168,7 @@ def save_document_vectors(self):
 
     ds = Dataset()
     df = ds.df
+    df.index = df["user_id"]
 
     #df["tokens"] = df["tweet_texts"].apply(tokenizer)
     #print(df["tokens"].head())

diff --git a/data/word2vec/.gitkeep b/data/word2vec/.gitkeep
diff --git a/index.html b/index.html
@@ -38,7 +38,13 @@ <h1>Word2Vec Embeddings 2023</h1>
         <p class="lead">How about embeddings from Word2Vec?</h2>
 
         <section>
-            <h3><a href="results/word2vec_embeddings/index.html">Dimensionality Reduction Results</a></h3>
+            <h3><a href="results/word2vec_embeddings/index.html#user-embeddings-container">Dimensionality Reduction Results</a></h3>
+        </section>
+
+
+        <section>
+            <h3><a href="results/word2vec_classification/index.html">Classification Results</a></h3>
+
         </section>
 
         <hr>

diff --git a/results/word2vec_classification/fourway_label/logistic_regression/confusion.html b/results/word2vec_classification/fourway_label/logistic_regression/confusion.html
diff --git a/results/word2vec_classification/fourway_label/logistic_regression/confusion.png b/results/word2vec_classification/fourway_label/logistic_regression/confusion.png
diff --git a/results/word2vec_classification/fourway_label/logistic_regression/explainability.json b/results/word2vec_classification/fourway_label/logistic_regression/explainability.json
@@ -0,0 +1,105 @@
+{
+    "intercept": 0.9382,
+    "coefs": {
+        "0": -0.2987,
+        "1": 1.1251,
+        "2": 0.5167,
+        "3": 0.7698,
+        "4": 0.6621,
+        "5": -0.3478,
+        "6": -0.2379,
+        "7": 0.4313,
+        "8": 0.5406,
+        "9": -0.4077,
+        "10": -0.4653,
+        "11": -1.2287,
+        "12": -1.5283,
+        "13": 0.2226,
+        "14": -0.1849,
+        "15": -0.8318,
+        "16": -0.8783,
+        "17": 1.0968,
+        "18": -0.2249,
+        "19": 0.9544,
+        "20": 0.6006,
+        "21": -0.4967,
+        "22": -0.4993,
+        "23": -0.113,
+        "24": -0.2544,
+        "25": -0.5919,
+        "26": -0.095,
+        "27": -0.1456,
+        "28": 0.1785,
+        "29": 0.4678,
+        "30": 0.0852,
+        "31": 0.9346,
+        "32": -0.8233,
+        "33": 0.4591,
+        "34": -0.5005,
+        "35": 0.6023,
+        "36": -1.2788,
+        "37": -0.6132,
+        "38": 1.2333,
+        "39": -0.1937,
+        "40": 0.7407,
+        "41": 1.439,
+        "42": -0.2891,
+        "43": -0.4341,
+        "44": 0.5409,
+        "45": -0.4362,
+        "46": -0.502,
+        "47": 0.3304,
+        "48": -0.9698,
+        "49": -0.4308,
+        "50": -0.4414,
+        "51": 0.3116,
+        "52": -0.2524,
+        "53": -0.2637,
+        "54": -1.6601,
+        "55": -0.9327,
+        "56": 0.3113,
+        "57": -0.9448,
+        "58": 0.87,
+        "59": -0.5171,
+        "60": 0.9748,
+        "61": -0.3876,
+        "62": 0.1627,
+        "63": 0.0444,
+        "64": -0.417,
+        "65": -0.0841,
+        "66": -0.2945,
+        "67": -0.5939,
+        "68": -0.3787,
+        "69": -0.2174,
+        "70": 1.0597,
+        "71": 0.2496,
+        "72": 0.4212,
+        "73": 0.0405,
+        "74": -0.0664,
+        "75": 0.7908,
+        "76": 0.5717,
+        "77": -0.0473,
+        "78": -0.2323,
+        "79": 0.1206,
+        "80": -0.3137,
+        "81": 0.1218,
+        "82": 0.2265,
+        "83": -0.2579,
+        "84": -0.416,
+        "85": 0.6002,
+        "86": -0.4951,
+        "87": -0.1856,
+        "88": -0.9771,
+        "89": -0.6262,
+        "90": -0.5218,
+        "91": 1.1698,
+        "92": -1.221,
+        "93": 0.017,
+        "94": 1.0369,
+        "95": -0.2823,
+        "96": -0.2179,
+        "97": 0.4352,
+        "98": -0.6326,
+        "99": -1.1682
+    }
+}
diff --git a/results/word2vec_classification/fourway_label/logistic_regression/model.joblib b/results/word2vec_classification/fourway_label/logistic_regression/model.joblib
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,6 +11,8 @@ python -m app.word2vec_embeddings.pipeline @@
     # FIG_SAVE=true FIG_SHOW=true python -m app.word2vec_embeddings.pipeline
     ```
+    ### Dimensionality Reduction
     Perform dimensionality reduction on the resulting word and document embeddings, respectively:
     ```sh
@@ Expand Down @@