Skip to content

Commit

Permalink
Word2Vec Classification (#24)
Browse files Browse the repository at this point in the history
Word2Vec Embeddings, Dimensionality Reduction, and Classification
  • Loading branch information
s2t2 authored Nov 19, 2023
1 parent e52e04e commit d4ef2eb
Show file tree
Hide file tree
Showing 181 changed files with 34,562 additions and 12 deletions.
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ BUCKET_NAME="my-bucket"

## Usage

### OpenAI Embeddings
### OpenAI Service

Fetch some example embeddings from OpenAI API:

Expand All @@ -74,11 +74,18 @@ python -m app.dataset

Perform machine learning and other analyses on the data:

OpenAI Embeddings:

+ [Dimensionality Reduction](app/reduction/README.md)
+ [Clustering](app/clustering/README.md)
+ [Classification](app/classification/README.md)
+ [Reduced Classification](app/reduced_classification/README.md)

Word2Vec Embeddings:

+ [Dimensionality Reduction](app/word2vec_embeddings/README.md)
+ [Classification](app/word2vec_classification/README.md)


## Testing

Expand Down
4 changes: 2 additions & 2 deletions app/classification/logistic_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@

class LogisticRegressionPipeline(ClassificationPipeline):

def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None):
super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath)
def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None, will_upload=False):
super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath, will_upload=will_upload)

self.model = LogisticRegression(random_state=99) #multi_class="auto"
self.model_dirname = "logistic_regression"
Expand Down
11 changes: 7 additions & 4 deletions app/classification/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
class ClassificationPipeline(ABC):
"""Supports binary and multiclass classification."""

def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_folds=K_FOLDS, results_dirpath=None):
def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_folds=K_FOLDS, results_dirpath=None, will_upload=True):

self.ds = ds or Dataset()
self.x_scale = x_scale
Expand Down Expand Up @@ -69,6 +69,8 @@ def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_fo
self.k_folds = k_folds
self._results_dirpath = results_dirpath

self.will_upload = bool(will_upload)

# values set after training:
self.gs = None
self.results = None
Expand All @@ -80,6 +82,7 @@ def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_fo
self.model_dirname = None
self.param_grid = param_grid or {}


@property
def model_type(self):
return self.model.__class__.__name__
Expand All @@ -97,12 +100,12 @@ def perform(self):
self.plot_roc_curve()

# upload to cloud storage :-D
self.storage = ModelStorage(local_dirpath=self.results_dirpath)
self.storage.save_and_upload_model(self.model)
if self.will_upload:
self.storage = ModelStorage(local_dirpath=self.results_dirpath)
self.storage.save_and_upload_model(self.model)


def train_eval(self):

self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x, self.y, shuffle=True, test_size=0.2, random_state=99)
print("X TRAIN:", self.x_train.shape)
print("Y TRAIN:", self.y_train.shape)
Expand Down
4 changes: 2 additions & 2 deletions app/classification/random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

class RandomForestPipeline(ClassificationPipeline):

def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None):
super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath)
def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None, will_upload=False):
super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath, will_upload=will_upload)

self.model = RandomForestClassifier(random_state=99)
self.model_dirname = "random_forest"
Expand Down
4 changes: 2 additions & 2 deletions app/classification/xgboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@

class XGBoostPipeline(ClassificationPipeline):

def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None):
super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath)
def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None, will_upload=False):
super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath, will_upload=will_upload)

# UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release.
# To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
Expand Down
13 changes: 13 additions & 0 deletions app/word2vec_classification/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
## Word2Vec Classification

Save word2vec embeddings dataset with original user labels:

```sh
python -m app.word2vec_classification.dataset
```

Perform classification using the word2vec embeddings dataset:

```sh
FIG_SAVE=true FIG_SHOW=false python -m app.word2vec_classification.job
```
72 changes: 72 additions & 0 deletions app/word2vec_classification/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@

import os
from functools import cached_property
from pandas import read_csv

from app import DATA_DIRPATH
from app.dataset import Dataset
from app.word2vec_embeddings.pipeline import WORD2VEC_RESULTS_DIRPATH


# CSV of word2vec document vectors inside the word2vec results directory.
# Read (and merged with the user labels on "user_id") when the dataset is compiled.
# NOTE(review): presumably written by app.word2vec_embeddings.pipeline -- confirm.
WORD2VEC_EMBEDDINGS_CSV_FILEPATH = os.path.join(WORD2VEC_RESULTS_DIRPATH, "document_vectors.csv")

# Gzipped CSV holding the compiled (labels + embeddings) dataset.
# Written after the first compile and loaded directly on later runs.
WORD2VEC_DATASET_PATH = os.path.join(DATA_DIRPATH, "word2vec", "botometer_sample_word2vec_embeddings_20230825.csv.gz")

class Word2VecDataset():
    """Word2vec document embeddings joined with the original user labels.

    The merged table is cached as a CSV file, so later runs can load it
    directly instead of repeating the merge.

    Attributes exposed to callers:
        df: labels plus embedding columns, one row per merged "user_id".
        x: a copy of only the embedding (feature) columns.
        feature_cols: labels of the embedding columns.
    """

    def __init__(self, force_recompile=False, csv_filepath=None):
        """
        Params:
            force_recompile (bool): when true, rebuild the dataset from the
                results files even if a cached CSV file already exists.

            csv_filepath (str): where the compiled dataset is cached.
                Defaults to WORD2VEC_DATASET_PATH.
        """
        self.csv_filepath = csv_filepath or WORD2VEC_DATASET_PATH
        self.force_recompile = force_recompile

    @cached_property
    def df(self):
        """Load the cached dataset, or compile it and write the cache.

        Compiling merges the labels from the original Dataset with the
        word2vec document vectors, matching rows on "user_id".
        """
        if os.path.isfile(self.csv_filepath) and not self.force_recompile:
            print("LOADING EXISTING DATASET FROM FILE...")
            return read_csv(self.csv_filepath)

        print("COMPILING DATASET FROM RESULTS FILES...")
        labels_df = Dataset().labels
        embeddings_df = read_csv(WORD2VEC_EMBEDDINGS_CSV_FILEPATH)
        df = labels_df.merge(embeddings_df, on="user_id")

        # write dataset (for faster loading later);
        # make sure the target directory exists so the write cannot fail on a fresh checkout:
        dirpath = os.path.dirname(self.csv_filepath)
        if dirpath:
            os.makedirs(dirpath, exist_ok=True)
        df.to_csv(self.csv_filepath, index=False)
        return df

    @cached_property
    def x(self):
        """Feature matrix: a copy of the embedding columns of the dataset."""
        return self.df[self.feature_cols].copy()

    @property
    def feature_cols(self):
        """Labels of the columns whose name is numeric (the word2vec embedding dimensions)."""
        # str() keeps this working when the column labels are integers rather than strings:
        return [colname for colname in self.df.columns if str(colname).isnumeric()]



if __name__ == "__main__":

    # load the dataset (or compile it on first run) and preview the first rows:
    dataset = Word2VecDataset()
    preview = dataset.df.head()
    print(preview)
35 changes: 35 additions & 0 deletions app/word2vec_classification/job.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import os

from app import RESULTS_DIRPATH
from app.classification import Y_COLS, Y_COLS_BINARY, Y_COLS_MULTICLASS
from app.classification.logistic_regression import LogisticRegressionPipeline
from app.classification.random_forest import RandomForestPipeline
from app.classification.xgboost import XGBoostPipeline

from app.word2vec_embeddings.pipeline import WORD2VEC_RESULTS_DIRPATH
from app.word2vec_classification.dataset import Word2VecDataset


# all word2vec classification results are written beneath this directory:
CLASSIFICATION_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, "word2vec_classification")


if __name__ == "__main__":

    ds = Word2VecDataset()
    will_upload = True

    # (results subdirectory, pipeline class), in run order -- the slowest can go last:
    model_pipelines = (
        ("logistic_regression", LogisticRegressionPipeline),
        ("xgboost", XGBoostPipeline),
        ("random_forest", RandomForestPipeline),
    )

    # train and evaluate every model against every target column:
    for y_col in Y_COLS:
        for model_dirname, pipeline_class in model_pipelines:
            results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, model_dirname)
            pipeline = pipeline_class(ds=ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload)
            pipeline.perform()
2 changes: 2 additions & 0 deletions app/word2vec_embeddings/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ python -m app.word2vec_embeddings.pipeline
# FIG_SAVE=true FIG_SHOW=true python -m app.word2vec_embeddings.pipeline
```

### Dimensionality Reduction

Perform dimensionality reduction on the resulting word and document embeddings, respectively:

```sh
Expand Down
1 change: 1 addition & 0 deletions app/word2vec_embeddings/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ def save_document_vectors(self):

ds = Dataset()
df = ds.df
df.index = df["user_id"]

#df["tokens"] = df["tweet_texts"].apply(tokenizer)
#print(df["tokens"].head())
Expand Down
Empty file added data/word2vec/.gitkeep
Empty file.
8 changes: 7 additions & 1 deletion index.html
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,13 @@ <h1>Word2Vec Embeddings 2023</h1>
<p class="lead">How about embeddings from Word2Vec?</p>

<section>
<h3><a href="results/word2vec_embeddings/index.html">Dimensionality Reduction Results</a></h3>
<h3><a href="results/word2vec_embeddings/index.html#user-embeddings-container">Dimensionality Reduction Results</a></h3>
</section>


<section>
<h3><a href="results/word2vec_classification/index.html">Classification Results</a></h3>

</section>

<hr>
Expand Down

Large diffs are not rendered by default.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
{
"intercept": 0.9382,
"coefs": {
"0": -0.2987,
"1": 1.1251,
"2": 0.5167,
"3": 0.7698,
"4": 0.6621,
"5": -0.3478,
"6": -0.2379,
"7": 0.4313,
"8": 0.5406,
"9": -0.4077,
"10": -0.4653,
"11": -1.2287,
"12": -1.5283,
"13": 0.2226,
"14": -0.1849,
"15": -0.8318,
"16": -0.8783,
"17": 1.0968,
"18": -0.2249,
"19": 0.9544,
"20": 0.6006,
"21": -0.4967,
"22": -0.4993,
"23": -0.113,
"24": -0.2544,
"25": -0.5919,
"26": -0.095,
"27": -0.1456,
"28": 0.1785,
"29": 0.4678,
"30": 0.0852,
"31": 0.9346,
"32": -0.8233,
"33": 0.4591,
"34": -0.5005,
"35": 0.6023,
"36": -1.2788,
"37": -0.6132,
"38": 1.2333,
"39": -0.1937,
"40": 0.7407,
"41": 1.439,
"42": -0.2891,
"43": -0.4341,
"44": 0.5409,
"45": -0.4362,
"46": -0.502,
"47": 0.3304,
"48": -0.9698,
"49": -0.4308,
"50": -0.4414,
"51": 0.3116,
"52": -0.2524,
"53": -0.2637,
"54": -1.6601,
"55": -0.9327,
"56": 0.3113,
"57": -0.9448,
"58": 0.87,
"59": -0.5171,
"60": 0.9748,
"61": -0.3876,
"62": 0.1627,
"63": 0.0444,
"64": -0.417,
"65": -0.0841,
"66": -0.2945,
"67": -0.5939,
"68": -0.3787,
"69": -0.2174,
"70": 1.0597,
"71": 0.2496,
"72": 0.4212,
"73": 0.0405,
"74": -0.0664,
"75": 0.7908,
"76": 0.5717,
"77": -0.0473,
"78": -0.2323,
"79": 0.1206,
"80": -0.3137,
"81": 0.1218,
"82": 0.2265,
"83": -0.2579,
"84": -0.416,
"85": 0.6002,
"86": -0.4951,
"87": -0.1856,
"88": -0.9771,
"89": -0.6262,
"90": -0.5218,
"91": 1.1698,
"92": -1.221,
"93": 0.017,
"94": 1.0369,
"95": -0.2823,
"96": -0.2179,
"97": 0.4352,
"98": -0.6326,
"99": -1.1682
}
}
Binary file not shown.
Loading

0 comments on commit d4ef2eb

Please sign in to comment.