-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Word2Vec Embeddings, Dimensionality Reduction, and Classification
- Loading branch information
Showing
181 changed files
with
34,562 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
## Word2Vec Classification | ||
|
||
Save the word2vec embeddings dataset, merged with the original user labels: | ||
|
||
```sh | ||
python -m app.word2vec_classification.dataset | ||
``` | ||
|
||
Perform classification using the word2vec embeddings dataset: | ||
|
||
```sh | ||
FIG_SAVE=true FIG_SHOW=false python -m app.word2vec_classification.job | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
|
||
import os | ||
from functools import cached_property | ||
from pandas import read_csv | ||
|
||
from app import DATA_DIRPATH | ||
from app.dataset import Dataset | ||
from app.word2vec_embeddings.pipeline import WORD2VEC_RESULTS_DIRPATH | ||
|
||
|
||
WORD2VEC_EMBEDDINGS_CSV_FILEPATH = os.path.join(WORD2VEC_RESULTS_DIRPATH, "document_vectors.csv") | ||
|
||
WORD2VEC_DATASET_PATH = os.path.join(DATA_DIRPATH, "word2vec", "botometer_sample_word2vec_embeddings_20230825.csv.gz") | ||
|
||
class Word2VecDataset():
    """Word2vec document embeddings joined with the original user labels.

    The compiled table is cached as a CSV file at ``csv_filepath`` so that
    later runs can load it directly instead of repeating the merge.

    Args:
        force_recompile: when true, ignore any cached CSV file and rebuild
            the dataset from the labels and the embeddings results file.
    """

    def __init__(self, force_recompile=False):
        self.csv_filepath = WORD2VEC_DATASET_PATH
        self.force_recompile = force_recompile

    @cached_property
    def df(self):
        """Return the full dataset: label columns plus embedding columns.

        Loads the cached CSV file when it exists and recompilation is not
        forced. Otherwise merges ``Dataset().labels`` with the embeddings
        CSV on ``user_id``, writes the result to ``csv_filepath`` and
        returns it. The result is cached on the instance after first access.
        """
        if os.path.isfile(self.csv_filepath) and not self.force_recompile:
            print("LOADING EXISTING DATASET FROM FILE...")
            return read_csv(self.csv_filepath)

        print("COMPILING DATASET FROM RESULTS FILES...")
        labels_df = Dataset().labels
        embeddings_df = read_csv(WORD2VEC_EMBEDDINGS_CSV_FILEPATH)
        df = labels_df.merge(embeddings_df, left_on="user_id", right_on="user_id")

        # write dataset (for faster loading later). to_csv does not create
        # missing parent directories, so make sure the target one exists;
        # a bare filename has no directory component and needs no makedirs:
        dirpath = os.path.dirname(self.csv_filepath)
        if dirpath:
            os.makedirs(dirpath, exist_ok=True)
        df.to_csv(self.csv_filepath, index=False)
        return df

    @cached_property
    def x(self):
        """Return a copy of the dataset restricted to the feature columns."""
        return self.df[self.feature_cols].copy()

    @property
    def feature_cols(self):
        """Return the names of the embedding columns.

        These are the columns whose name consists only of digits, e.g.
        "0" through "99" for the word2vec embeddings.
        """
        return [colname for colname in self.df.columns if colname.isnumeric()]
|
||
|
||
#@property | ||
#def label_cols(self): | ||
# return [colname for colname in self.df.columns if not colname.isnumeric()] | ||
|
||
|
||
|
||
if __name__ == "__main__":

    # Build (or load) the dataset and show its first rows as a quick check.
    dataset = Word2VecDataset()
    preview = dataset.df.head()
    print(preview)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import os | ||
|
||
from app import RESULTS_DIRPATH | ||
from app.classification import Y_COLS, Y_COLS_BINARY, Y_COLS_MULTICLASS | ||
from app.classification.logistic_regression import LogisticRegressionPipeline | ||
from app.classification.random_forest import RandomForestPipeline | ||
from app.classification.xgboost import XGBoostPipeline | ||
|
||
from app.word2vec_embeddings.pipeline import WORD2VEC_RESULTS_DIRPATH | ||
from app.word2vec_classification.dataset import Word2VecDataset | ||
|
||
|
||
CLASSIFICATION_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, "word2vec_classification") | ||
|
||
|
||
if __name__ == "__main__":

    ds = Word2VecDataset()
    will_upload = True

    # (results sub-directory name, pipeline class) pairs, in run order;
    # the slowest can go last:
    pipeline_classes = (
        ("logistic_regression", LogisticRegressionPipeline),
        ("xgboost", XGBoostPipeline),
        ("random_forest", RandomForestPipeline),
    )

    # Train every model against every target column, one results
    # directory per (target column, model) combination.
    for y_col in Y_COLS:
        for model_dirname, pipeline_class in pipeline_classes:
            results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, model_dirname)
            pipeline = pipeline_class(ds=ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload)
            pipeline.perform()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
14 changes: 14 additions & 0 deletions
14
results/word2vec_classification/fourway_label/logistic_regression/confusion.html
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file added
BIN
+43.7 KB
results/word2vec_classification/fourway_label/logistic_regression/confusion.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
105 changes: 105 additions & 0 deletions
105
results/word2vec_classification/fourway_label/logistic_regression/explainability.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
{ | ||
"intercept": 0.9382, | ||
"coefs": { | ||
"0": -0.2987, | ||
"1": 1.1251, | ||
"2": 0.5167, | ||
"3": 0.7698, | ||
"4": 0.6621, | ||
"5": -0.3478, | ||
"6": -0.2379, | ||
"7": 0.4313, | ||
"8": 0.5406, | ||
"9": -0.4077, | ||
"10": -0.4653, | ||
"11": -1.2287, | ||
"12": -1.5283, | ||
"13": 0.2226, | ||
"14": -0.1849, | ||
"15": -0.8318, | ||
"16": -0.8783, | ||
"17": 1.0968, | ||
"18": -0.2249, | ||
"19": 0.9544, | ||
"20": 0.6006, | ||
"21": -0.4967, | ||
"22": -0.4993, | ||
"23": -0.113, | ||
"24": -0.2544, | ||
"25": -0.5919, | ||
"26": -0.095, | ||
"27": -0.1456, | ||
"28": 0.1785, | ||
"29": 0.4678, | ||
"30": 0.0852, | ||
"31": 0.9346, | ||
"32": -0.8233, | ||
"33": 0.4591, | ||
"34": -0.5005, | ||
"35": 0.6023, | ||
"36": -1.2788, | ||
"37": -0.6132, | ||
"38": 1.2333, | ||
"39": -0.1937, | ||
"40": 0.7407, | ||
"41": 1.439, | ||
"42": -0.2891, | ||
"43": -0.4341, | ||
"44": 0.5409, | ||
"45": -0.4362, | ||
"46": -0.502, | ||
"47": 0.3304, | ||
"48": -0.9698, | ||
"49": -0.4308, | ||
"50": -0.4414, | ||
"51": 0.3116, | ||
"52": -0.2524, | ||
"53": -0.2637, | ||
"54": -1.6601, | ||
"55": -0.9327, | ||
"56": 0.3113, | ||
"57": -0.9448, | ||
"58": 0.87, | ||
"59": -0.5171, | ||
"60": 0.9748, | ||
"61": -0.3876, | ||
"62": 0.1627, | ||
"63": 0.0444, | ||
"64": -0.417, | ||
"65": -0.0841, | ||
"66": -0.2945, | ||
"67": -0.5939, | ||
"68": -0.3787, | ||
"69": -0.2174, | ||
"70": 1.0597, | ||
"71": 0.2496, | ||
"72": 0.4212, | ||
"73": 0.0405, | ||
"74": -0.0664, | ||
"75": 0.7908, | ||
"76": 0.5717, | ||
"77": -0.0473, | ||
"78": -0.2323, | ||
"79": 0.1206, | ||
"80": -0.3137, | ||
"81": 0.1218, | ||
"82": 0.2265, | ||
"83": -0.2579, | ||
"84": -0.416, | ||
"85": 0.6002, | ||
"86": -0.4951, | ||
"87": -0.1856, | ||
"88": -0.9771, | ||
"89": -0.6262, | ||
"90": -0.5218, | ||
"91": 1.1698, | ||
"92": -1.221, | ||
"93": 0.017, | ||
"94": 1.0369, | ||
"95": -0.2823, | ||
"96": -0.2179, | ||
"97": 0.4352, | ||
"98": -0.6326, | ||
"99": -1.1682 | ||
} | ||
} |
Binary file added
BIN
+5.15 KB
results/word2vec_classification/fourway_label/logistic_regression/model.joblib
Binary file not shown.
Oops, something went wrong.