From 09c3c83111b7a8bd39b5e6e9ee52519155f040ca Mon Sep 17 00:00:00 2001 From: Ian Eaves Date: Mon, 28 Jun 2021 14:06:11 -0500 Subject: [PATCH 1/2] rewrite --- .gitignore | 1 + requirements.txt | 51 +----- setup.py | 4 +- sherlock/__init__.py | 5 + .../classes_retrain_minimal_sample.npy | Bin .../classes_retrained_sherlock.npy | Bin .../{deploy => data}/classes_sherlock.npy | Bin .../feature_column_identifiers/char_col.tsv | 0 .../feature_column_identifiers/par_col.tsv | 0 .../feature_column_identifiers/rest_col.tsv | 0 .../feature_column_identifiers/word_col.tsv | 0 .../par_vec_trained_400.pkl | Bin .../data}/retrain_minimal_sample_model.json | 0 {models => sherlock/data}/sherlock_model.json | 0 {models => sherlock/data}/sherlock_weights.h5 | Bin sherlock/defaults.py | 94 ++++++++++ sherlock/deploy/__init__.py | 0 sherlock/deploy/model_helpers.py | 58 ------ sherlock/deploy/predict_sherlock.py | 61 ------- sherlock/features/__init__.py | 3 + sherlock/features/bag_of_characters.py | 45 ++--- sherlock/features/bag_of_words.py | 103 ++++++----- sherlock/features/paragraph_vectors.py | 75 ++++---- sherlock/features/preprocessing.py | 172 ++++++------------ sherlock/features/word_embeddings.py | 90 ++++----- sherlock/helpers.py | 20 +- sherlock/model.py | 56 ++++++ .../{deploy/train_sherlock.py => training.py} | 47 ++--- 28 files changed, 427 insertions(+), 458 deletions(-) rename sherlock/{deploy => data}/classes_retrain_minimal_sample.npy (100%) rename sherlock/{deploy => data}/classes_retrained_sherlock.npy (100%) rename sherlock/{deploy => data}/classes_sherlock.npy (100%) rename sherlock/{features => data}/feature_column_identifiers/char_col.tsv (100%) rename sherlock/{features => data}/feature_column_identifiers/par_col.tsv (100%) rename sherlock/{features => data}/feature_column_identifiers/rest_col.tsv (100%) rename sherlock/{features => data}/feature_column_identifiers/word_col.tsv (100%) rename sherlock/{features => data}/par_vec_trained_400.pkl (100%) rename {models => sherlock/data}/retrain_minimal_sample_model.json (100%) rename {models => sherlock/data}/sherlock_model.json (100%) rename {models => sherlock/data}/sherlock_weights.h5 (100%) create mode 100644 sherlock/defaults.py delete mode 100644 sherlock/deploy/__init__.py delete mode 100644 sherlock/deploy/model_helpers.py delete mode 100644 sherlock/deploy/predict_sherlock.py create mode 100644 sherlock/model.py rename sherlock/{deploy/train_sherlock.py => training.py} (76%) diff --git a/.gitignore b/.gitignore index 266781b..089e942 100644 --- a/.gitignore +++ b/.gitignore @@ -74,6 +74,7 @@ target/ # Jupyter Notebook .ipynb_checkpoints +*.ipynb # pyenv .python-version diff --git a/requirements.txt b/requirements.txt index a64ddeb..b3a7354 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,43 +1,10 @@ -absl-py==0.7.1 -astor==0.8.0 -boto==2.49.0 -boto3==1.9.188 -botocore==1.12.188 -certifi==2019.6.16 -chardet==3.0.4 -docutils==0.14 -gast==0.2.2 -gensim==3.8.0 -google-pasta==0.1.7 +gensim>=3.8.0 googledrivedownloader==0.4 -grpcio==1.22.0 -h5py==2.9.0 -idna==2.8 -jmespath==0.9.4 -joblib==0.13.2 -Keras-Applications==1.0.8 -Keras-Preprocessing==1.1.0 -Markdown==3.1.1 -nltk==3.4.4 -matplotlib==2.2.5 -numpy==1.16.1 -pandas==0.24.2 -pre-commit==2.8.2 -protobuf==3.9.0 -pyarrow==2.0.0 -python-dateutil==2.8.0 -pytz==2019.1 -requests==2.22.0 -s3transfer==0.2.1 -scikit-learn==0.20 -six==1.12.0 -smart-open==1.8.4 -tensorboard==1.14.0 -tensorflow==1.14.0 -tensorflow-estimator==1.14.0 -termcolor==1.1.0 -tqdm==4.51.0 -urllib3==1.25.3 
-Werkzeug==0.15.4
-wrapt==1.11.2
--e .
+joblib>=0.13.2
+nltk>=3.4.4
+numpy~=1.21.0
+pandas~=1.2.5
+pre-commit~=2.8.2
+scikit-learn~=0.24.2
+tensorflow>=1.14.0
+scipy~=1.7.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 1ab3e39..9c07312 100644
--- a/setup.py
+++ b/setup.py
@@ -8,5 +8,5 @@
     description="Package for semantic type detection using Sherlock",
     url="https://github.com/mitmedialab/sherlock-project",
     packages=setuptools.find_packages(),
-    package_dir={"sherlock": "sherlock"}
-)
\ No newline at end of file
+    package_dir={"sherlock": "sherlock"},
+)
diff --git a/sherlock/__init__.py b/sherlock/__init__.py
index e69de29..86f08af 100644
--- a/sherlock/__init__.py
+++ b/sherlock/__init__.py
@@ -0,0 +1,5 @@
+from sherlock.helpers import *
+from sherlock.defaults import *
+from sherlock.features import *
+from sherlock.model import *
+from sherlock.training import *
diff --git a/sherlock/deploy/classes_retrain_minimal_sample.npy b/sherlock/data/classes_retrain_minimal_sample.npy
similarity index 100%
rename from sherlock/deploy/classes_retrain_minimal_sample.npy
rename to sherlock/data/classes_retrain_minimal_sample.npy
diff --git a/sherlock/deploy/classes_retrained_sherlock.npy b/sherlock/data/classes_retrained_sherlock.npy
similarity index 100%
rename from sherlock/deploy/classes_retrained_sherlock.npy
rename to sherlock/data/classes_retrained_sherlock.npy
diff --git a/sherlock/deploy/classes_sherlock.npy b/sherlock/data/classes_sherlock.npy
similarity index 100%
rename from sherlock/deploy/classes_sherlock.npy
rename to sherlock/data/classes_sherlock.npy
diff --git a/sherlock/features/feature_column_identifiers/char_col.tsv b/sherlock/data/feature_column_identifiers/char_col.tsv
similarity index 100%
rename from sherlock/features/feature_column_identifiers/char_col.tsv
rename to sherlock/data/feature_column_identifiers/char_col.tsv
diff --git a/sherlock/features/feature_column_identifiers/par_col.tsv b/sherlock/data/feature_column_identifiers/par_col.tsv
similarity index 100%
rename from sherlock/features/feature_column_identifiers/par_col.tsv
rename to sherlock/data/feature_column_identifiers/par_col.tsv
diff --git a/sherlock/features/feature_column_identifiers/rest_col.tsv b/sherlock/data/feature_column_identifiers/rest_col.tsv
similarity index 100%
rename from sherlock/features/feature_column_identifiers/rest_col.tsv
rename to sherlock/data/feature_column_identifiers/rest_col.tsv
diff --git a/sherlock/features/feature_column_identifiers/word_col.tsv b/sherlock/data/feature_column_identifiers/word_col.tsv
similarity index 100%
rename from sherlock/features/feature_column_identifiers/word_col.tsv
rename to sherlock/data/feature_column_identifiers/word_col.tsv
diff --git a/sherlock/features/par_vec_trained_400.pkl b/sherlock/data/par_vec_trained_400.pkl
similarity index 100%
rename from sherlock/features/par_vec_trained_400.pkl
rename to sherlock/data/par_vec_trained_400.pkl
diff --git a/models/retrain_minimal_sample_model.json b/sherlock/data/retrain_minimal_sample_model.json
similarity index 100%
rename from models/retrain_minimal_sample_model.json
rename to sherlock/data/retrain_minimal_sample_model.json
diff --git a/models/sherlock_model.json b/sherlock/data/sherlock_model.json
similarity index 100%
rename from models/sherlock_model.json
rename to sherlock/data/sherlock_model.json
diff --git a/models/sherlock_weights.h5 b/sherlock/data/sherlock_weights.h5
similarity index 100%
rename from models/sherlock_weights.h5
rename to sherlock/data/sherlock_weights.h5
diff 
--git a/sherlock/defaults.py b/sherlock/defaults.py new file mode 100644 index 0000000..cb2714b --- /dev/null +++ b/sherlock/defaults.py @@ -0,0 +1,94 @@ +import os +from collections import OrderedDict +from itertools import chain +from typing import Optional + +import numpy as np +import pandas as pd +import tensorflow as tf +from sklearn.preprocessing import LabelEncoder +from tensorflow.keras.callbacks import EarlyStopping +from tensorflow.keras.models import model_from_json + +from sherlock import make_data_path + +DEFAULT_FEATURE_ORDER = ["char", "word", "par", "rest"] + + +def default_features() -> OrderedDict: + """Get feature identifiers per feature set, to map features to feature sets. + + Returns + ------- + feature_cols_dict + Dictionary with lists of feature identifiers per feature set. + """ + feature_cols_dict = OrderedDict() + feature_path = make_data_path("feature_column_identifiers") + + for feature_set in DEFAULT_FEATURE_ORDER: + feature_file = os.path.join(feature_path, f"{feature_set}_col.tsv") + feature_data = pd.read_csv( + feature_file, sep="\t", index_col=0, header=None, squeeze=True + ) + feature_cols_dict[feature_set] = feature_data.to_list() + return feature_cols_dict + + +def default_encoder(): + encoder = LabelEncoder() + class_file_path = make_data_path("classes_sherlock.npy") + encoder.classes_ = np.load(class_file_path, allow_pickle=True) + return encoder + + +def construct_model( + model_path: Optional[str] = None, + weight_path: Optional[str] = None, + with_weights: bool = True, +): + """Load model architecture and populate with pretrained weights. + + Parameters + ---------- + model_path + Location of model file + weight_path + Location of weight file + with_weights + Whether to populate the model with trained weights. + + Returns + ------- + model + Compiled model. + callbacks + Callback configuration for model retraining. + """ + if model_path is None: + model_path = make_data_path("sherlock_model.json") + + if weight_path is None: + weight_path = make_data_path("sherlock_weights.h5") + + with open(model_path, "r") as model_file: + model = model_from_json(model_file.read()) + + if with_weights: + model.load_weights(weight_path) + + learning_rate = 0.0001 + callbacks = [EarlyStopping(monitor="val_loss", patience=5)] + model.compile( + optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), + loss="categorical_crossentropy", + metrics=["categorical_accuracy"], + ) + + return model, callbacks + + +DEFAULT_FEATURES_DICT: OrderedDict = default_features() +DEFAULT_FEATURES = list(chain(*[cols for cols in DEFAULT_FEATURES_DICT.values()])) +DEFAULT_ENCODER = default_encoder() +DEFAULT_MODEL, DEFAULT_CALLBACKS = construct_model() diff --git a/sherlock/deploy/__init__.py b/sherlock/deploy/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/sherlock/deploy/model_helpers.py b/sherlock/deploy/model_helpers.py deleted file mode 100644 index 50b2da4..0000000 --- a/sherlock/deploy/model_helpers.py +++ /dev/null @@ -1,58 +0,0 @@ -import pandas as pd -import tensorflow as tf -from tensorflow.keras.callbacks import EarlyStopping -from tensorflow.keras.models import model_from_json - - -def categorize_features() -> dict: - """Get feature identifiers per feature set, to map features to feature sets. - - Returns - ------- - feature_cols_dict - Dictionary with lists of feature identifiers per feature set. 
- """ - feature_cols_dict = {} - for feature_set in ['char', 'word', 'par', 'rest']: - feature_cols_dict[feature_set] = pd.read_csv( - f"../sherlock/features/feature_column_identifiers/{feature_set}_col.tsv", - sep='\t', index_col=0, header=None, squeeze=True, - ).to_list() - return feature_cols_dict - - -def construct_sherlock_model(nn_id: str, with_weights: bool): - """Load model architecture and populate with pretrained weights. - - Parameters - ---------- - nn_id - Identifier for retrained model. - with_weights - Whether to populate the model with trained weights. - - Returns - ------- - sherlock_model - Compiled sherlock model. - callbacks - Callback configuration for model retraining. - """ - - lr = 0.0001 - callbacks = [EarlyStopping(monitor="val_loss", patience=5)] - - file = open(f"../models/sherlock_model.json", "r") - sherlock_model = model_from_json(file.read()) - file.close() - - if with_weights: - sherlock_model.load_weights(f"../models/{nn_id}_weights.h5") - - sherlock_model.compile( - optimizer=tf.keras.optimizers.Adam(lr=lr), - loss='categorical_crossentropy', - metrics=['categorical_accuracy'] - ) - - return sherlock_model, callbacks diff --git a/sherlock/deploy/predict_sherlock.py b/sherlock/deploy/predict_sherlock.py deleted file mode 100644 index 031dbd0..0000000 --- a/sherlock/deploy/predict_sherlock.py +++ /dev/null @@ -1,61 +0,0 @@ -import numpy as np -import pandas as pd -import tensorflow as tf - -from sklearn.preprocessing import LabelEncoder - -from sherlock.deploy import model_helpers - - -def _transform_predictions_to_classes(y_pred, nn_id) -> np.array: - """Get predicted semantic types from prediction vectors. - - Parameters - ---------- - y_pred - Nested vector with for each sample a vector of likelihoods per semantic type. - nn_id - Identifier of model to use. - - Returns - ------- - y_pred - Predicted semantic labels. - """ - y_pred_int = np.argmax(y_pred, axis=1) - encoder = LabelEncoder() - encoder.classes_ = np.load( - f"../sherlock/deploy/classes_{nn_id}.npy", - allow_pickle=True - ) - y_pred = encoder.inverse_transform(y_pred_int) - - return y_pred - - -def predict_sherlock(X: pd.DataFrame, nn_id: str) -> np.array: - """Use sherlock model to generate predictions for X. - - Parameters - ---------- - X - Featurized data set to generate predictions for. - nn_id - Identifier of a trained model to use for generating predictions. - - Returns - ------- - Array with predictions for X. 
- """ - sherlock_model, _ = model_helpers.construct_sherlock_model(nn_id, with_weights=True) - feature_cols_dict = model_helpers.categorize_features() - y_pred = sherlock_model.predict( - [ - X[feature_cols_dict['char']].values, - X[feature_cols_dict['word']].values, - X[feature_cols_dict['par']].values, - X[feature_cols_dict['rest']].values - ] - ) - - return _transform_predictions_to_classes(y_pred, nn_id) diff --git a/sherlock/features/__init__.py b/sherlock/features/__init__.py index e69de29..4082c48 100644 --- a/sherlock/features/__init__.py +++ b/sherlock/features/__init__.py @@ -0,0 +1,3 @@ +from sherlock.features import (bag_of_characters, bag_of_words, + paragraph_vectors, preprocessing, + word_embeddings) diff --git a/sherlock/features/bag_of_characters.py b/sherlock/features/bag_of_characters.py index 9fc3793..3365df1 100644 --- a/sherlock/features/bag_of_characters.py +++ b/sherlock/features/bag_of_characters.py @@ -1,41 +1,34 @@ import string -import numpy as np -from scipy.stats import skew, kurtosis from collections import OrderedDict +import numpy as np +from scipy.stats import kurtosis, skew + +ignore_chars = {"\n", "\\", "\v", "\r", "\t", "^"} +characters_to_check = [f"[{c}]" for c in string.printable if c not in ignore_chars] +characters_to_check.extend(["[\\\\]", "[\^]"]) + # Input: a single column in the form of pandas series # Output: ordered dictionary holding bag of character features def extract_bag_of_characters_features(data): - - characters_to_check = ( - ['['+ c + ']' for c in string.printable if c not in ('\n', '\\', '\v', '\r', '\t', '^')] - + ['[\\\\]', '[\^]'] - ) - f = OrderedDict() - data_no_null = data.dropna() all_value_features = OrderedDict() for c in characters_to_check: - all_value_features['n_{}'.format(c)] = data_no_null.str.count(c) - + all_value_features[f"n_{c}"] = data.str.count(c) + for value_feature_name, value_features in all_value_features.items(): - f['{}-agg-any'.format(value_feature_name)] = any(value_features) - f['{}-agg-all'.format(value_feature_name)] = all(value_features) - f['{}-agg-mean'.format(value_feature_name)] = np.mean(value_features) - f['{}-agg-var'.format(value_feature_name)] = np.var(value_features) - f['{}-agg-min'.format(value_feature_name)] = np.min(value_features) - f['{}-agg-max'.format(value_feature_name)] = np.max(value_features) - f['{}-agg-median'.format(value_feature_name)] = np.median(value_features) - f['{}-agg-sum'.format(value_feature_name)] = np.sum(value_features) - f['{}-agg-kurtosis'.format(value_feature_name)] = kurtosis(value_features) - f['{}-agg-skewness'.format(value_feature_name)] = skew(value_features) + f[f"{value_feature_name}-agg-any"] = any(value_features) + f[f"{value_feature_name}-agg-all"] = all(value_features) + f[f"{value_feature_name}-agg-mean"] = np.mean(value_features) + f[f"{value_feature_name}-agg-var"] = np.var(value_features) + f[f"{value_feature_name}-agg-min"] = np.min(value_features) + f[f"{value_feature_name}-agg-max"] = np.max(value_features) + f[f"{value_feature_name}-agg-median"] = np.median(value_features) + f[f"{value_feature_name}-agg-sum"] = np.sum(value_features) + f[f"{value_feature_name}-agg-kurtosis"] = kurtosis(value_features) + f[f"{value_feature_name}-agg-skewness"] = skew(value_features) return f - - - - - diff --git a/sherlock/features/bag_of_words.py b/sherlock/features/bag_of_words.py index 53d961f..ce6b7c3 100644 --- a/sherlock/features/bag_of_words.py +++ b/sherlock/features/bag_of_words.py @@ -1,80 +1,79 @@ import math +from collections import OrderedDict 
+ import nltk import numpy as np -from scipy.stats import skew, kurtosis -from collections import OrderedDict +from scipy.stats import kurtosis, skew # Input: a single column in the form of a pandas series # Output: ordered dictionary holding bag of words features -def extract_bag_of_words_features(data, n_val): - +def extract_bag_of_words_features(data): + f = OrderedDict() - data = data.dropna() - - #n_val = data.size - - if not n_val: return - + n_val = data.shape[0] + # Entropy of column freq_dist = nltk.FreqDist(data) - probs = [freq_dist.freq(l) for l in freq_dist] - f['col_entropy'] = -sum(p * math.log(p,2) for p in probs) + probs = np.array([freq_dist.freq(item) for item in freq_dist]) + f["col_entropy"] = (probs * np.log2(probs)).sum() # Fraction of cells with unique content - num_unique = data.nunique() - f['frac_unique'] = num_unique / n_val + num_unique = len(freq_dist) + f["frac_unique"] = num_unique / n_val # Fraction of cells with numeric content -> frac text cells doesn't add information - num_cells = np.sum(data.str.contains('[0-9]', regex=True)) - text_cells = np.sum(data.str.contains('[a-z]|[A-Z]', regex=True)) - f['frac_numcells'] = num_cells / n_val - f['frac_textcells'] = text_cells / n_val - + num_cells = data.str.count("[0-9]") + text_cells = data.str.count("[a-z]|[A-Z]") + f["frac_numcells"] = num_cells[num_cells > 0].shape[0] / n_val + f["frac_textcells"] = text_cells[text_cells > 0].shape[0] / n_val + # Average + std number of numeric tokens in cells - num_reg = '[0-9]' - f['avg_num_cells'] = np.mean(data.str.count(num_reg)) - f['std_num_cells'] = np.std(data.str.count(num_reg)) - + num_agg = num_cells.agg(["mean", "std"]).to_dict() + f["avg_num_cells"] = num_agg["mean"] + f["std_num_cells"] = num_agg["std"] + # Average + std number of textual tokens in cells - text_reg = '[a-z]|[A-Z]' - f['avg_text_cells'] = np.mean(data.str.count(text_reg)) - f['std_text_cells'] = np.std(data.str.count(text_reg)) - + text_agg = text_cells.agg(["mean", "std"]).to_dict() + f["avg_text_cells"] = text_agg["mean"] + f["std_text_cells"] = text_agg["std"] + # Average + std number of special characters in each cell spec_reg = '[[!@#$%^&*(),.?":{}|<>]]' - f['avg_spec_cells'] = np.mean(data.str.count(spec_reg)) - f['std_spec_cells'] = np.std(data.str.count(spec_reg)) - + spec_agg = data.str.count(spec_reg).agg(["mean", "std"]).to_dict() + f["avg_spec_cells"] = spec_agg["mean"] + f["std_spec_cells"] = spec_agg["std"] + # Average number of words in each cell space_reg = '[" "]' - f['avg_word_cells'] = np.mean(data.str.count(space_reg) + 1) - f['std_word_cells'] = np.std(data.str.count(space_reg) + 1) + word_agg = (data.str.count(space_reg) + 1).agg(["mean", "std"]).to_dict() + f["avg_word_cells"] = word_agg["mean"] + f["std_word_cells"] = word_agg["std"] all_value_features = OrderedDict() - data_no_null = data.dropna() + data_no_null = data.dropna() if data.hasnans else data - f['n_values'] = n_val + f["n_values"] = n_val - all_value_features['length'] = data_no_null.apply(len) + all_value_features["length"] = data_no_null.apply(len) for value_feature_name, value_features in all_value_features.items(): - f['{}-agg-any'.format(value_feature_name)] = any(value_features) - f['{}-agg-all'.format(value_feature_name)] = all(value_features) - f['{}-agg-mean'.format(value_feature_name)] = np.mean(value_features) - f['{}-agg-var'.format(value_feature_name)] = np.var(value_features) - f['{}-agg-min'.format(value_feature_name)] = np.min(value_features) - f['{}-agg-max'.format(value_feature_name)] = 
np.max(value_features) - f['{}-agg-median'.format(value_feature_name)] = np.median(value_features) - f['{}-agg-sum'.format(value_feature_name)] = np.sum(value_features) - f['{}-agg-kurtosis'.format(value_feature_name)] = kurtosis(value_features) - f['{}-agg-skewness'.format(value_feature_name)] = skew(value_features) - - n_none = data.size - data_no_null.size - len([ e for e in data if e == '']) - f['none-agg-has'] = n_none > 0 - f['none-agg-percent'] = n_none / len(data) - f['none-agg-num'] = n_none - f['none-agg-all'] = (n_none == len(data)) - + f["{}-agg-any".format(value_feature_name)] = any(value_features) + f["{}-agg-all".format(value_feature_name)] = all(value_features) + f["{}-agg-mean".format(value_feature_name)] = np.mean(value_features) + f["{}-agg-var".format(value_feature_name)] = np.var(value_features) + f["{}-agg-min".format(value_feature_name)] = np.min(value_features) + f["{}-agg-max".format(value_feature_name)] = np.max(value_features) + f["{}-agg-median".format(value_feature_name)] = np.median(value_features) + f["{}-agg-sum".format(value_feature_name)] = np.sum(value_features) + f["{}-agg-kurtosis".format(value_feature_name)] = kurtosis(value_features) + f["{}-agg-skewness".format(value_feature_name)] = skew(value_features) + + n_none = data.size - data_no_null.size - (data == "").sum() + f["none-agg-has"] = n_none > 0 + f["none-agg-percent"] = n_none / len(data) + f["none-agg-num"] = n_none + f["none-agg-all"] = n_none == len(data) + return f diff --git a/sherlock/features/paragraph_vectors.py b/sherlock/features/paragraph_vectors.py index 551042d..4affbab 100644 --- a/sherlock/features/paragraph_vectors.py +++ b/sherlock/features/paragraph_vectors.py @@ -1,19 +1,34 @@ -import pandas as pd import random +from collections import OrderedDict +from functools import cache +from typing import Union +import numpy as np +import pandas as pd from gensim.models.doc2vec import Doc2Vec, TaggedDocument +from sherlock import make_data_path + + +@cache +def get_paragraph_vector_model(): + par_vec_file = make_data_path(f"par_vec_trained_{400}.pkl") + paragraph_vector_model = Doc2Vec.load(par_vec_file) + return paragraph_vector_model + # Input: a collection of columns stored in a dataframe column 'values' # Output: tagged columns. # Only needed for training. def tagcol_paragraph_embeddings_features(train_data): - # Expects a dataframe with a 'values' column - train_data_values = train_data['values'] + train_data_values = train_data["values"] random.seed(13) - columns = [TaggedDocument(random.sample(col, min(1000, len(col))), [i]) for i, col in enumerate(train_data_values.values)] - + columns = [ + TaggedDocument(random.sample(col, min(1000, len(col))), [i]) + for i, col in enumerate(train_data_values.values) + ] + return columns @@ -21,39 +36,37 @@ def tagcol_paragraph_embeddings_features(train_data): # Output: a stored retrained model # Only needed for training. 
def train_paragraph_embeddings_features(columns, dim): - # Train Doc2Vec model - model = Doc2Vec(columns, dm=0, negative=3, workers=8, vector_size=dim, epochs=20, min_count=2, seed=13) + model = Doc2Vec( + columns, + dm=0, + negative=3, + workers=8, + vector_size=dim, + epochs=20, + min_count=2, + seed=13, + ) # Save trained model - model_file = '../sherlock/features/par_vec_retrained_{}.pkl'.format(dim) + model_file = make_data_path(f"par_vec_retrained_{dim}.pkl") model.save(model_file) model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True) - + # Input: a single column in the form of a pandas Series. # Output: ordered dictionary holding paragraph vector features -def infer_paragraph_embeddings_features(data, dim): - - # Load pretrained paragraph vector model - model = Doc2Vec.load('../sherlock/features/par_vec_trained_{}.pkl'.format(dim)) - - f = pd.DataFrame() - - if len(data) > 1000: - random.seed(13) - vec = random.sample(data, 1000) - else: - vec = data - - # Infer paragraph vector for data sample - f = f.append(pd.Series(model.infer_vector(vec, steps=20, - alpha=0.025)), ignore_index=True) +def infer_paragraph_embeddings_features( + data: Union[np.array, pd.Series, list] +) -> OrderedDict: + # pandas and numpy + if not isinstance(data, list): + data = data.tolist() - col_names = [] - for i, col in enumerate(f): - col_names.append('par_vec_{}'.format(i)) - - f.columns = col_names + model = get_paragraph_vector_model() + embedding = model.infer_vector(data, steps=20, alpha=0.025) - return f + res = OrderedDict() + for i, v in enumerate(embedding): + res[f"par_vec_{i}"] = v + return res diff --git a/sherlock/features/preprocessing.py b/sherlock/features/preprocessing.py index 90aa190..25eea84 100644 --- a/sherlock/features/preprocessing.py +++ b/sherlock/features/preprocessing.py @@ -1,39 +1,38 @@ +import os +import random from ast import literal_eval from collections import OrderedDict -import random -import os -from typing import Union +from functools import cache +from typing import Optional, Union -from google_drive_downloader import GoogleDriveDownloader as gd import numpy as np import pandas as pd -from tqdm import tqdm +from google_drive_downloader import GoogleDriveDownloader as gd -from sherlock.features.bag_of_characters import extract_bag_of_characters_features +from sherlock import make_data_path +from sherlock.features.bag_of_characters import \ + extract_bag_of_characters_features from sherlock.features.bag_of_words import extract_bag_of_words_features +from sherlock.features.paragraph_vectors import \ + infer_paragraph_embeddings_features from sherlock.features.word_embeddings import extract_word_embeddings_features -from sherlock.features.paragraph_vectors import infer_paragraph_embeddings_features def prepare_feature_extraction(): """Download embedding files from Google Drive if they do not exist yet.""" - word_embedding_file = '../sherlock/features/glove.6B.50d.txt' - paragraph_vector_file = '../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy' - - print( - f"""Preparing feature extraction by downloading 2 files: - \n {word_embedding_file} and \n {paragraph_vector_file}. 
- """ + word_embedding_file = make_data_path("glove.6B.50d.txt") + paragraph_vector_file = make_data_path( + "par_vec_trained_400.pkl.docvecs.vectors_docs.npy" ) if not os.path.exists(word_embedding_file): - print('Downloading GloVe word embedding vectors.') + print("Downloading GloVe word embedding vectors.") file_name = word_embedding_file gd.download_file_from_google_drive( - file_id='1kayd5oNRQm8-NCvA8pIrtezbQ-B1_Vmk', + file_id="1kayd5oNRQm8-NCvA8pIrtezbQ-B1_Vmk", dest_path=file_name, unzip=False, - showsize=True + showsize=True, ) print("GloVe word embedding vectors were downloaded.") @@ -42,125 +41,68 @@ def prepare_feature_extraction(): print("Downloading pretrained paragraph vectors.") file_name = paragraph_vector_file gd.download_file_from_google_drive( - file_id='1vdyGJ4aB71FCaNqJKYX387eVufcH4SAu', + file_id="1vdyGJ4aB71FCaNqJKYX387eVufcH4SAu", dest_path=file_name, unzip=False, - showsize=True + showsize=True, ) - + print("Trained paragraph vector model was downloaded.") - - print("All files for extracting word and paragraph embeddings are present.") - -def prepare_word_embeddings(): - word_vectors_f = open('../sherlock/features/glove.6B.50d.txt', encoding='utf-8') + +@cache +def prepare_word_embeddings(): + word_vectors_f = open(make_data_path("glove.6B.50d.txt"), encoding="utf-8") word_to_embedding = {} for w in word_vectors_f: - - term, vector = w.strip().split(' ', 1) - vector = np.array(vector.split(' '), dtype=float) + term, vector = w.strip().split(" ", 1) + vector = np.array(vector.split(" "), dtype=float) word_to_embedding[term] = vector return word_to_embedding - -def convert_string_lists_to_lists( - data: Union[pd.DataFrame, pd.Series], - labels: Union[pd.DataFrame, pd.Series], - data_column_name: str = None, - labels_column_name: str = None, -) -> pd.Series: - """Convert strings of arrays with values to arrays of strings of values. - Each row in de dataframe or series corresponds to a column, represented by a string of a list. - Each string-list will be converted to a list with string values. - - Parameters - ---------- - data - Data to convert column from. - labels - Labels of each row corresponding to semantic column type. - data_column_name - Name of column of the data to convert. - labels_column_name - Name of column with the labels to convert. - - Returns - ------- - converted_data - Series with all rows a list of string values. - converted_labels - List with labels. 
- """ - tqdm.pandas() - - if isinstance(data, pd.DataFrame): - if data_column_name is None: raise ValueError("Missing column name of data.") - converted_data = data[data_column_name].progress_apply(literal_eval) - elif isinstance(data, pd.Series): - converted_data = data.progress_apply(literal_eval) - else: - raise TypeError("Unexpected data type of samples.") - - if isinstance(labels, pd.DataFrame): - if labels_column_name is None: raise ValueError("Missing column name of labels.") - converted_labels = labels[labels_column_name].to_list() - elif isinstance(labels, pd.Series): - converted_labels = labels.to_list() - else: - raise TypeError("Unexpected data type of labels.") - - return converted_data, converted_labels - - -def extract_features(data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame: + +def extract_feature(data: pd.Series, n_samples=None): + word_to_embedding = prepare_word_embeddings() + + if n_samples: + random.seed(13) + n_samples = min(data.shape[0], n_samples) + data = pd.Series(random.choices(data, k=n_samples)) + + not_na_idx = data.notna() if data.hasnans else np.array([True] * data.shape[0]) + data[not_na_idx] = data[not_na_idx].astype(str) + + f = ( + extract_bag_of_characters_features(data[not_na_idx]) + | extract_word_embeddings_features(data[not_na_idx], word_to_embedding) + | extract_bag_of_words_features(data) + | infer_paragraph_embeddings_features(data[not_na_idx]) + ) + + return f + + +def extract_features( + data: Union[pd.DataFrame, pd.Series], n_samples: Optional[int] = None +) -> pd.DataFrame: """Extract features from raw data. - + Parameters ---------- data A pandas DataFrame or Series with each row a list of string values. - + n_samples + An optional integer indicating the number of samples to use for feature extraction + Returns ------- DataFrame with featurized column samples. 
""" prepare_feature_extraction() - word_to_embedding = prepare_word_embeddings() - - features_list = [] - df_par = pd.DataFrame() - n_samples = 1000 - vec_dim = 400 - i = 0 - for raw_sample in data: - - i = i + 1 - if i % 100 == 0: - print(f"Extracting features for data column: {i}") - - n_values = len(raw_sample) - - if n_samples > n_values: - n_samples = n_values - - random.seed(13) - raw_sample = pd.Series(random.choices(raw_sample, k=n_samples)).astype(str) - - f = OrderedDict( - list(extract_bag_of_characters_features(raw_sample).items()) + - list(extract_word_embeddings_features(raw_sample, word_to_embedding).items()) + - list(extract_bag_of_words_features(raw_sample, n_values).items()) - ) - features_list.append(f) + if isinstance(data, pd.Series): + return pd.DataFrame([extract_feature(data, n_samples)]) - df_par = df_par.append(infer_paragraph_embeddings_features(raw_sample, vec_dim)) - - return pd.concat( - [pd.DataFrame(features_list).reset_index(drop=True), df_par.reset_index(drop=True)], - axis=1, - sort=False - ) + return pd.DataFrame([extract_feature(data[col], n_samples) for col in data.columns]) diff --git a/sherlock/features/word_embeddings.py b/sherlock/features/word_embeddings.py index 35eb0f8..a3c2d75 100644 --- a/sherlock/features/word_embeddings.py +++ b/sherlock/features/word_embeddings.py @@ -1,57 +1,65 @@ -import numpy as np - -from scipy import stats from collections import OrderedDict +import numpy as np +import pandas as pd +from scipy import stats -# Input: a single column in the form of a pandas series -# Output: ordered dictionary holding word embedding features -def extract_word_embeddings_features(values, word_to_embedding): - - num_embeddings = 50 - f = OrderedDict() - embeddings = [] +NUM_EMBEDDINGS = 50 - values = values.dropna() - for v in values: +def make_default_response(): + default_response = OrderedDict() + default_response.setdefault("word_embedding_feature", 0) + for i in range(NUM_EMBEDDINGS): + default_response.setdefault("word_embedding_avg_{}".format(i), np.nan) + for i in range(NUM_EMBEDDINGS): + default_response.setdefault("word_embedding_std_{}".format(i), np.nan) + for i in range(NUM_EMBEDDINGS): + default_response.setdefault("word_embedding_med_{}".format(i), np.nan) + for i in range(NUM_EMBEDDINGS): + default_response.setdefault("word_embedding_mode_{}".format(i), np.nan) + return default_response - v = str(v).lower() - if v in word_to_embedding: - embeddings.append(word_to_embedding.get(v)) +def embedding_getter(word_to_embedding): + def inner(value: str): + if value in word_to_embedding: + embedding = word_to_embedding.get(value) else: - words = v.split(' ') - embeddings_to_all_words = [] + embeddings = [ + word_to_embedding.get(v) + for v in value.split(" ") + if v in word_to_embedding + ] + embedding = np.mean(embeddings, axis=0) + return embedding - for w in words: - if w in word_to_embedding: - embeddings_to_all_words.append(word_to_embedding.get(w)) - if embeddings_to_all_words: - mean_of_word_embeddings = np.nanmean(embeddings_to_all_words, axis=0) - embeddings.append(mean_of_word_embeddings) + return inner - if len(embeddings) == 0: - for i in range(num_embeddings): f['word_embedding_avg_{}'.format(i)] = np.nan - for i in range(num_embeddings): f['word_embedding_std_{}'.format(i)] = np.nan - for i in range(num_embeddings): f['word_embedding_med_{}'.format(i)] = np.nan - for i in range(num_embeddings): f['word_embedding_mode_{}'.format(i)] = np.nan - f['word_embedding_feature'] = 0 +# Input: a single column in the form of a 
pandas series +# Output: ordered dictionary holding word embedding features +def extract_word_embeddings_features( + values: pd.Series, word_to_embedding: dict +) -> OrderedDict: + f = make_default_response() - return f + get_embedding = embedding_getter(word_to_embedding) + embeddings = [get_embedding(val) for val in values.str.lower()] - else: - mean_embeddings = np.nanmean(embeddings, axis=0) - med_embeddings = np.nanmedian(embeddings, axis=0) - std_embeddings = np.nanstd(embeddings, axis=0) - mode_embeddings = stats.mode(embeddings, axis=0, nan_policy='omit')[0].flatten() + if len(embeddings) == 0: + return f - for i, e in enumerate(mean_embeddings): f['word_embedding_avg_{}'.format(i)] = e - for i, e in enumerate(std_embeddings): f['word_embedding_std_{}'.format(i)] = e - for i, e in enumerate(med_embeddings): f['word_embedding_med_{}'.format(i)] = e - for i, e in enumerate(mode_embeddings): f['word_embedding_mode_{}'.format(i)] = e + f["word_embedding_feature"] = 1 + mean_embeddings = np.nanmean(embeddings, axis=0) + med_embeddings = np.nanmedian(embeddings, axis=0) + std_embeddings = np.nanstd(embeddings, axis=0) + mode_embeddings = stats.mode(embeddings, axis=0, nan_policy="omit")[0].flatten() - f['word_embedding_feature'] = 1 + for i in range(NUM_EMBEDDINGS): + f[f"word_embedding_avg_{i}"] = mean_embeddings[i] + f[f"word_embedding_std_{i}"] = std_embeddings[i] + f[f"word_embedding_med_{i}"] = med_embeddings[i] + f[f"word_embedding_mode_{i}"] = mode_embeddings[i] - return f + return f diff --git a/sherlock/helpers.py b/sherlock/helpers.py index fe6b6c5..ea2013a 100644 --- a/sherlock/helpers.py +++ b/sherlock/helpers.py @@ -1,23 +1,29 @@ import os +import pkg_resources from google_drive_downloader import GoogleDriveDownloader as gd +DATA_PATH = pkg_resources.resource_filename("sherlock", "data/") + + +def make_data_path(path): + return os.path.join(DATA_PATH, path) + def download_data(): """Download raw and preprocessed data files. The data is downloaded from Google Drive and stored in the 'data/' directory. 
""" - data_dir = '../data/data.zip' + data_dir = make_data_path("data.zip") print(f"Downloading the raw and preprocessed data into {data_dir}.") if not os.path.exists(data_dir): - print('Downloading data directory.') - dir_name = data_dir + print("Downloading data directory.") gd.download_file_from_google_drive( - file_id='1-g0zbKFAXz7zKZc0Dnh74uDBpZCv4YqU', - dest_path=dir_name, + file_id="1-g0zbKFAXz7zKZc0Dnh74uDBpZCv4YqU", + dest_path=data_dir, unzip=True, - showsize=True + showsize=True, ) - print('Data was downloaded.') \ No newline at end of file + print("Data was downloaded.") diff --git a/sherlock/model.py b/sherlock/model.py new file mode 100644 index 0000000..f8dd9b5 --- /dev/null +++ b/sherlock/model.py @@ -0,0 +1,56 @@ +from typing import List + +import numpy as np +import pandas as pd +from sklearn.base import ClassifierMixin, TransformerMixin +from sklearn.pipeline import Pipeline + +from sherlock import defaults +from sherlock.features import preprocessing + + +class SherlockTransformer(TransformerMixin): + def __init__(self): + self.feature_dict = defaults.DEFAULT_FEATURES_DICT + + def transform(self, X: pd.DataFrame): + X = preprocessing.extract_features(X).astype("float32") + return [X[cols].values for cols in self.feature_dict.values()] + + +class SherlockModel(ClassifierMixin): + def __init__(self): + self.encoder = defaults.DEFAULT_ENCODER + self.model = defaults.DEFAULT_MODEL + + def predict_proba(self, X: List[pd.DataFrame]) -> np.array: + return self.model.predict(X) + + def predict_log_proba(self, X: List[pd.DataFrame]) -> np.array: + return np.log(self.model.predict(X)) + + def predict(self, X: List[pd.DataFrame]) -> np.array: + y_pred = self.predict_proba(X) + y_pred_int = np.argmax(y_pred, axis=1) + return self.encoder.inverse_transform(y_pred_int) + + def fit(self, X: List[pd.DataFrame], y: pd.Series) -> np.array: + self.model.fit(X, y) + + +class SherlockPipeline(Pipeline): + def __init__(self): + steps = [("transformer", SherlockTransformer()), ("model", SherlockModel())] + super().__init__(steps) + + def named_proba(self, X: pd.DataFrame, top_n=None): + y_pred = self.predict_proba(X) + result = dict() + for i, col in enumerate(X.columns): + temp = sorted( + zip(self.steps[-1].encoder.classes_, y_pred[i]), + key=lambda item: item[1], + reverse=True, + ) + result[col] = temp[0:top_n] if top_n else temp + return result diff --git a/sherlock/deploy/train_sherlock.py b/sherlock/training.py similarity index 76% rename from sherlock/deploy/train_sherlock.py rename to sherlock/training.py index 17de34e..d34626b 100644 --- a/sherlock/deploy/train_sherlock.py +++ b/sherlock/training.py @@ -1,17 +1,16 @@ import numpy as np -import tensorflow as tf import pandas as pd - +import tensorflow as tf from sklearn.preprocessing import LabelEncoder -from sherlock.deploy import model_helpers +from sherlock import defaults SEED = 13 def _get_categorical_label_encodings(y_train, y_val, nn_id) -> (list, list): """Encode semantic type string labels as categoricals. - + Parameters ---------- y_train @@ -20,11 +19,11 @@ def _get_categorical_label_encodings(y_train, y_val, nn_id) -> (list, list): Validation labels. nn_id Identifier of retrained model. - + Returns ------- y_train_cat - Categorical encodings of train labels. + Categorical encodings of train labels. y_val_cat Categorical encodings of validation labels. 
""" @@ -48,7 +47,7 @@ def _get_categorical_label_encodings(y_train, y_val, nn_id) -> (list, list): def _save_retrained_sherlock_model(sherlock_model, nn_id: str): """Save weights of retrained sherlock model. - + Parameters ---------- sherlock_model @@ -72,7 +71,7 @@ def train_sherlock( nn_id: str, ): """Train weights of sherlock model from existing NN architecture. - + Parameters ---------- X_train @@ -86,40 +85,42 @@ def train_sherlock( nn_id Identifier for retrained model. """ - + if nn_id == "sherlock": raise ValueError( """nn_id cannot be equal to 'sherlock' to avoid overwriting pretrained model. """ ) - - feature_cols = model_helpers.categorize_features() + + feature_cols = defaults.default_features() y_train_cat, y_val_cat = _get_categorical_label_encodings(y_train, y_val, nn_id) - sherlock_model, callbacks = model_helpers.construct_sherlock_model(nn_id, False) + sherlock_model, callbacks = defaults.construct_sherlock_model(nn_id, False) print("Successfully loaded and compiled model, now fitting model on data.") sherlock_model.fit( [ - X_train[feature_cols['char']].values, - X_train[feature_cols['word']].values, - X_train[feature_cols['par']].values, - X_train[feature_cols['rest']].values, + X_train[feature_cols["char"]].values, + X_train[feature_cols["word"]].values, + X_train[feature_cols["par"]].values, + X_train[feature_cols["rest"]].values, ], y_train_cat, validation_data=( [ - X_val[feature_cols['char']].values, - X_val[feature_cols['word']].values, - X_val[feature_cols['par']].values, - X_val[feature_cols['rest']].values, + X_val[feature_cols["char"]].values, + X_val[feature_cols["word"]].values, + X_val[feature_cols["par"]].values, + X_val[feature_cols["rest"]].values, ], - y_val_cat + y_val_cat, ), - callbacks=callbacks, epochs=100, batch_size=256 + callbacks=callbacks, + epochs=100, + batch_size=256, ) _save_retrained_sherlock_model(sherlock_model, nn_id) - print('Retrained Sherlock.') + print("Retrained Sherlock.") From c71e9b97173541923b5068278e8d9c834f777519 Mon Sep 17 00:00:00 2001 From: Ian Eaves Date: Mon, 28 Jun 2021 14:13:38 -0500 Subject: [PATCH 2/2] entropy calculation fix --- sherlock/features/bag_of_words.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sherlock/features/bag_of_words.py b/sherlock/features/bag_of_words.py index ce6b7c3..5455801 100644 --- a/sherlock/features/bag_of_words.py +++ b/sherlock/features/bag_of_words.py @@ -16,7 +16,7 @@ def extract_bag_of_words_features(data): # Entropy of column freq_dist = nltk.FreqDist(data) probs = np.array([freq_dist.freq(item) for item in freq_dist]) - f["col_entropy"] = (probs * np.log2(probs)).sum() + f["col_entropy"] = -(probs * np.log2(probs)).sum() # Fraction of cells with unique content num_unique = len(freq_dist)