From 09c3c83111b7a8bd39b5e6e9ee52519155f040ca Mon Sep 17 00:00:00 2001 From: Ian Eaves Date: Mon, 28 Jun 2021 14:06:11 -0500 Subject: [PATCH 1/2] rewrite --- .gitignore | 1 + requirements.txt | 51 +----- setup.py | 4 +- sherlock/__init__.py | 5 + .../classes_retrain_minimal_sample.npy | Bin .../classes_retrained_sherlock.npy | Bin .../{deploy => data}/classes_sherlock.npy | Bin .../feature_column_identifiers/char_col.tsv | 0 .../feature_column_identifiers/par_col.tsv | 0 .../feature_column_identifiers/rest_col.tsv | 0 .../feature_column_identifiers/word_col.tsv | 0 .../par_vec_trained_400.pkl | Bin .../data}/retrain_minimal_sample_model.json | 0 {models => sherlock/data}/sherlock_model.json | 0 {models => sherlock/data}/sherlock_weights.h5 | Bin sherlock/defaults.py | 94 ++++++++++ sherlock/deploy/__init__.py | 0 sherlock/deploy/model_helpers.py | 58 ------ sherlock/deploy/predict_sherlock.py | 61 ------- sherlock/features/__init__.py | 3 + sherlock/features/bag_of_characters.py | 45 ++--- sherlock/features/bag_of_words.py | 103 ++++++----- sherlock/features/paragraph_vectors.py | 75 ++++---- sherlock/features/preprocessing.py | 172 ++++++------------ sherlock/features/word_embeddings.py | 90 ++++----- sherlock/helpers.py | 20 +- sherlock/model.py | 56 ++++++ .../{deploy/train_sherlock.py => training.py} | 47 ++--- 28 files changed, 427 insertions(+), 458 deletions(-) rename sherlock/{deploy => data}/classes_retrain_minimal_sample.npy (100%) rename sherlock/{deploy => data}/classes_retrained_sherlock.npy (100%) rename sherlock/{deploy => data}/classes_sherlock.npy (100%) rename sherlock/{features => data}/feature_column_identifiers/char_col.tsv (100%) rename sherlock/{features => data}/feature_column_identifiers/par_col.tsv (100%) rename sherlock/{features => data}/feature_column_identifiers/rest_col.tsv (100%) rename sherlock/{features => data}/feature_column_identifiers/word_col.tsv (100%) rename sherlock/{features => data}/par_vec_trained_400.pkl (100%) rename {models => sherlock/data}/retrain_minimal_sample_model.json (100%) rename {models => sherlock/data}/sherlock_model.json (100%) rename {models => sherlock/data}/sherlock_weights.h5 (100%) create mode 100644 sherlock/defaults.py delete mode 100644 sherlock/deploy/__init__.py delete mode 100644 sherlock/deploy/model_helpers.py delete mode 100644 sherlock/deploy/predict_sherlock.py create mode 100644 sherlock/model.py rename sherlock/{deploy/train_sherlock.py => training.py} (76%) diff --git a/.gitignore b/.gitignore index 266781b..089e942 100644 --- a/.gitignore +++ b/.gitignore @@ -74,6 +74,7 @@ target/ # Jupyter Notebook .ipynb_checkpoints +*.ipynb # pyenv .python-version diff --git a/requirements.txt b/requirements.txt index a64ddeb..b3a7354 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,43 +1,10 @@ -absl-py==0.7.1 -astor==0.8.0 -boto==2.49.0 -boto3==1.9.188 -botocore==1.12.188 -certifi==2019.6.16 -chardet==3.0.4 -docutils==0.14 -gast==0.2.2 -gensim==3.8.0 -google-pasta==0.1.7 +gensim>=3.8.0 googledrivedownloader==0.4 -grpcio==1.22.0 -h5py==2.9.0 -idna==2.8 -jmespath==0.9.4 -joblib==0.13.2 -Keras-Applications==1.0.8 -Keras-Preprocessing==1.1.0 -Markdown==3.1.1 -nltk==3.4.4 -matplotlib==2.2.5 -numpy==1.16.1 -pandas==0.24.2 -pre-commit==2.8.2 -protobuf==3.9.0 -pyarrow==2.0.0 -python-dateutil==2.8.0 -pytz==2019.1 -requests==2.22.0 -s3transfer==0.2.1 -scikit-learn==0.20 -six==1.12.0 -smart-open==1.8.4 -tensorboard==1.14.0 -tensorflow==1.14.0 -tensorflow-estimator==1.14.0 -termcolor==1.1.0 -tqdm==4.51.0 -urllib3==1.25.3 
-Werkzeug==0.15.4
-wrapt==1.11.2
--e .
+joblib>=0.13.2
+nltk>=3.4.4
+numpy~=1.21.0
+pandas~=1.2.5
+pre-commit~=2.8.2
+scikit-learn~=0.24.2
+tensorflow>=1.14.0
+scipy~=1.7.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 1ab3e39..9c07312 100644
--- a/setup.py
+++ b/setup.py
@@ -8,5 +8,5 @@
     description="Package for semantic type detection using Sherlock",
     url="https://github.com/mitmedialab/sherlock-project",
     packages=setuptools.find_packages(),
-    package_dir={"sherlock": "sherlock"}
-)
\ No newline at end of file
+    package_dir={"sherlock": "sherlock"},
+)
diff --git a/sherlock/__init__.py b/sherlock/__init__.py
index e69de29..86f08af 100644
--- a/sherlock/__init__.py
+++ b/sherlock/__init__.py
@@ -0,0 +1,5 @@
+from sherlock.helpers import *
+from sherlock.defaults import *
+from sherlock.features import *
+from sherlock.model import *
+from sherlock.training import *
diff --git a/sherlock/deploy/classes_retrain_minimal_sample.npy b/sherlock/data/classes_retrain_minimal_sample.npy
similarity index 100%
rename from sherlock/deploy/classes_retrain_minimal_sample.npy
rename to sherlock/data/classes_retrain_minimal_sample.npy
diff --git a/sherlock/deploy/classes_retrained_sherlock.npy b/sherlock/data/classes_retrained_sherlock.npy
similarity index 100%
rename from sherlock/deploy/classes_retrained_sherlock.npy
rename to sherlock/data/classes_retrained_sherlock.npy
diff --git a/sherlock/deploy/classes_sherlock.npy b/sherlock/data/classes_sherlock.npy
similarity index 100%
rename from sherlock/deploy/classes_sherlock.npy
rename to sherlock/data/classes_sherlock.npy
diff --git a/sherlock/features/feature_column_identifiers/char_col.tsv b/sherlock/data/feature_column_identifiers/char_col.tsv
similarity index 100%
rename from sherlock/features/feature_column_identifiers/char_col.tsv
rename to sherlock/data/feature_column_identifiers/char_col.tsv
diff --git a/sherlock/features/feature_column_identifiers/par_col.tsv b/sherlock/data/feature_column_identifiers/par_col.tsv
similarity index 100%
rename from sherlock/features/feature_column_identifiers/par_col.tsv
rename to sherlock/data/feature_column_identifiers/par_col.tsv
diff --git a/sherlock/features/feature_column_identifiers/rest_col.tsv b/sherlock/data/feature_column_identifiers/rest_col.tsv
similarity index 100%
rename from sherlock/features/feature_column_identifiers/rest_col.tsv
rename to sherlock/data/feature_column_identifiers/rest_col.tsv
diff --git a/sherlock/features/feature_column_identifiers/word_col.tsv b/sherlock/data/feature_column_identifiers/word_col.tsv
similarity index 100%
rename from sherlock/features/feature_column_identifiers/word_col.tsv
rename to sherlock/data/feature_column_identifiers/word_col.tsv
diff --git a/sherlock/features/par_vec_trained_400.pkl b/sherlock/data/par_vec_trained_400.pkl
similarity index 100%
rename from sherlock/features/par_vec_trained_400.pkl
rename to sherlock/data/par_vec_trained_400.pkl
diff --git a/models/retrain_minimal_sample_model.json b/sherlock/data/retrain_minimal_sample_model.json
similarity index 100%
rename from models/retrain_minimal_sample_model.json
rename to sherlock/data/retrain_minimal_sample_model.json
diff --git a/models/sherlock_model.json b/sherlock/data/sherlock_model.json
similarity index 100%
rename from models/sherlock_model.json
rename to sherlock/data/sherlock_model.json
diff --git a/models/sherlock_weights.h5 b/sherlock/data/sherlock_weights.h5
similarity index 100%
rename from models/sherlock_weights.h5
rename to sherlock/data/sherlock_weights.h5
diff 
--git a/sherlock/defaults.py b/sherlock/defaults.py new file mode 100644 index 0000000..cb2714b --- /dev/null +++ b/sherlock/defaults.py @@ -0,0 +1,94 @@ +import os +from collections import OrderedDict +from itertools import chain +from typing import Optional + +import numpy as np +import pandas as pd +import tensorflow as tf +from sklearn.preprocessing import LabelEncoder +from tensorflow.keras.callbacks import EarlyStopping +from tensorflow.keras.models import model_from_json + +from sherlock import make_data_path + +DEFAULT_FEATURE_ORDER = ["char", "word", "par", "rest"] + + +def default_features() -> OrderedDict: + """Get feature identifiers per feature set, to map features to feature sets. + + Returns + ------- + feature_cols_dict + Dictionary with lists of feature identifiers per feature set. + """ + feature_cols_dict = OrderedDict() + feature_path = make_data_path("feature_column_identifiers") + + for feature_set in DEFAULT_FEATURE_ORDER: + feature_file = os.path.join(feature_path, f"{feature_set}_col.tsv") + feature_data = pd.read_csv( + feature_file, sep="\t", index_col=0, header=None, squeeze=True + ) + feature_cols_dict[feature_set] = feature_data.to_list() + return feature_cols_dict + + +def default_encoder(): + encoder = LabelEncoder() + class_file_path = make_data_path("classes_sherlock.npy") + encoder.classes_ = np.load(class_file_path, allow_pickle=True) + return encoder + + +def construct_model( + model_path: Optional[str] = None, + weight_path: Optional[str] = None, + with_weights: bool = True, +): + """Load model architecture and populate with pretrained weights. + + Parameters + ---------- + model_path + Location of model file + weight_path + Location of weight file + with_weights + Whether to populate the model with trained weights. + + Returns + ------- + model + Compiled model. + callbacks + Callback configuration for model retraining. + """ + if model_path is None: + model_path = make_data_path("sherlock_model.json") + + if weight_path is None: + weight_path = make_data_path("sherlock_weights.h5") + + with open(model_path, "r") as model_file: + model = model_from_json(model_file.read()) + + if with_weights: + model.load_weights(weight_path) + + learning_rate = 0.0001 + callbacks = [EarlyStopping(monitor="val_loss", patience=5)] + model.compile( + optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), + loss="categorical_crossentropy", + metrics=["categorical_accuracy"], + ) + + return model, callbacks + + +DEFAULT_FEATURES_DICT: OrderedDict = default_features() +DEFAULT_FEATURES = list(chain(*[cols for cols in DEFAULT_FEATURES_DICT.values()])) +DEFAULT_ENCODER = default_encoder() +DEFAULT_MODEL, DEFAULT_CALLBACKS = construct_model() diff --git a/sherlock/deploy/__init__.py b/sherlock/deploy/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/sherlock/deploy/model_helpers.py b/sherlock/deploy/model_helpers.py deleted file mode 100644 index 50b2da4..0000000 --- a/sherlock/deploy/model_helpers.py +++ /dev/null @@ -1,58 +0,0 @@ -import pandas as pd -import tensorflow as tf -from tensorflow.keras.callbacks import EarlyStopping -from tensorflow.keras.models import model_from_json - - -def categorize_features() -> dict: - """Get feature identifiers per feature set, to map features to feature sets. - - Returns - ------- - feature_cols_dict - Dictionary with lists of feature identifiers per feature set. 
- """ - feature_cols_dict = {} - for feature_set in ['char', 'word', 'par', 'rest']: - feature_cols_dict[feature_set] = pd.read_csv( - f"../sherlock/features/feature_column_identifiers/{feature_set}_col.tsv", - sep='\t', index_col=0, header=None, squeeze=True, - ).to_list() - return feature_cols_dict - - -def construct_sherlock_model(nn_id: str, with_weights: bool): - """Load model architecture and populate with pretrained weights. - - Parameters - ---------- - nn_id - Identifier for retrained model. - with_weights - Whether to populate the model with trained weights. - - Returns - ------- - sherlock_model - Compiled sherlock model. - callbacks - Callback configuration for model retraining. - """ - - lr = 0.0001 - callbacks = [EarlyStopping(monitor="val_loss", patience=5)] - - file = open(f"../models/sherlock_model.json", "r") - sherlock_model = model_from_json(file.read()) - file.close() - - if with_weights: - sherlock_model.load_weights(f"../models/{nn_id}_weights.h5") - - sherlock_model.compile( - optimizer=tf.keras.optimizers.Adam(lr=lr), - loss='categorical_crossentropy', - metrics=['categorical_accuracy'] - ) - - return sherlock_model, callbacks diff --git a/sherlock/deploy/predict_sherlock.py b/sherlock/deploy/predict_sherlock.py deleted file mode 100644 index 031dbd0..0000000 --- a/sherlock/deploy/predict_sherlock.py +++ /dev/null @@ -1,61 +0,0 @@ -import numpy as np -import pandas as pd -import tensorflow as tf - -from sklearn.preprocessing import LabelEncoder - -from sherlock.deploy import model_helpers - - -def _transform_predictions_to_classes(y_pred, nn_id) -> np.array: - """Get predicted semantic types from prediction vectors. - - Parameters - ---------- - y_pred - Nested vector with for each sample a vector of likelihoods per semantic type. - nn_id - Identifier of model to use. - - Returns - ------- - y_pred - Predicted semantic labels. - """ - y_pred_int = np.argmax(y_pred, axis=1) - encoder = LabelEncoder() - encoder.classes_ = np.load( - f"../sherlock/deploy/classes_{nn_id}.npy", - allow_pickle=True - ) - y_pred = encoder.inverse_transform(y_pred_int) - - return y_pred - - -def predict_sherlock(X: pd.DataFrame, nn_id: str) -> np.array: - """Use sherlock model to generate predictions for X. - - Parameters - ---------- - X - Featurized data set to generate predictions for. - nn_id - Identifier of a trained model to use for generating predictions. - - Returns - ------- - Array with predictions for X. 
- """ - sherlock_model, _ = model_helpers.construct_sherlock_model(nn_id, with_weights=True) - feature_cols_dict = model_helpers.categorize_features() - y_pred = sherlock_model.predict( - [ - X[feature_cols_dict['char']].values, - X[feature_cols_dict['word']].values, - X[feature_cols_dict['par']].values, - X[feature_cols_dict['rest']].values - ] - ) - - return _transform_predictions_to_classes(y_pred, nn_id) diff --git a/sherlock/features/__init__.py b/sherlock/features/__init__.py index e69de29..4082c48 100644 --- a/sherlock/features/__init__.py +++ b/sherlock/features/__init__.py @@ -0,0 +1,3 @@ +from sherlock.features import (bag_of_characters, bag_of_words, + paragraph_vectors, preprocessing, + word_embeddings) diff --git a/sherlock/features/bag_of_characters.py b/sherlock/features/bag_of_characters.py index 9fc3793..3365df1 100644 --- a/sherlock/features/bag_of_characters.py +++ b/sherlock/features/bag_of_characters.py @@ -1,41 +1,34 @@ import string -import numpy as np -from scipy.stats import skew, kurtosis from collections import OrderedDict +import numpy as np +from scipy.stats import kurtosis, skew + +ignore_chars = {"\n", "\\", "\v", "\r", "\t", "^"} +characters_to_check = [f"[{c}]" for c in string.printable if c not in ignore_chars] +characters_to_check.extend(["[\\\\]", "[\^]"]) + # Input: a single column in the form of pandas series # Output: ordered dictionary holding bag of character features def extract_bag_of_characters_features(data): - - characters_to_check = ( - ['['+ c + ']' for c in string.printable if c not in ('\n', '\\', '\v', '\r', '\t', '^')] - + ['[\\\\]', '[\^]'] - ) - f = OrderedDict() - data_no_null = data.dropna() all_value_features = OrderedDict() for c in characters_to_check: - all_value_features['n_{}'.format(c)] = data_no_null.str.count(c) - + all_value_features[f"n_{c}"] = data.str.count(c) + for value_feature_name, value_features in all_value_features.items(): - f['{}-agg-any'.format(value_feature_name)] = any(value_features) - f['{}-agg-all'.format(value_feature_name)] = all(value_features) - f['{}-agg-mean'.format(value_feature_name)] = np.mean(value_features) - f['{}-agg-var'.format(value_feature_name)] = np.var(value_features) - f['{}-agg-min'.format(value_feature_name)] = np.min(value_features) - f['{}-agg-max'.format(value_feature_name)] = np.max(value_features) - f['{}-agg-median'.format(value_feature_name)] = np.median(value_features) - f['{}-agg-sum'.format(value_feature_name)] = np.sum(value_features) - f['{}-agg-kurtosis'.format(value_feature_name)] = kurtosis(value_features) - f['{}-agg-skewness'.format(value_feature_name)] = skew(value_features) + f[f"{value_feature_name}-agg-any"] = any(value_features) + f[f"{value_feature_name}-agg-all"] = all(value_features) + f[f"{value_feature_name}-agg-mean"] = np.mean(value_features) + f[f"{value_feature_name}-agg-var"] = np.var(value_features) + f[f"{value_feature_name}-agg-min"] = np.min(value_features) + f[f"{value_feature_name}-agg-max"] = np.max(value_features) + f[f"{value_feature_name}-agg-median"] = np.median(value_features) + f[f"{value_feature_name}-agg-sum"] = np.sum(value_features) + f[f"{value_feature_name}-agg-kurtosis"] = kurtosis(value_features) + f[f"{value_feature_name}-agg-skewness"] = skew(value_features) return f - - - - - diff --git a/sherlock/features/bag_of_words.py b/sherlock/features/bag_of_words.py index 53d961f..ce6b7c3 100644 --- a/sherlock/features/bag_of_words.py +++ b/sherlock/features/bag_of_words.py @@ -1,80 +1,79 @@ import math +from collections import OrderedDict 
+ import nltk import numpy as np -from scipy.stats import skew, kurtosis -from collections import OrderedDict +from scipy.stats import kurtosis, skew # Input: a single column in the form of a pandas series # Output: ordered dictionary holding bag of words features -def extract_bag_of_words_features(data, n_val): - +def extract_bag_of_words_features(data): + f = OrderedDict() - data = data.dropna() - - #n_val = data.size - - if not n_val: return - + n_val = data.shape[0] + # Entropy of column freq_dist = nltk.FreqDist(data) - probs = [freq_dist.freq(l) for l in freq_dist] - f['col_entropy'] = -sum(p * math.log(p,2) for p in probs) + probs = np.array([freq_dist.freq(item) for item in freq_dist]) + f["col_entropy"] = (probs * np.log2(probs)).sum() # Fraction of cells with unique content - num_unique = data.nunique() - f['frac_unique'] = num_unique / n_val + num_unique = len(freq_dist) + f["frac_unique"] = num_unique / n_val # Fraction of cells with numeric content -> frac text cells doesn't add information - num_cells = np.sum(data.str.contains('[0-9]', regex=True)) - text_cells = np.sum(data.str.contains('[a-z]|[A-Z]', regex=True)) - f['frac_numcells'] = num_cells / n_val - f['frac_textcells'] = text_cells / n_val - + num_cells = data.str.count("[0-9]") + text_cells = data.str.count("[a-z]|[A-Z]") + f["frac_numcells"] = num_cells[num_cells > 0].shape[0] / n_val + f["frac_textcells"] = text_cells[text_cells > 0].shape[0] / n_val + # Average + std number of numeric tokens in cells - num_reg = '[0-9]' - f['avg_num_cells'] = np.mean(data.str.count(num_reg)) - f['std_num_cells'] = np.std(data.str.count(num_reg)) - + num_agg = num_cells.agg(["mean", "std"]).to_dict() + f["avg_num_cells"] = num_agg["mean"] + f["std_num_cells"] = num_agg["std"] + # Average + std number of textual tokens in cells - text_reg = '[a-z]|[A-Z]' - f['avg_text_cells'] = np.mean(data.str.count(text_reg)) - f['std_text_cells'] = np.std(data.str.count(text_reg)) - + text_agg = text_cells.agg(["mean", "std"]).to_dict() + f["avg_text_cells"] = text_agg["mean"] + f["std_text_cells"] = text_agg["std"] + # Average + std number of special characters in each cell spec_reg = '[[!@#$%^&*(),.?":{}|<>]]' - f['avg_spec_cells'] = np.mean(data.str.count(spec_reg)) - f['std_spec_cells'] = np.std(data.str.count(spec_reg)) - + spec_agg = data.str.count(spec_reg).agg(["mean", "std"]).to_dict() + f["avg_spec_cells"] = spec_agg["mean"] + f["std_spec_cells"] = spec_agg["std"] + # Average number of words in each cell space_reg = '[" "]' - f['avg_word_cells'] = np.mean(data.str.count(space_reg) + 1) - f['std_word_cells'] = np.std(data.str.count(space_reg) + 1) + word_agg = (data.str.count(space_reg) + 1).agg(["mean", "std"]).to_dict() + f["avg_word_cells"] = word_agg["mean"] + f["std_word_cells"] = word_agg["std"] all_value_features = OrderedDict() - data_no_null = data.dropna() + data_no_null = data.dropna() if data.hasnans else data - f['n_values'] = n_val + f["n_values"] = n_val - all_value_features['length'] = data_no_null.apply(len) + all_value_features["length"] = data_no_null.apply(len) for value_feature_name, value_features in all_value_features.items(): - f['{}-agg-any'.format(value_feature_name)] = any(value_features) - f['{}-agg-all'.format(value_feature_name)] = all(value_features) - f['{}-agg-mean'.format(value_feature_name)] = np.mean(value_features) - f['{}-agg-var'.format(value_feature_name)] = np.var(value_features) - f['{}-agg-min'.format(value_feature_name)] = np.min(value_features) - f['{}-agg-max'.format(value_feature_name)] = 
np.max(value_features) - f['{}-agg-median'.format(value_feature_name)] = np.median(value_features) - f['{}-agg-sum'.format(value_feature_name)] = np.sum(value_features) - f['{}-agg-kurtosis'.format(value_feature_name)] = kurtosis(value_features) - f['{}-agg-skewness'.format(value_feature_name)] = skew(value_features) - - n_none = data.size - data_no_null.size - len([ e for e in data if e == '']) - f['none-agg-has'] = n_none > 0 - f['none-agg-percent'] = n_none / len(data) - f['none-agg-num'] = n_none - f['none-agg-all'] = (n_none == len(data)) - + f["{}-agg-any".format(value_feature_name)] = any(value_features) + f["{}-agg-all".format(value_feature_name)] = all(value_features) + f["{}-agg-mean".format(value_feature_name)] = np.mean(value_features) + f["{}-agg-var".format(value_feature_name)] = np.var(value_features) + f["{}-agg-min".format(value_feature_name)] = np.min(value_features) + f["{}-agg-max".format(value_feature_name)] = np.max(value_features) + f["{}-agg-median".format(value_feature_name)] = np.median(value_features) + f["{}-agg-sum".format(value_feature_name)] = np.sum(value_features) + f["{}-agg-kurtosis".format(value_feature_name)] = kurtosis(value_features) + f["{}-agg-skewness".format(value_feature_name)] = skew(value_features) + + n_none = data.size - data_no_null.size - (data == "").sum() + f["none-agg-has"] = n_none > 0 + f["none-agg-percent"] = n_none / len(data) + f["none-agg-num"] = n_none + f["none-agg-all"] = n_none == len(data) + return f diff --git a/sherlock/features/paragraph_vectors.py b/sherlock/features/paragraph_vectors.py index 551042d..4affbab 100644 --- a/sherlock/features/paragraph_vectors.py +++ b/sherlock/features/paragraph_vectors.py @@ -1,19 +1,34 @@ -import pandas as pd import random +from collections import OrderedDict +from functools import cache +from typing import Union +import numpy as np +import pandas as pd from gensim.models.doc2vec import Doc2Vec, TaggedDocument +from sherlock import make_data_path + + +@cache +def get_paragraph_vector_model(): + par_vec_file = make_data_path(f"par_vec_trained_{400}.pkl") + paragraph_vector_model = Doc2Vec.load(par_vec_file) + return paragraph_vector_model + # Input: a collection of columns stored in a dataframe column 'values' # Output: tagged columns. # Only needed for training. def tagcol_paragraph_embeddings_features(train_data): - # Expects a dataframe with a 'values' column - train_data_values = train_data['values'] + train_data_values = train_data["values"] random.seed(13) - columns = [TaggedDocument(random.sample(col, min(1000, len(col))), [i]) for i, col in enumerate(train_data_values.values)] - + columns = [ + TaggedDocument(random.sample(col, min(1000, len(col))), [i]) + for i, col in enumerate(train_data_values.values) + ] + return columns @@ -21,39 +36,37 @@ def tagcol_paragraph_embeddings_features(train_data): # Output: a stored retrained model # Only needed for training. 
def train_paragraph_embeddings_features(columns, dim): - # Train Doc2Vec model - model = Doc2Vec(columns, dm=0, negative=3, workers=8, vector_size=dim, epochs=20, min_count=2, seed=13) + model = Doc2Vec( + columns, + dm=0, + negative=3, + workers=8, + vector_size=dim, + epochs=20, + min_count=2, + seed=13, + ) # Save trained model - model_file = '../sherlock/features/par_vec_retrained_{}.pkl'.format(dim) + model_file = make_data_path(f"par_vec_retrained_{dim}.pkl") model.save(model_file) model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True) - + # Input: a single column in the form of a pandas Series. # Output: ordered dictionary holding paragraph vector features -def infer_paragraph_embeddings_features(data, dim): - - # Load pretrained paragraph vector model - model = Doc2Vec.load('../sherlock/features/par_vec_trained_{}.pkl'.format(dim)) - - f = pd.DataFrame() - - if len(data) > 1000: - random.seed(13) - vec = random.sample(data, 1000) - else: - vec = data - - # Infer paragraph vector for data sample - f = f.append(pd.Series(model.infer_vector(vec, steps=20, - alpha=0.025)), ignore_index=True) +def infer_paragraph_embeddings_features( + data: Union[np.array, pd.Series, list] +) -> OrderedDict: + # pandas and numpy + if not isinstance(data, list): + data = data.tolist() - col_names = [] - for i, col in enumerate(f): - col_names.append('par_vec_{}'.format(i)) - - f.columns = col_names + model = get_paragraph_vector_model() + embedding = model.infer_vector(data, steps=20, alpha=0.025) - return f + res = OrderedDict() + for i, v in enumerate(embedding): + res[f"par_vec_{i}"] = v + return res diff --git a/sherlock/features/preprocessing.py b/sherlock/features/preprocessing.py index 90aa190..25eea84 100644 --- a/sherlock/features/preprocessing.py +++ b/sherlock/features/preprocessing.py @@ -1,39 +1,38 @@ +import os +import random from ast import literal_eval from collections import OrderedDict -import random -import os -from typing import Union +from functools import cache +from typing import Optional, Union -from google_drive_downloader import GoogleDriveDownloader as gd import numpy as np import pandas as pd -from tqdm import tqdm +from google_drive_downloader import GoogleDriveDownloader as gd -from sherlock.features.bag_of_characters import extract_bag_of_characters_features +from sherlock import make_data_path +from sherlock.features.bag_of_characters import \ + extract_bag_of_characters_features from sherlock.features.bag_of_words import extract_bag_of_words_features +from sherlock.features.paragraph_vectors import \ + infer_paragraph_embeddings_features from sherlock.features.word_embeddings import extract_word_embeddings_features -from sherlock.features.paragraph_vectors import infer_paragraph_embeddings_features def prepare_feature_extraction(): """Download embedding files from Google Drive if they do not exist yet.""" - word_embedding_file = '../sherlock/features/glove.6B.50d.txt' - paragraph_vector_file = '../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy' - - print( - f"""Preparing feature extraction by downloading 2 files: - \n {word_embedding_file} and \n {paragraph_vector_file}. 
- """ + word_embedding_file = make_data_path("glove.6B.50d.txt") + paragraph_vector_file = make_data_path( + "par_vec_trained_400.pkl.docvecs.vectors_docs.npy" ) if not os.path.exists(word_embedding_file): - print('Downloading GloVe word embedding vectors.') + print("Downloading GloVe word embedding vectors.") file_name = word_embedding_file gd.download_file_from_google_drive( - file_id='1kayd5oNRQm8-NCvA8pIrtezbQ-B1_Vmk', + file_id="1kayd5oNRQm8-NCvA8pIrtezbQ-B1_Vmk", dest_path=file_name, unzip=False, - showsize=True + showsize=True, ) print("GloVe word embedding vectors were downloaded.") @@ -42,125 +41,68 @@ def prepare_feature_extraction(): print("Downloading pretrained paragraph vectors.") file_name = paragraph_vector_file gd.download_file_from_google_drive( - file_id='1vdyGJ4aB71FCaNqJKYX387eVufcH4SAu', + file_id="1vdyGJ4aB71FCaNqJKYX387eVufcH4SAu", dest_path=file_name, unzip=False, - showsize=True + showsize=True, ) - + print("Trained paragraph vector model was downloaded.") - - print("All files for extracting word and paragraph embeddings are present.") - -def prepare_word_embeddings(): - word_vectors_f = open('../sherlock/features/glove.6B.50d.txt', encoding='utf-8') + +@cache +def prepare_word_embeddings(): + word_vectors_f = open(make_data_path("glove.6B.50d.txt"), encoding="utf-8") word_to_embedding = {} for w in word_vectors_f: - - term, vector = w.strip().split(' ', 1) - vector = np.array(vector.split(' '), dtype=float) + term, vector = w.strip().split(" ", 1) + vector = np.array(vector.split(" "), dtype=float) word_to_embedding[term] = vector return word_to_embedding - -def convert_string_lists_to_lists( - data: Union[pd.DataFrame, pd.Series], - labels: Union[pd.DataFrame, pd.Series], - data_column_name: str = None, - labels_column_name: str = None, -) -> pd.Series: - """Convert strings of arrays with values to arrays of strings of values. - Each row in de dataframe or series corresponds to a column, represented by a string of a list. - Each string-list will be converted to a list with string values. - - Parameters - ---------- - data - Data to convert column from. - labels - Labels of each row corresponding to semantic column type. - data_column_name - Name of column of the data to convert. - labels_column_name - Name of column with the labels to convert. - - Returns - ------- - converted_data - Series with all rows a list of string values. - converted_labels - List with labels. 
- """ - tqdm.pandas() - - if isinstance(data, pd.DataFrame): - if data_column_name is None: raise ValueError("Missing column name of data.") - converted_data = data[data_column_name].progress_apply(literal_eval) - elif isinstance(data, pd.Series): - converted_data = data.progress_apply(literal_eval) - else: - raise TypeError("Unexpected data type of samples.") - - if isinstance(labels, pd.DataFrame): - if labels_column_name is None: raise ValueError("Missing column name of labels.") - converted_labels = labels[labels_column_name].to_list() - elif isinstance(labels, pd.Series): - converted_labels = labels.to_list() - else: - raise TypeError("Unexpected data type of labels.") - - return converted_data, converted_labels - - -def extract_features(data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame: + +def extract_feature(data: pd.Series, n_samples=None): + word_to_embedding = prepare_word_embeddings() + + if n_samples: + random.seed(13) + n_samples = min(data.shape[0], n_samples) + data = pd.Series(random.choices(data, k=n_samples)) + + not_na_idx = data.notna() if data.hasnans else np.array([True] * data.shape[0]) + data[not_na_idx] = data[not_na_idx].astype(str) + + f = ( + extract_bag_of_characters_features(data[not_na_idx]) + | extract_word_embeddings_features(data[not_na_idx], word_to_embedding) + | extract_bag_of_words_features(data) + | infer_paragraph_embeddings_features(data[not_na_idx]) + ) + + return f + + +def extract_features( + data: Union[pd.DataFrame, pd.Series], n_samples: Optional[int] = None +) -> pd.DataFrame: """Extract features from raw data. - + Parameters ---------- data A pandas DataFrame or Series with each row a list of string values. - + n_samples + An optional integer indicating the number of samples to use for feature extraction + Returns ------- DataFrame with featurized column samples. 
""" prepare_feature_extraction() - word_to_embedding = prepare_word_embeddings() - - features_list = [] - df_par = pd.DataFrame() - n_samples = 1000 - vec_dim = 400 - i = 0 - for raw_sample in data: - - i = i + 1 - if i % 100 == 0: - print(f"Extracting features for data column: {i}") - - n_values = len(raw_sample) - - if n_samples > n_values: - n_samples = n_values - - random.seed(13) - raw_sample = pd.Series(random.choices(raw_sample, k=n_samples)).astype(str) - - f = OrderedDict( - list(extract_bag_of_characters_features(raw_sample).items()) + - list(extract_word_embeddings_features(raw_sample, word_to_embedding).items()) + - list(extract_bag_of_words_features(raw_sample, n_values).items()) - ) - features_list.append(f) + if isinstance(data, pd.Series): + return pd.DataFrame([extract_feature(data, n_samples)]) - df_par = df_par.append(infer_paragraph_embeddings_features(raw_sample, vec_dim)) - - return pd.concat( - [pd.DataFrame(features_list).reset_index(drop=True), df_par.reset_index(drop=True)], - axis=1, - sort=False - ) + return pd.DataFrame([extract_feature(data[col], n_samples) for col in data.columns]) diff --git a/sherlock/features/word_embeddings.py b/sherlock/features/word_embeddings.py index 35eb0f8..a3c2d75 100644 --- a/sherlock/features/word_embeddings.py +++ b/sherlock/features/word_embeddings.py @@ -1,57 +1,65 @@ -import numpy as np - -from scipy import stats from collections import OrderedDict +import numpy as np +import pandas as pd +from scipy import stats -# Input: a single column in the form of a pandas series -# Output: ordered dictionary holding word embedding features -def extract_word_embeddings_features(values, word_to_embedding): - - num_embeddings = 50 - f = OrderedDict() - embeddings = [] +NUM_EMBEDDINGS = 50 - values = values.dropna() - for v in values: +def make_default_response(): + default_response = OrderedDict() + default_response.setdefault("word_embedding_feature", 0) + for i in range(NUM_EMBEDDINGS): + default_response.setdefault("word_embedding_avg_{}".format(i), np.nan) + for i in range(NUM_EMBEDDINGS): + default_response.setdefault("word_embedding_std_{}".format(i), np.nan) + for i in range(NUM_EMBEDDINGS): + default_response.setdefault("word_embedding_med_{}".format(i), np.nan) + for i in range(NUM_EMBEDDINGS): + default_response.setdefault("word_embedding_mode_{}".format(i), np.nan) + return default_response - v = str(v).lower() - if v in word_to_embedding: - embeddings.append(word_to_embedding.get(v)) +def embedding_getter(word_to_embedding): + def inner(value: str): + if value in word_to_embedding: + embedding = word_to_embedding.get(value) else: - words = v.split(' ') - embeddings_to_all_words = [] + embeddings = [ + word_to_embedding.get(v) + for v in value.split(" ") + if v in word_to_embedding + ] + embedding = np.mean(embeddings, axis=0) + return embedding - for w in words: - if w in word_to_embedding: - embeddings_to_all_words.append(word_to_embedding.get(w)) - if embeddings_to_all_words: - mean_of_word_embeddings = np.nanmean(embeddings_to_all_words, axis=0) - embeddings.append(mean_of_word_embeddings) + return inner - if len(embeddings) == 0: - for i in range(num_embeddings): f['word_embedding_avg_{}'.format(i)] = np.nan - for i in range(num_embeddings): f['word_embedding_std_{}'.format(i)] = np.nan - for i in range(num_embeddings): f['word_embedding_med_{}'.format(i)] = np.nan - for i in range(num_embeddings): f['word_embedding_mode_{}'.format(i)] = np.nan - f['word_embedding_feature'] = 0 +# Input: a single column in the form of a 
pandas series +# Output: ordered dictionary holding word embedding features +def extract_word_embeddings_features( + values: pd.Series, word_to_embedding: dict +) -> OrderedDict: + f = make_default_response() - return f + get_embedding = embedding_getter(word_to_embedding) + embeddings = [get_embedding(val) for val in values.str.lower()] - else: - mean_embeddings = np.nanmean(embeddings, axis=0) - med_embeddings = np.nanmedian(embeddings, axis=0) - std_embeddings = np.nanstd(embeddings, axis=0) - mode_embeddings = stats.mode(embeddings, axis=0, nan_policy='omit')[0].flatten() + if len(embeddings) == 0: + return f - for i, e in enumerate(mean_embeddings): f['word_embedding_avg_{}'.format(i)] = e - for i, e in enumerate(std_embeddings): f['word_embedding_std_{}'.format(i)] = e - for i, e in enumerate(med_embeddings): f['word_embedding_med_{}'.format(i)] = e - for i, e in enumerate(mode_embeddings): f['word_embedding_mode_{}'.format(i)] = e + f["word_embedding_feature"] = 1 + mean_embeddings = np.nanmean(embeddings, axis=0) + med_embeddings = np.nanmedian(embeddings, axis=0) + std_embeddings = np.nanstd(embeddings, axis=0) + mode_embeddings = stats.mode(embeddings, axis=0, nan_policy="omit")[0].flatten() - f['word_embedding_feature'] = 1 + for i in range(NUM_EMBEDDINGS): + f[f"word_embedding_avg_{i}"] = mean_embeddings[i] + f[f"word_embedding_std_{i}"] = std_embeddings[i] + f[f"word_embedding_med_{i}"] = med_embeddings[i] + f[f"word_embedding_mode_{i}"] = mode_embeddings[i] - return f + return f diff --git a/sherlock/helpers.py b/sherlock/helpers.py index fe6b6c5..ea2013a 100644 --- a/sherlock/helpers.py +++ b/sherlock/helpers.py @@ -1,23 +1,29 @@ import os +import pkg_resources from google_drive_downloader import GoogleDriveDownloader as gd +DATA_PATH = pkg_resources.resource_filename("sherlock", "data/") + + +def make_data_path(path): + return os.path.join(DATA_PATH, path) + def download_data(): """Download raw and preprocessed data files. The data is downloaded from Google Drive and stored in the 'data/' directory. 
""" - data_dir = '../data/data.zip' + data_dir = make_data_path("data.zip") print(f"Downloading the raw and preprocessed data into {data_dir}.") if not os.path.exists(data_dir): - print('Downloading data directory.') - dir_name = data_dir + print("Downloading data directory.") gd.download_file_from_google_drive( - file_id='1-g0zbKFAXz7zKZc0Dnh74uDBpZCv4YqU', - dest_path=dir_name, + file_id="1-g0zbKFAXz7zKZc0Dnh74uDBpZCv4YqU", + dest_path=data_dir, unzip=True, - showsize=True + showsize=True, ) - print('Data was downloaded.') \ No newline at end of file + print("Data was downloaded.") diff --git a/sherlock/model.py b/sherlock/model.py new file mode 100644 index 0000000..f8dd9b5 --- /dev/null +++ b/sherlock/model.py @@ -0,0 +1,56 @@ +from typing import List + +import numpy as np +import pandas as pd +from sklearn.base import ClassifierMixin, TransformerMixin +from sklearn.pipeline import Pipeline + +from sherlock import defaults +from sherlock.features import preprocessing + + +class SherlockTransformer(TransformerMixin): + def __init__(self): + self.feature_dict = defaults.DEFAULT_FEATURES_DICT + + def transform(self, X: pd.DataFrame): + X = preprocessing.extract_features(X).astype("float32") + return [X[cols].values for cols in self.feature_dict.values()] + + +class SherlockModel(ClassifierMixin): + def __init__(self): + self.encoder = defaults.DEFAULT_ENCODER + self.model = defaults.DEFAULT_MODEL + + def predict_proba(self, X: List[pd.DataFrame]) -> np.array: + return self.model.predict(X) + + def predict_log_proba(self, X: List[pd.DataFrame]) -> np.array: + return np.log(self.model.predict(X)) + + def predict(self, X: List[pd.DataFrame]) -> np.array: + y_pred = self.predict_proba(X) + y_pred_int = np.argmax(y_pred, axis=1) + return self.encoder.inverse_transform(y_pred_int) + + def fit(self, X: List[pd.DataFrame], y: pd.Series) -> np.array: + self.model.fit(X, y) + + +class SherlockPipeline(Pipeline): + def __init__(self): + steps = [("transformer", SherlockTransformer()), ("model", SherlockModel())] + super().__init__(steps) + + def named_proba(self, X: pd.DataFrame, top_n=None): + y_pred = self.predict_proba(X) + result = dict() + for i, col in enumerate(X.columns): + temp = sorted( + zip(self.steps[-1].encoder.classes_, y_pred[i]), + key=lambda item: item[1], + reverse=True, + ) + result[col] = temp[0:top_n] if top_n else temp + return result diff --git a/sherlock/deploy/train_sherlock.py b/sherlock/training.py similarity index 76% rename from sherlock/deploy/train_sherlock.py rename to sherlock/training.py index 17de34e..d34626b 100644 --- a/sherlock/deploy/train_sherlock.py +++ b/sherlock/training.py @@ -1,17 +1,16 @@ import numpy as np -import tensorflow as tf import pandas as pd - +import tensorflow as tf from sklearn.preprocessing import LabelEncoder -from sherlock.deploy import model_helpers +from sherlock import defaults SEED = 13 def _get_categorical_label_encodings(y_train, y_val, nn_id) -> (list, list): """Encode semantic type string labels as categoricals. - + Parameters ---------- y_train @@ -20,11 +19,11 @@ def _get_categorical_label_encodings(y_train, y_val, nn_id) -> (list, list): Validation labels. nn_id Identifier of retrained model. - + Returns ------- y_train_cat - Categorical encodings of train labels. + Categorical encodings of train labels. y_val_cat Categorical encodings of validation labels. 
""" @@ -48,7 +47,7 @@ def _get_categorical_label_encodings(y_train, y_val, nn_id) -> (list, list): def _save_retrained_sherlock_model(sherlock_model, nn_id: str): """Save weights of retrained sherlock model. - + Parameters ---------- sherlock_model @@ -72,7 +71,7 @@ def train_sherlock( nn_id: str, ): """Train weights of sherlock model from existing NN architecture. - + Parameters ---------- X_train @@ -86,40 +85,42 @@ def train_sherlock( nn_id Identifier for retrained model. """ - + if nn_id == "sherlock": raise ValueError( """nn_id cannot be equal to 'sherlock' to avoid overwriting pretrained model. """ ) - - feature_cols = model_helpers.categorize_features() + + feature_cols = defaults.default_features() y_train_cat, y_val_cat = _get_categorical_label_encodings(y_train, y_val, nn_id) - sherlock_model, callbacks = model_helpers.construct_sherlock_model(nn_id, False) + sherlock_model, callbacks = defaults.construct_sherlock_model(nn_id, False) print("Successfully loaded and compiled model, now fitting model on data.") sherlock_model.fit( [ - X_train[feature_cols['char']].values, - X_train[feature_cols['word']].values, - X_train[feature_cols['par']].values, - X_train[feature_cols['rest']].values, + X_train[feature_cols["char"]].values, + X_train[feature_cols["word"]].values, + X_train[feature_cols["par"]].values, + X_train[feature_cols["rest"]].values, ], y_train_cat, validation_data=( [ - X_val[feature_cols['char']].values, - X_val[feature_cols['word']].values, - X_val[feature_cols['par']].values, - X_val[feature_cols['rest']].values, + X_val[feature_cols["char"]].values, + X_val[feature_cols["word"]].values, + X_val[feature_cols["par"]].values, + X_val[feature_cols["rest"]].values, ], - y_val_cat + y_val_cat, ), - callbacks=callbacks, epochs=100, batch_size=256 + callbacks=callbacks, + epochs=100, + batch_size=256, ) _save_retrained_sherlock_model(sherlock_model, nn_id) - print('Retrained Sherlock.') + print("Retrained Sherlock.") From c71e9b97173541923b5068278e8d9c834f777519 Mon Sep 17 00:00:00 2001 From: Ian Eaves Date: Mon, 28 Jun 2021 14:13:38 -0500 Subject: [PATCH 2/2] entropy calculation fix --- sherlock/features/bag_of_words.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sherlock/features/bag_of_words.py b/sherlock/features/bag_of_words.py index ce6b7c3..5455801 100644 --- a/sherlock/features/bag_of_words.py +++ b/sherlock/features/bag_of_words.py @@ -16,7 +16,7 @@ def extract_bag_of_words_features(data): # Entropy of column freq_dist = nltk.FreqDist(data) probs = np.array([freq_dist.freq(item) for item in freq_dist]) - f["col_entropy"] = (probs * np.log2(probs)).sum() + f["col_entropy"] = -(probs * np.log2(probs)).sum() # Fraction of cells with unique content num_unique = len(freq_dist)