Rewrite #29

Open · wants to merge 2 commits into master
1 change: 1 addition & 0 deletions .gitignore
@@ -74,6 +74,7 @@ target/
 
 # Jupyter Notebook
 .ipynb_checkpoints
+*.ipynb
 
 # pyenv
 .python-version
51 changes: 9 additions & 42 deletions requirements.txt
@@ -1,43 +1,10 @@
-absl-py==0.7.1
-astor==0.8.0
-boto==2.49.0
-boto3==1.9.188
-botocore==1.12.188
-certifi==2019.6.16
-chardet==3.0.4
-docutils==0.14
-gast==0.2.2
-gensim==3.8.0
-google-pasta==0.1.7
+gensim>=3.8.0
 googledrivedownloader==0.4
-grpcio==1.22.0
-h5py==2.9.0
-idna==2.8
-jmespath==0.9.4
-joblib==0.13.2
-Keras-Applications==1.0.8
-Keras-Preprocessing==1.1.0
-Markdown==3.1.1
-nltk==3.4.4
-matplotlib==2.2.5
-numpy==1.16.1
-pandas==0.24.2
-pre-commit==2.8.2
-protobuf==3.9.0
-pyarrow==2.0.0
-python-dateutil==2.8.0
-pytz==2019.1
-requests==2.22.0
-s3transfer==0.2.1
-scikit-learn==0.20
-six==1.12.0
-smart-open==1.8.4
-tensorboard==1.14.0
-tensorflow==1.14.0
-tensorflow-estimator==1.14.0
-termcolor==1.1.0
-tqdm==4.51.0
-urllib3==1.25.3
-Werkzeug==0.15.4
-wrapt==1.11.2
--e .
+joblib>=0.13.2
+nltk>=3.4.4
+numpy~=1.21.0
+pandas~=1.2.5
+pre-commit~=2.8.2
+scikit-learn~=0.24.2
+tensorflow>=1.14.0
+scipy~=1.7.0
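
For reviewers unfamiliar with the looser specifiers used here: "~=" is a compatible-release pin that allows patch-level upgrades only, while ">=" sets just a floor. A quick sketch using the packaging library (not a dependency of this PR; shown only for illustration):

from packaging.specifiers import SpecifierSet

# Compatible release: ~=1.21.0 is equivalent to >=1.21.0, ==1.21.*
numpy_spec = SpecifierSet("~=1.21.0")
print("1.21.5" in numpy_spec)  # True: patch upgrades are allowed
print("1.22.0" in numpy_spec)  # False: minor bumps are excluded

# Plain floor: anything at or above 1.14.0 is accepted
tensorflow_spec = SpecifierSet(">=1.14.0")
print("2.5.0" in tensorflow_spec)  # True
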
4 changes: 2 additions & 2 deletions setup.py
@@ -8,5 +8,5 @@
     description="Package for semantic type detection using Sherlock",
     url="https://github.com/mitmedialab/sherlock-project",
     packages=setuptools.find_packages(),
-    package_dir={"sherlock": "sherlock"}
-)
+    package_dir={"sherlock": "sherlock"},
+)
5 changes: 5 additions & 0 deletions sherlock/__init__.py
@@ -0,0 +1,5 @@
from sherlock.defaults import *
from sherlock.features import *
from sherlock.helpers import *
from sherlock.model import *
from sherlock.training import *
File renamed without changes.
File renamed without changes.
94 changes: 94 additions & 0 deletions sherlock/defaults.py
@@ -0,0 +1,94 @@
import os
from collections import OrderedDict
from itertools import chain
from typing import Optional

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import model_from_json

from sherlock import make_data_path

DEFAULT_FEATURE_ORDER = ["char", "word", "par", "rest"]


def default_features() -> OrderedDict:
    """Get feature identifiers per feature set, to map features to feature sets.

    Returns
    -------
    feature_cols_dict
        Dictionary with lists of feature identifiers per feature set.
    """
    feature_cols_dict = OrderedDict()
    feature_path = make_data_path("feature_column_identifiers")

    for feature_set in DEFAULT_FEATURE_ORDER:
        feature_file = os.path.join(feature_path, f"{feature_set}_col.tsv")
        feature_data = pd.read_csv(
            feature_file, sep="\t", index_col=0, header=None, squeeze=True
        )
        feature_cols_dict[feature_set] = feature_data.to_list()
    return feature_cols_dict


def default_encoder():
    encoder = LabelEncoder()
    class_file_path = make_data_path("classes_sherlock.npy")
    encoder.classes_ = np.load(class_file_path, allow_pickle=True)
    return encoder


def construct_model(
    model_path: Optional[str] = None,
    weight_path: Optional[str] = None,
    with_weights: bool = True,
):
    """Load model architecture and populate with pretrained weights.

    Parameters
    ----------
    model_path
        Location of the model file.
    weight_path
        Location of the weight file.
    with_weights
        Whether to populate the model with trained weights.

    Returns
    -------
    model
        Compiled model.
    callbacks
        Callback configuration for model retraining.
    """
    if model_path is None:
        model_path = make_data_path("sherlock_model.json")

    if weight_path is None:
        weight_path = make_data_path("sherlock_weights.h5")

    with open(model_path, "r") as model_file:
        model = model_from_json(model_file.read())

    if with_weights:
        model.load_weights(weight_path)

    learning_rate = 0.0001
    callbacks = [EarlyStopping(monitor="val_loss", patience=5)]
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss="categorical_crossentropy",
        metrics=["categorical_accuracy"],
    )

    return model, callbacks


DEFAULT_FEATURES_DICT: OrderedDict = default_features()
DEFAULT_FEATURES = list(chain(*[cols for cols in DEFAULT_FEATURES_DICT.values()]))
DEFAULT_ENCODER = default_encoder()
DEFAULT_MODEL, DEFAULT_CALLBACKS = construct_model()
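
For context, a minimal sketch of how these new module-level defaults are consumed downstream. It assumes the pretrained model JSON, weight file, and class/feature files resolved by make_data_path have already been downloaded; importing the module builds DEFAULT_MODEL once as a side effect:

from sherlock.defaults import (DEFAULT_ENCODER, DEFAULT_FEATURES,
                               construct_model)

# construct_model() can also be called directly, e.g. without weights
# when retraining from scratch.
model, callbacks = construct_model(with_weights=False)

# DEFAULT_FEATURES flattens the char/word/par/rest feature sets into the
# column order the model expects.
print(len(DEFAULT_FEATURES))

# The label encoder maps the model's argmax indices back to semantic types.
print(DEFAULT_ENCODER.classes_[:5])
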
Empty file removed sherlock/deploy/__init__.py
58 changes: 0 additions & 58 deletions sherlock/deploy/model_helpers.py

This file was deleted.

61 changes: 0 additions & 61 deletions sherlock/deploy/predict_sherlock.py

This file was deleted.

3 changes: 3 additions & 0 deletions sherlock/features/__init__.py
@@ -0,0 +1,3 @@
from sherlock.features import (bag_of_characters, bag_of_words,
                               paragraph_vectors, preprocessing,
                               word_embeddings)
45 changes: 19 additions & 26 deletions sherlock/features/bag_of_characters.py
@@ -1,41 +1,34 @@
 import string
-import numpy as np
-from scipy.stats import skew, kurtosis
 from collections import OrderedDict
 
+import numpy as np
+from scipy.stats import kurtosis, skew
+
+ignore_chars = {"\n", "\\", "\v", "\r", "\t", "^"}
+characters_to_check = [f"[{c}]" for c in string.printable if c not in ignore_chars]
+characters_to_check.extend(["[\\\\]", "[\^]"])
+
+
 # Input: a single column in the form of pandas series
 # Output: ordered dictionary holding bag of character features
 def extract_bag_of_characters_features(data):
-
-    characters_to_check = (
-        ['['+ c + ']' for c in string.printable if c not in ('\n', '\\', '\v', '\r', '\t', '^')]
-        + ['[\\\\]', '[\^]']
-    )
-
     f = OrderedDict()
 
-    data_no_null = data.dropna()
     all_value_features = OrderedDict()
 
     for c in characters_to_check:
-        all_value_features['n_{}'.format(c)] = data_no_null.str.count(c)
+        all_value_features[f"n_{c}"] = data.str.count(c)
 
     for value_feature_name, value_features in all_value_features.items():
-        f['{}-agg-any'.format(value_feature_name)] = any(value_features)
-        f['{}-agg-all'.format(value_feature_name)] = all(value_features)
-        f['{}-agg-mean'.format(value_feature_name)] = np.mean(value_features)
-        f['{}-agg-var'.format(value_feature_name)] = np.var(value_features)
-        f['{}-agg-min'.format(value_feature_name)] = np.min(value_features)
-        f['{}-agg-max'.format(value_feature_name)] = np.max(value_features)
-        f['{}-agg-median'.format(value_feature_name)] = np.median(value_features)
-        f['{}-agg-sum'.format(value_feature_name)] = np.sum(value_features)
-        f['{}-agg-kurtosis'.format(value_feature_name)] = kurtosis(value_features)
-        f['{}-agg-skewness'.format(value_feature_name)] = skew(value_features)
+        f[f"{value_feature_name}-agg-any"] = any(value_features)
+        f[f"{value_feature_name}-agg-all"] = all(value_features)
+        f[f"{value_feature_name}-agg-mean"] = np.mean(value_features)
+        f[f"{value_feature_name}-agg-var"] = np.var(value_features)
+        f[f"{value_feature_name}-agg-min"] = np.min(value_features)
+        f[f"{value_feature_name}-agg-max"] = np.max(value_features)
+        f[f"{value_feature_name}-agg-median"] = np.median(value_features)
+        f[f"{value_feature_name}-agg-sum"] = np.sum(value_features)
+        f[f"{value_feature_name}-agg-kurtosis"] = kurtosis(value_features)
+        f[f"{value_feature_name}-agg-skewness"] = skew(value_features)
 
     return f
-
-
-
-
-
-
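
Finally, a toy-data sketch of the rewritten feature extractor above (the values are made up; note that the rewrite drops the old dropna() call, so the input series is assumed to contain no nulls):

import pandas as pd

from sherlock.features.bag_of_characters import extract_bag_of_characters_features

# A toy "column" of values, passed as a pandas Series of strings.
column = pd.Series(["New York", "Boston", "San Francisco"])

features = extract_bag_of_characters_features(column)

# One group of aggregates per printable character, e.g. the mean count of
# spaces across the column's values: two of the three contain one space.
print(features["n_[ ]-agg-mean"])  # ~0.667
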