Rewrite #29

Open · wants to merge 2 commits into master
1 change: 1 addition & 0 deletions .gitignore
@@ -74,6 +74,7 @@ target/
 
 # Jupyter Notebook
 .ipynb_checkpoints
+*.ipynb
 
 # pyenv
 .python-version
51 changes: 9 additions & 42 deletions requirements.txt
@@ -1,43 +1,10 @@
-absl-py==0.7.1
-astor==0.8.0
-boto==2.49.0
-boto3==1.9.188
-botocore==1.12.188
-certifi==2019.6.16
-chardet==3.0.4
-docutils==0.14
-gast==0.2.2
-gensim==3.8.0
-google-pasta==0.1.7
+gensim>=3.8.0
 googledrivedownloader==0.4
-grpcio==1.22.0
-h5py==2.9.0
-idna==2.8
-jmespath==0.9.4
-joblib==0.13.2
-Keras-Applications==1.0.8
-Keras-Preprocessing==1.1.0
-Markdown==3.1.1
-nltk==3.4.4
-matplotlib==2.2.5
-numpy==1.16.1
-pandas==0.24.2
-pre-commit==2.8.2
-protobuf==3.9.0
-pyarrow==2.0.0
-python-dateutil==2.8.0
-pytz==2019.1
-requests==2.22.0
-s3transfer==0.2.1
-scikit-learn==0.20
-six==1.12.0
-smart-open==1.8.4
-tensorboard==1.14.0
-tensorflow==1.14.0
-tensorflow-estimator==1.14.0
-termcolor==1.1.0
-tqdm==4.51.0
-urllib3==1.25.3
-Werkzeug==0.15.4
-wrapt==1.11.2
--e .
+joblib>=0.13.2
+nltk>=3.4.4
+numpy~=1.21.0
+pandas~=1.2.5
+pre-commit~=2.8.2
+scikit-learn~=0.24.2
+tensorflow>=1.14.0
+scipy~=1.7.0
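
For reviewers unfamiliar with the looser specifiers used here: "~=" is a compatible-release pin that allows patch-level upgrades only, while ">=" sets just a floor. A quick sketch using the packaging library (not a dependency of this PR; shown only for illustration):

from packaging.specifiers import SpecifierSet

# Compatible release: ~=1.21.0 is equivalent to >=1.21.0, ==1.21.*
numpy_spec = SpecifierSet("~=1.21.0")
print("1.21.5" in numpy_spec)  # True: patch upgrades are allowed
print("1.22.0" in numpy_spec)  # False: minor bumps are excluded

# Plain floor: anything at or above 1.14.0 is accepted
tensorflow_spec = SpecifierSet(">=1.14.0")
print("2.5.0" in tensorflow_spec)  # True
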
4 changes: 2 additions & 2 deletions setup.py
@@ -8,5 +8,5 @@
     description="Package for semantic type detection using Sherlock",
     url="https://github.com/mitmedialab/sherlock-project",
     packages=setuptools.find_packages(),
-    package_dir={"sherlock": "sherlock"}
-)
+    package_dir={"sherlock": "sherlock"},
+)
5 changes: 5 additions & 0 deletions sherlock/__init__.py
@@ -0,0 +1,5 @@
from sherlock.defaults import *
from sherlock.features import *
from sherlock.helpers import *
from sherlock.model import *
from sherlock.training import *
File renamed without changes.
File renamed without changes.
94 changes: 94 additions & 0 deletions sherlock/defaults.py
@@ -0,0 +1,94 @@
import os
from collections import OrderedDict
from itertools import chain
from typing import Optional

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import model_from_json

from sherlock import make_data_path

DEFAULT_FEATURE_ORDER = ["char", "word", "par", "rest"]


def default_features() -> OrderedDict:
    """Get feature identifiers per feature set, to map features to feature sets.

    Returns
    -------
    feature_cols_dict
        Dictionary with lists of feature identifiers per feature set.
    """
    feature_cols_dict = OrderedDict()
    feature_path = make_data_path("feature_column_identifiers")

    for feature_set in DEFAULT_FEATURE_ORDER:
        feature_file = os.path.join(feature_path, f"{feature_set}_col.tsv")
        feature_data = pd.read_csv(
            feature_file, sep="\t", index_col=0, header=None, squeeze=True
        )
        feature_cols_dict[feature_set] = feature_data.to_list()
    return feature_cols_dict


def default_encoder():
    encoder = LabelEncoder()
    class_file_path = make_data_path("classes_sherlock.npy")
    encoder.classes_ = np.load(class_file_path, allow_pickle=True)
    return encoder


def construct_model(
    model_path: Optional[str] = None,
    weight_path: Optional[str] = None,
    with_weights: bool = True,
):
    """Load model architecture and populate with pretrained weights.

    Parameters
    ----------
    model_path
        Location of the model file.
    weight_path
        Location of the weight file.
    with_weights
        Whether to populate the model with trained weights.

    Returns
    -------
    model
        Compiled model.
    callbacks
        Callback configuration for model retraining.
    """
    if model_path is None:
        model_path = make_data_path("sherlock_model.json")

    if weight_path is None:
        weight_path = make_data_path("sherlock_weights.h5")

    with open(model_path, "r") as model_file:
        model = model_from_json(model_file.read())

    if with_weights:
        model.load_weights(weight_path)

    learning_rate = 0.0001
    callbacks = [EarlyStopping(monitor="val_loss", patience=5)]
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss="categorical_crossentropy",
        metrics=["categorical_accuracy"],
    )

    return model, callbacks


DEFAULT_FEATURES_DICT: OrderedDict = default_features()
DEFAULT_FEATURES = list(chain(*[cols for cols in DEFAULT_FEATURES_DICT.values()]))
DEFAULT_ENCODER = default_encoder()
DEFAULT_MODEL, DEFAULT_CALLBACKS = construct_model()
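
For context, a minimal sketch of how these new module-level defaults are consumed downstream. It assumes the pretrained model JSON, weight file, and class/feature files resolved by make_data_path have already been downloaded; importing the module builds DEFAULT_MODEL once as a side effect:

from sherlock.defaults import (DEFAULT_ENCODER, DEFAULT_FEATURES,
                               construct_model)

# construct_model() can also be called directly, e.g. without weights
# when retraining from scratch.
model, callbacks = construct_model(with_weights=False)

# DEFAULT_FEATURES flattens the char/word/par/rest feature sets into the
# column order the model expects.
print(len(DEFAULT_FEATURES))

# The label encoder maps the model's argmax indices back to semantic types.
print(DEFAULT_ENCODER.classes_[:5])
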
Empty file removed sherlock/deploy/__init__.py
58 changes: 0 additions & 58 deletions sherlock/deploy/model_helpers.py

This file was deleted.

61 changes: 0 additions & 61 deletions sherlock/deploy/predict_sherlock.py

This file was deleted.

3 changes: 3 additions & 0 deletions sherlock/features/__init__.py
@@ -0,0 +1,3 @@
from sherlock.features import (bag_of_characters, bag_of_words,
                               paragraph_vectors, preprocessing,
                               word_embeddings)
45 changes: 19 additions & 26 deletions sherlock/features/bag_of_characters.py
@@ -1,41 +1,34 @@
 import string
-import numpy as np
-from scipy.stats import skew, kurtosis
 from collections import OrderedDict
 
+import numpy as np
+from scipy.stats import kurtosis, skew
+
+ignore_chars = {"\n", "\\", "\v", "\r", "\t", "^"}
+characters_to_check = [f"[{c}]" for c in string.printable if c not in ignore_chars]
+characters_to_check.extend(["[\\\\]", "[\^]"])
+
+
 # Input: a single column in the form of pandas series
 # Output: ordered dictionary holding bag of character features
 def extract_bag_of_characters_features(data):
-
-    characters_to_check = (
-        ['['+ c + ']' for c in string.printable if c not in ('\n', '\\', '\v', '\r', '\t', '^')]
-        + ['[\\\\]', '[\^]']
-    )
-
     f = OrderedDict()
 
-    data_no_null = data.dropna()
     all_value_features = OrderedDict()
 
     for c in characters_to_check:
-        all_value_features['n_{}'.format(c)] = data_no_null.str.count(c)
+        all_value_features[f"n_{c}"] = data.str.count(c)
 
     for value_feature_name, value_features in all_value_features.items():
-        f['{}-agg-any'.format(value_feature_name)] = any(value_features)
-        f['{}-agg-all'.format(value_feature_name)] = all(value_features)
-        f['{}-agg-mean'.format(value_feature_name)] = np.mean(value_features)
-        f['{}-agg-var'.format(value_feature_name)] = np.var(value_features)
-        f['{}-agg-min'.format(value_feature_name)] = np.min(value_features)
-        f['{}-agg-max'.format(value_feature_name)] = np.max(value_features)
-        f['{}-agg-median'.format(value_feature_name)] = np.median(value_features)
-        f['{}-agg-sum'.format(value_feature_name)] = np.sum(value_features)
-        f['{}-agg-kurtosis'.format(value_feature_name)] = kurtosis(value_features)
-        f['{}-agg-skewness'.format(value_feature_name)] = skew(value_features)
+        f[f"{value_feature_name}-agg-any"] = any(value_features)
+        f[f"{value_feature_name}-agg-all"] = all(value_features)
+        f[f"{value_feature_name}-agg-mean"] = np.mean(value_features)
+        f[f"{value_feature_name}-agg-var"] = np.var(value_features)
+        f[f"{value_feature_name}-agg-min"] = np.min(value_features)
+        f[f"{value_feature_name}-agg-max"] = np.max(value_features)
+        f[f"{value_feature_name}-agg-median"] = np.median(value_features)
+        f[f"{value_feature_name}-agg-sum"] = np.sum(value_features)
+        f[f"{value_feature_name}-agg-kurtosis"] = kurtosis(value_features)
+        f[f"{value_feature_name}-agg-skewness"] = skew(value_features)
 
     return f
-
-
-
-
-
-
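
Finally, a toy-data sketch of the rewritten feature extractor above (the values are made up; note that the rewrite drops the old dropna() call, so the input series is assumed to contain no nulls):

import pandas as pd

from sherlock.features.bag_of_characters import extract_bag_of_characters_features

# A toy "column" of values, passed as a pandas Series of strings.
column = pd.Series(["New York", "Boston", "San Francisco"])

features = extract_bag_of_characters_features(column)

# One group of aggregates per printable character, e.g. the mean count of
# spaces across the column's values: two of the three contain one space.
print(features["n_[ ]-agg-mean"])  # ~0.667
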