Skip to content

Commit

Permalink
Revert "Merge pull request #426 from cisco/vidamoda/update-sklearn"
Browse files Browse the repository at this point in the history
This reverts commit f158106, reversing
changes made to 76963ea.

A bug fix intended for the 4.6.0 release was inadvertently included in a subsequent PR.
This commit reverts PR #426 so that the fix can be released on its own as 4.6.1,
without including the rest of PR #426.
  • Loading branch information
snow0x2d0 committed Aug 2, 2022
1 parent f158106 commit 834e23f
Show file tree
Hide file tree
Showing 12 changed files with 23 additions and 30 deletions.
4 changes: 2 additions & 2 deletions mindmeld/components/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -783,8 +783,8 @@ def get_classifier_config(
copy.deepcopy(getattr(module_conf, attr_name)))
except AttributeError:
try:
result = merge_param_configs(_get_default_classifier_config(clf_type), copy.deepcopy(
getattr(module_conf, CONFIG_DEPRECATION_MAPPING[attr_name]))
result = copy.deepcopy(
getattr(module_conf, CONFIG_DEPRECATION_MAPPING[attr_name])
)
msg = (
"%s config is deprecated. Please use the equivalent %s config "
Expand Down
5 changes: 3 additions & 2 deletions mindmeld/components/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,8 +403,6 @@ def _get_model_config(loaded_config=None, **kwargs):
Returns:
ModelConfig: The model configuration corresponding to the provided config name
"""
if 'params' in loaded_config and 'params' in kwargs:
kwargs['params'] = {**loaded_config['params'], **kwargs['params']}
try:
# If all params required for model config were passed in, use kwargs
return ModelConfig(**kwargs)
Expand All @@ -413,6 +411,9 @@ def _get_model_config(loaded_config=None, **kwargs):
if not loaded_config:
logger.warning("loaded_config is not passed in")
model_config = loaded_config or {}
if 'params' in model_config and 'params' in kwargs:
kwargs['params'].update(model_config['params'])

model_config.update(kwargs)

return ModelConfig(**model_config)
Expand Down
2 changes: 1 addition & 1 deletion mindmeld/components/entity_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import logging
import pickle

import joblib
from sklearn.externals import joblib

from ._config import get_classifier_config
from .classifier import Classifier, ClassifierConfig, ClassifierLoadError
Expand Down
2 changes: 1 addition & 1 deletion mindmeld/components/role_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import logging
import pickle

import joblib
from sklearn.externals import joblib

from ._config import get_classifier_config
from .classifier import Classifier, ClassifierConfig, ClassifierLoadError
Expand Down
4 changes: 2 additions & 2 deletions mindmeld/models/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from inspect import signature
from typing import Union, Type, Dict, Any, Tuple, List, Pattern, Set

import joblib
from sklearn.externals import joblib
from sklearn.model_selection import (
GridSearchCV,
GroupKFold,
Expand Down Expand Up @@ -486,7 +486,7 @@ def _fit_cv(self, examples, labels, groups=None, selection_settings=None, fixed_
n_jobs=n_jobs,
return_train_score=False,
)
model = grid_cv.fit(examples, y=labels, groups=groups)
model = grid_cv.fit(examples, labels, groups)

for idx, params in enumerate(model.cv_results_["params"]):
logger.debug("Candidate parameters: %s", params)
Expand Down
2 changes: 1 addition & 1 deletion mindmeld/models/tagger_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import os
import random

import joblib
from sklearn.externals import joblib

from .evaluation import EntityModelEvaluation, EvaluatedExample
from .helpers import (
Expand Down
2 changes: 1 addition & 1 deletion mindmeld/models/taggers/lstm.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
import os
import re

import joblib
import numpy as np
import tensorflow as tf
from sklearn.externals import joblib
from sklearn.preprocessing import LabelBinarizer

from .embeddings import CharacterSequenceEmbedding, WordSequenceEmbedding
Expand Down
6 changes: 3 additions & 3 deletions mindmeld/models/taggers/memm.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def _predict_proba_example(self, example, config, resources):
X, _ = self._preprocess_data([features])
prediction = self._clf.predict_proba(X)[0]
predicted_tag = np.argmax(prediction)
prev_tag = self.class_encoder.inverse_transform([predicted_tag])[0]
prev_tag = self.class_encoder.inverse_transform(predicted_tag)
seq_log_probs.append([prev_tag, prediction[predicted_tag]])
return seq_log_probs

Expand All @@ -175,7 +175,7 @@ def _predict_proba_distribution_example(self, example, config, resources):
predictions.append(list(prediction))
tag_maps.append(
[
self.class_encoder.inverse_transform([i])[0] for i in range(len(prediction))
self.class_encoder.inverse_transform(i) for i in range(len(prediction))
]
)
return [tag_maps, predictions]
Expand All @@ -190,7 +190,7 @@ def _get_feature_selector(selector_type):
given the full feature matrix, X and the class labels, y.
"""
selector = {
"l1": SelectFromModel(LogisticRegression(penalty="l1", C=1, solver="liblinear")),
"l1": SelectFromModel(LogisticRegression(penalty="l1", C=1)),
"f": SelectPercentile(),
}.get(selector_type)
return selector
Expand Down
16 changes: 4 additions & 12 deletions mindmeld/models/taggers/pytorch_crf.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def get_tensor_data(self, feat_dicts, labels=None, fit=False):
if isinstance(self.feat_extractor, DictVectorizer):
flattened_feat_dicts = list(chain.from_iterable(feat_dicts))
self.feat_extractor.fit(flattened_feat_dicts)
self.num_feats = len(self.feat_extractor.get_feature_names_out())
self.num_feats = len(self.feat_extractor.get_feature_names())
if labels is not None:
flattened_labels = list(chain.from_iterable(labels))
self.label_encoder.fit(flattened_labels)
Expand Down Expand Up @@ -294,16 +294,8 @@ def save_best_weights_path(self, path):
else:
raise MindMeldError("CRF weights not saved. Please re-train model from scratch.")

def validate_params(self, kwargs):
def validate_params(self):
"""Validate the argument values saved into the CRF model. """
for key in kwargs:
msg = (
"Unexpected param `{param}`, dropping it from model config.".format(
param=key
)
)
logger.warning(msg)

if self.optimizer not in ["sgd", "adam"]:
raise MindMeldError(
f"Optimizer type {self.optimizer_type} not supported. Supported options are ['sgd', 'adam']")
Expand Down Expand Up @@ -439,7 +431,7 @@ def compute_marginal_probabilities(self, inputs, mask):
# pylint: disable=too-many-arguments
def set_params(self, feat_type="hash", feat_num=50000, stratify_train_val_split=True, drop_input=0.2, batch_size=8,
number_of_epochs=100, patience=3, dev_split_ratio=0.2, optimizer="sgd",
random_state=None, **kwargs):
random_state=None):
"""Set the parameters for the PyTorch CRF model and also validates the parameters.
Args:
Expand Down Expand Up @@ -467,7 +459,7 @@ def set_params(self, feat_type="hash", feat_num=50000, stratify_train_val_split=
self.optimizer = optimizer # ["sgd", "adam"]
self.random_state = random_state or randint(1, 10000001)

self.validate_params(kwargs)
self.validate_params()

logger.debug("Random state for torch-crf is %s", self.random_state)
if self.feat_type == "dict":
Expand Down
4 changes: 2 additions & 2 deletions mindmeld/models/text_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
import os
import random

import joblib
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectFromModel, SelectPercentile
from sklearn.linear_model import LogisticRegression
Expand Down Expand Up @@ -371,7 +371,7 @@ def _get_feature_selector(self):
else:
selector_type = self.config.model_settings.get("feature_selector")
selector = {
"l1": SelectFromModel(LogisticRegression(penalty="l1", C=1, solver="liblinear")),
"l1": SelectFromModel(LogisticRegression(penalty="l1", C=1)),
"f": SelectPercentile(),
}.get(selector_type)
return selector
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"pytz", # uses calendar versioning
"scipy>=0.13.3,<2.0",
'scikit-learn>=0.18.1,<0.20; python_version < "3.7"',
'scikit-learn~=1.0.2; python_version >= "3.7"',
'scikit-learn>=0.19.2,<0.20; python_version >= "3.7"',
"requests>=2.20.1,<3.0",
"tqdm~=4.15",
'python-crfsuite~=0.9; python_version < "3.7"',
Expand Down
4 changes: 2 additions & 2 deletions tests/models/test_tagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,7 @@ def test_get_boundary_counts_sequential(

@pytest.mark.parametrize(
"model_type,params",
[("memm", {"penalty": "l2", "C": 10000}),
("torch-crf", {"feat_type": "dict"}),
[("memm", {"penalty": "l2", "C": 10000}), ("crf", {"c1": 0.01, "c2": 0.01}), ("torch-crf", {"feat_type": "dict"}),
("torch-crf", {"feat_type": "hash"})],
)
def test_view_extracted_features(kwik_e_mart_nlp, model_type, params):
Expand Down Expand Up @@ -312,6 +311,7 @@ def test_view_extracted_features(kwik_e_mart_nlp, model_type, params):
"query,model_type,params",
[
("Main st store hours", "memm", {"penalty": "l2", "C": 10000}),
("Main st store hours", "crf", {"c1": 0.01, "c2": 0.01}),
("Main st store hours", "torch-crf", {"feat_type": "dict"}),
("Main st store hours", "torch-crf", {"feat_type": "hash"})
],
Expand Down

0 comments on commit 834e23f

Please sign in to comment.