From f04a57e09f34ae86b3abc7b59c01daa98d23647b Mon Sep 17 00:00:00 2001
From: Forest Gregg <fgregg@datamade.us>
Date: Thu, 26 Sep 2024 11:43:44 -0400
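Subject: [PATCH] Lint cleanup: drop Python 2 csv shims, fix undefined names in main.py, delete unused utils.py, reformat docs and tests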

---
 .flake8                       |   4 +-
 docs/conf.py                  | 151 ++++++++++++++++++----------------
 parserator/data_prep_utils.py |   8 +-
 parserator/main.py            |   8 +-
 parserator/manual_labeling.py |  10 ---
 parserator/parser_template.py |  36 ++++----
 parserator/utils.py           | 106 ------------------------
 tests/test_xml.py             |  47 +++++++----
 8 files changed, 136 insertions(+), 234 deletions(-)
 delete mode 100644 parserator/utils.py

diff --git a/.flake8 b/.flake8
index 7350ce3..eaf739f 100644
--- a/.flake8
+++ b/.flake8
@@ -1,3 +1,3 @@
 [flake8]
-max-line-length=160
-extend-ignore = E203
\ No newline at end of file
+max-line-length=232
+extend-ignore = E203
diff --git a/docs/conf.py b/docs/conf.py
index d9a4c5d..e46e105 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -11,19 +11,16 @@
 #
 # All configuration values have a default; values that are commented out
 # serve to show the default.
-
-import sys
-import os
-
+#
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.insert(0, os.path.abspath('.'))
+# sys.path.insert(0, os.path.abspath('.'))
 
 # -- General configuration ------------------------------------------------
 
 # If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
+# needs_sphinx = '1.0'
 
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
@@ -31,194 +28,197 @@
 extensions = []
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # The suffix of source filenames.
-source_suffix = '.rst'
+source_suffix = ".rst"
 
 # The encoding of source files.
-#source_encoding = 'utf-8-sig'
+# source_encoding = 'utf-8-sig'
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"
 
 # General information about the project.
-project = u'parserator'
-copyright = u'2015, Cathy Deng, Forest Gregg'
+project = "parserator"
+copyright = "2015, Cathy Deng, Forest Gregg"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The short X.Y version.
-version = '0.4.1'
+version = "0.4.1"
 # The full version, including alpha/beta/rc tags.
-release = '0.4.1'
+release = "0.4.1"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
-#language = None
+# language = None
 
 # There are two options for replacing |today|: either, you set today to some
 # non-false value, then it is used:
-#today = ''
+# today = ''
 # Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+# today_fmt = '%B %d, %Y'
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ["_build"]
 
 # The reST default role (used for this markup: `text`) to use for all
 # documents.
-#default_role = None
+# default_role = None
 
 # If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
+# add_function_parentheses = True
 
 # If true, the current module name will be prepended to all description
 # unit titles (such as .. function::).
-#add_module_names = True
+# add_module_names = True
 
 # If true, sectionauthor and moduleauthor directives will be shown in the
 # output. They are ignored by default.
-#show_authors = False
+# show_authors = False
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
 
 # A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
+# modindex_common_prefix = []
 
 # If true, keep warnings as "system message" paragraphs in the built documents.
-#keep_warnings = False
+# keep_warnings = False
 
 
 # -- Options for HTML output ----------------------------------------------
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-html_theme = 'default'
+html_theme = "default"
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
 # documentation.
-#html_theme_options = {}
+# html_theme_options = {}
 
 # Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
+# html_theme_path = []
 
 # The name for this set of Sphinx documents.  If None, it defaults to
 # "<project> v<release> documentation".
-#html_title = None
+# html_title = None
 
 # A shorter title for the navigation bar.  Default is the same as html_title.
-#html_short_title = None
+# html_short_title = None
 
 # The name of an image file (relative to this directory) to place at the top
 # of the sidebar.
-#html_logo = None
+# html_logo = None
 
 # The name of an image file (within the static path) to use as favicon of the
 # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
 # pixels large.
-#html_favicon = None
+# html_favicon = None
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
 # Add any extra paths that contain custom files (such as robots.txt or
 # .htaccess) here, relative to this directory. These files are copied
 # directly to the root of the documentation.
-#html_extra_path = []
+# html_extra_path = []
 
 # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
 # using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
+# html_last_updated_fmt = '%b %d, %Y'
 
 # If true, SmartyPants will be used to convert quotes and dashes to
 # typographically correct entities.
-#html_use_smartypants = True
+# html_use_smartypants = True
 
 # Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+# html_sidebars = {}
 
 # Additional templates that should be rendered to pages, maps page names to
 # template names.
-#html_additional_pages = {}
+# html_additional_pages = {}
 
 # If false, no module index is generated.
-#html_domain_indices = True
+# html_domain_indices = True
 
 # If false, no index is generated.
-#html_use_index = True
+# html_use_index = True
 
 # If true, the index is split into individual pages for each letter.
-#html_split_index = False
+# html_split_index = False
 
 # If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
+# html_show_sourcelink = True
 
 # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
+# html_show_sphinx = True
 
 # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-#html_show_copyright = True
+# html_show_copyright = True
 
 # If true, an OpenSearch description file will be output, and all pages will
 # contain a <link> tag referring to it.  The value of this option must be the
 # base URL from which the finished HTML is served.
-#html_use_opensearch = ''
+# html_use_opensearch = ''
 
 # This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
+# html_file_suffix = None
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'parseratordoc'
+htmlhelp_basename = "parseratordoc"
 
 
 # -- Options for LaTeX output ---------------------------------------------
 
 latex_elements = {
-# The paper size ('letterpaper' or 'a4paper').
-#'papersize': 'letterpaper',
-
-# The font size ('10pt', '11pt' or '12pt').
-#'pointsize': '10pt',
-
-# Additional stuff for the LaTeX preamble.
-#'preamble': '',
+    # The paper size ('letterpaper' or 'a4paper').
+    # 'papersize': 'letterpaper',
+    # The font size ('10pt', '11pt' or '12pt').
+    # 'pointsize': '10pt',
+    # Additional stuff for the LaTeX preamble.
+    # 'preamble': '',
 }
 
 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-  ('index', 'parserator.tex', u'parserator Documentation',
-   u'Cathy Deng, Forest Gregg', 'manual'),
+    (
+        "index",
+        "parserator.tex",
+        "parserator Documentation",
+        "Cathy Deng, Forest Gregg",
+        "manual",
+    ),
 ]
 
 # The name of an image file (relative to this directory) to place at the top of
 # the title page.
-#latex_logo = None
+# latex_logo = None
 
 # For "manual" documents, if this is true, then toplevel headings are parts,
 # not chapters.
-#latex_use_parts = False
+# latex_use_parts = False
 
 # If true, show page references after internal links.
-#latex_show_pagerefs = False
+# latex_show_pagerefs = False
 
 # If true, show URL addresses after external links.
-#latex_show_urls = False
+# latex_show_urls = False
 
 # Documents to append as an appendix to all manuals.
-#latex_appendices = []
+# latex_appendices = []
 
 # If false, no module index is generated.
-#latex_domain_indices = True
+# latex_domain_indices = True
 
 
 # -- Options for manual page output ---------------------------------------
@@ -226,12 +226,11 @@
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    ('index', 'parserator', u'parserator Documentation',
-     [u'Cathy Deng, Forest Gregg'], 1)
+    ("index", "parserator", "parserator Documentation", ["Cathy Deng, Forest Gregg"], 1)
 ]
 
 # If true, show URL addresses after external links.
-#man_show_urls = False
+# man_show_urls = False
 
 
 # -- Options for Texinfo output -------------------------------------------
@@ -240,19 +239,25 @@
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-  ('index', 'parserator', u'parserator Documentation',
-   u'Cathy Deng, Forest Gregg', 'parserator', 'One line description of project.',
-   'Miscellaneous'),
+    (
+        "index",
+        "parserator",
+        "parserator Documentation",
+        "Cathy Deng, Forest Gregg",
+        "parserator",
+        "One line description of project.",
+        "Miscellaneous",
+    ),
 ]
 
 # Documents to append as an appendix to all manuals.
-#texinfo_appendices = []
+# texinfo_appendices = []
 
 # If false, no module index is generated.
-#texinfo_domain_indices = True
+# texinfo_domain_indices = True
 
 # How to display URL addresses: 'footnote', 'no', or 'inline'.
-#texinfo_show_urls = 'footnote'
+# texinfo_show_urls = 'footnote'
 
 # If true, do not generate a @detailmenu in the "Top" node's menu.
-#texinfo_no_detailmenu = False
+# texinfo_no_detailmenu = False
diff --git a/parserator/data_prep_utils.py b/parserator/data_prep_utils.py
index 5a17f41..8ae9f1b 100644
--- a/parserator/data_prep_utils.py
+++ b/parserator/data_prep_utils.py
@@ -1,14 +1,8 @@
 #!/usr/bin/python
 
-import os
-import sys
-
 from lxml import etree
 
-if sys.version < "3":
-    from backports import csv
-else:
-    import csv
+import csv
 
 
 class TrainingData:
diff --git a/parserator/main.py b/parserator/main.py
index 44cd815..df224c7 100644
--- a/parserator/main.py
+++ b/parserator/main.py
@@ -146,8 +146,8 @@ def file_type(arg):
     try:
         f = open(arg, "rb")
     except OSError as e:
-        message = _("can't open '%s': %s")
-        raise ArgumentTypeError(message % (arg, e))
+        message = "can't open '%s': %s"
+        raise argparse.ArgumentTypeError(message % (arg, e))
     else:
         detector = chardet.universaldetector.UniversalDetector()
 
@@ -216,7 +216,9 @@ def __call__(self, parser, namespace, model_file, option_string):
                 msg = """
                       Invalid --modelfile argument
                       Models available: %s"""
-                raise argparse.ArgumentTypeError(text.dedent(msg) % module.MODEL_FILES)
+                raise argparse.ArgumentTypeError(
+                    textwrap.dedent(msg) % module.MODEL_FILES
+                )
         else:
             raise argparse.ArgumentError(
                 self, "This parser does not allow for multiple models"
diff --git a/parserator/manual_labeling.py b/parserator/manual_labeling.py
index 25f2d9d..c71e595 100644
--- a/parserator/manual_labeling.py
+++ b/parserator/manual_labeling.py
@@ -2,22 +2,12 @@
 
 
 import csv
-import io
 import os.path
-import re
 import sys
-from argparse import ArgumentParser
 from collections import OrderedDict
 
-from lxml import etree
-
 from . import data_prep_utils
 
-if sys.version < "3":
-    from backports import csv
-else:
-    import csv
-
 
 def consoleLabel(raw_strings, labels, module):
     print("\nStart console labeling!\n")
diff --git a/parserator/parser_template.py b/parserator/parser_template.py
index 889ebd5..6b97e8b 100644
--- a/parserator/parser_template.py
+++ b/parserator/parser_template.py
@@ -16,9 +16,9 @@ def init_template():
 
 #  _____________________
 # |1. CONFIGURE LABELS! |
-# |_____________________| 
-#     (\\__/) || 
-#     (•ㅅ•) || 
+# |_____________________|
+#     (\\__/) ||
+#     (•ㅅ•) ||
 #     / 　 づ
 LABELS = [] # The labels should be a list of strings
 
@@ -65,9 +65,9 @@ def tag(raw_string) :
 
 #  _____________________
 # |2. CONFIGURE TOKENS! |
-# |_____________________| 
-#     (\\__/) || 
-#     (•ㅅ•) || 
+# |_____________________|
+#     (\\__/) ||
+#     (•ㅅ•) ||
 #     / 　 づ
 def tokenize(raw_string):
     # this determines how any given string is split into its tokens
@@ -78,7 +78,7 @@ def tokenize(raw_string):
             raw_string = str(raw_string, encoding='utf-8')
         except:
             raw_string = str(raw_string)
-    
+
     re_tokens = # re.compile( [REGEX HERE], re.VERBOSE | re.UNICODE)
     tokens = re_tokens.findall(raw_string)
 
@@ -89,14 +89,14 @@ def tokenize(raw_string):
 
 #  _______________________
 # |3. CONFIGURE FEATURES! |
-# |_______________________| 
-#     (\\__/) || 
-#     (•ㅅ•) || 
+# |_______________________|
+#     (\\__/) ||
+#     (•ㅅ•) ||
 #     / 　 づ
 def tokens2features(tokens):
     # this should call tokenFeatures to get features for individual tokens,
     # as well as define any features that are dependent upon tokens before/after
-    
+
     feature_sequence = [tokenFeatures(tokens[0])]
     previous_features = feature_sequence[-1].copy()
 
@@ -107,11 +107,11 @@ def tokens2features(tokens):
 
         # features for the features of adjacent tokens
         feature_sequence[-1]['next'] = current_features
-        token_features['previous'] = previous_features        
-        
+        token_features['previous'] = previous_features
+
         # DEFINE ANY OTHER FEATURES THAT ARE DEPENDENT UPON TOKENS BEFORE/AFTER
         # for example, a feature for whether a certain character has appeared previously in the token sequence
-        
+
         feature_sequence.append(token_features)
         previous_features = current_features
 
@@ -122,7 +122,7 @@ def tokens2features(tokens):
         feature_sequence[1]['previous']['rawstring.start'] = True
         feature_sequence[-2]['next']['rawstring.end'] = True
 
-    else : 
+    else :
         # a singleton feature, for if there is only one token in a string
         feature_sequence[0]['singleton'] = True
 
@@ -143,7 +143,7 @@ def casing(token) :
     if token.isupper() :
         return 'upper'
     elif token.islower() :
-        return 'lower' 
+        return 'lower'
     elif token.istitle() :
         return 'title'
     elif token.isalpha() :
@@ -203,7 +203,7 @@ class TestTokenizing(unittest.TestCase) :
     def test_split_on_punc(self) :
 
         assert tokenize('foo,bar') == ['foo,', 'bar']
-    
+
     def test_spaces(self) :
 
         assert tokenize('foo bar') == ['foo', 'bar']
@@ -212,7 +212,7 @@ def test_spaces(self) :
         assert tokenize(' foo bar') == ['foo', 'bar']
 
 if __name__ == '__main__' :
-    unittest.main()    
+    unittest.main()
 """
         % module_name
     )
diff --git a/parserator/utils.py b/parserator/utils.py
deleted file mode 100644
index 8b26aa6..0000000
--- a/parserator/utils.py
+++ /dev/null
@@ -1,106 +0,0 @@
-#!/usr/bin/python
-
-
-import pycrfsuite
-from sklearn.base import BaseEstimator
-from sklearn.grid_search import GridSearchCV
-from sklearn.metrics import f1_score
-
-
-def f1_with_flattening(estimator, X, y):
-    """
-    Calculate F1 score by flattening the predictions of the
-    estimator across all sequences. For example, given the following
-    address sequences as input
-        ['1 My str', '2 Your blvd'],
-    the predictions of the model will be flattened like so:
-        ['AddressNumber', 'StreetName', 'StreetNamePostType', 'AddressNumber', 'StreetName', 'StreetNamePostType']
-    and compared to a similarly flattened gold standard labels. This calculates the overall
-    quality of the model across all sequences as opposed to how well it does
-    at any particular sequence.
-    :param X: list of sequences to tag
-    :param y: list of gold standard tuples
-    """
-    predicted = estimator.predict(X)
-    flat_pred, flat_gold = [], []
-    for a, b in zip(predicted, y):
-        if len(a) == len(b):
-            flat_pred.extend(a)
-            flat_gold.extend(b)
-    return f1_score(flat_gold, flat_pred)
-
-
-def get_data_sklearn_format(train_file_list, module):
-    """
-    Parses the specified data files and returns it in sklearn format.
-    :param path:
-    :return: tuple of:
-                1) list of training sequences, each of which is a string
-                2) list of gold standard labels, each of which is a tuple
-                of strings, one for each token in the corresponding training
-                sequence
-    """
-    data = list(readTrainingData(train_file_list, module.GROUP_LABEL))
-    random.shuffle(data)
-
-    x, y = [], []
-    for raw_string, components in data:
-        tokens, labels = zip(*components)
-        x.append(raw_string)
-        y.append(labels)
-    return x, y
-
-
-class SequenceEstimator(BaseEstimator):
-    """
-    A sklearn-compatible wrapper for a parser trainer
-    """
-
-    def __init__(self, c1=1, c2=1, feature_minfreq=0):
-        """
-        :param c1: L1 regularisation coefficient
-        :param c2: L2 regularisation coefficient
-        :param feature_minfreq: minimum feature frequency
-        :return:
-        """
-        self.c1 = c1
-        self.c2 = c2
-        self.feature_minfreq = feature_minfreq
-
-    def fit(self, X, y, model_path, **params):
-        # sklearn requires parameters to be declared as fields of the estimator,
-        # an we can't have a full stop there. Replace with an underscore
-        params = {k.replace("_", "."): v for k, v in self.__dict__.items()}
-        trainer = pycrfsuite.Trainer(verbose=False, params=params)
-        for raw_text, labels in zip(X, y):
-            tokens = tokenize(raw_text)
-            trainer.append(tokens2features(tokens), labels)
-        trainer.train(model_path)
-        reload(parserator)
-
-    def predict(self, X):
-        reload(parserator)  # tagger object is defined at the module level, update now
-        predictions = []
-        for sequence in X:
-            predictions.append([foo[1] for foo in parserator.parse(sequence)])
-        return predictions
-
-
-if __name__ == "__main__":
-    # refer to http://www.chokkan.org/software/crfsuite/manual.html
-    # for description of parameters
-    cv = GridSearchCV(
-        SequenceEstimator(),
-        {
-            "c1": [10**x for x in range(-2, 2)],
-            "c2": [10**x for x in range(-2, 4)],
-            "feature_minfreq": [0, 3, 5],
-        },
-        scoring=f1_with_flattening,
-        verbose=5,
-    )
-    X, y = get_data_sklearn_format()
-    cv.fit(X, y)
-    print(cv.best_params_)
-    for foo in cv.grid_scores_:
-        print(foo)
diff --git a/tests/test_xml.py b/tests/test_xml.py
index 031acd7..32cb47e 100644
--- a/tests/test_xml.py
+++ b/tests/test_xml.py
@@ -6,35 +6,52 @@
 from lxml import etree
 import unittest
 
+
 class Mock(object):
     pass
 
+
 class TestList2XML(unittest.TestCase):
     def setUp(self):
         mock_module = Mock()
-        mock_module.GROUP_LABEL = 'Collection'
-        mock_module.PARENT_LABEL = 'TokenSequence'
+        mock_module.GROUP_LABEL = "Collection"
+        mock_module.PARENT_LABEL = "TokenSequence"
         self.training_data = data_prep_utils.TrainingData(None, mock_module)
 
     def test_xml(self):
-        self.XMLequals( [('#', 'foo'), ('1', 'foo'), ('Pinto', 'foo')], '<foo>#</foo> <foo>1</foo> <foo>Pinto</foo>')
-        self.XMLequals( [('&', 'foo'), ('1', 'foo'), ('Pinto', 'foo')], '<foo>&amp;</foo> <foo>1</foo> <foo>Pinto</foo>')
-
+        self.XMLequals(
+            [("#", "foo"), ("1", "foo"), ("Pinto", "foo")],
+            "<foo>#</foo> <foo>1</foo> <foo>Pinto</foo>",
+        )
+        self.XMLequals(
+            [("&", "foo"), ("1", "foo"), ("Pinto", "foo")],
+            "<foo>&amp;</foo> <foo>1</foo> <foo>Pinto</foo>",
+        )
 
     def test_none_tag(self):
-        self.XMLequals( [('Box', 'foo'), ('#', 'Null'), ('1', 'foo'), ('Pinto', 'foo')], '<foo>Box</foo> <Null>#</Null> <foo>1</foo> <foo>Pinto</foo>')
-        self.XMLequals( [('#', 'Null'), ('1', 'foo'), ('Pinto', 'foo')], '<Null>#</Null> <foo>1</foo> <foo>Pinto</foo>')
+        self.XMLequals(
+            [("Box", "foo"), ("#", "Null"), ("1", "foo"), ("Pinto", "foo")],
+            "<foo>Box</foo> <Null>#</Null> <foo>1</foo> <foo>Pinto</foo>",
+        )
+        self.XMLequals(
+            [("#", "Null"), ("1", "foo"), ("Pinto", "foo")],
+            "<Null>#</Null> <foo>1</foo> <foo>Pinto</foo>",
+        )
 
     def test_ampersand(self):
-        assert self.training_data._xml_to_sequence(self.training_data._sequence_to_xml([('&', 'foo')])) == (('&', 'foo'),)
-       
+        assert self.training_data._xml_to_sequence(
+            self.training_data._sequence_to_xml([("&", "foo")])
+        ) == (("&", "foo"),)
+
     def XMLequals(self, labeled_sequence, xml):
-        correct_xml = '<TokenSequence>' + xml + '</TokenSequence>'
-        generated_xml = etree.tostring(self.training_data._sequence_to_xml(labeled_sequence)).decode()
-        print('Correct:   %s' %correct_xml)
-        print('Generated: %s' %generated_xml)
+        correct_xml = "<TokenSequence>" + xml + "</TokenSequence>"
+        generated_xml = etree.tostring(
+            self.training_data._sequence_to_xml(labeled_sequence)
+        ).decode()
+        print("Correct:   %s" % correct_xml)
+        print("Generated: %s" % generated_xml)
         assert correct_xml == generated_xml
 
 
-if __name__ == '__main__' :
-    unittest.main()    
+if __name__ == "__main__":
+    unittest.main()