From f04a57e09f34ae86b3abc7b59c01daa98d23647b Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 26 Sep 2024 11:43:44 -0400 Subject: [PATCH] more cleaning --- .flake8 | 4 +- docs/conf.py | 151 ++++++++++++++++++---------------- parserator/data_prep_utils.py | 8 +- parserator/main.py | 8 +- parserator/manual_labeling.py | 10 --- parserator/parser_template.py | 36 ++++---- parserator/utils.py | 106 ------------------------ tests/test_xml.py | 47 +++++++---- 8 files changed, 136 insertions(+), 234 deletions(-) delete mode 100644 parserator/utils.py diff --git a/.flake8 b/.flake8 index 7350ce3..eaf739f 100644 --- a/.flake8 +++ b/.flake8 @@ -1,3 +1,3 @@ [flake8] -max-line-length=160 -extend-ignore = E203 \ No newline at end of file +max-line-length=232 +extend-ignore = E203 diff --git a/docs/conf.py b/docs/conf.py index d9a4c5d..e46e105 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -11,19 +11,16 @@ # # All configuration values have a default; values that are commented out # serve to show the default. - -import sys -import os - +# # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) +# sys.path.insert(0, os.path.abspath('.')) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom @@ -31,194 +28,197 @@ extensions = [] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'parserator' -copyright = u'2015, Cathy Deng, Forest Gregg' +project = "parserator" +copyright = "2015, Cathy Deng, Forest Gregg" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '0.4.1' +version = "0.4.1" # The full version, including alpha/beta/rc tags. -release = '0.4.1' +release = "0.4.1" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False +# keep_warnings = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'default' +html_theme = "default" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'parseratordoc' +htmlhelp_basename = "parseratordoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', + # The paper size ('letterpaper' or 'a4paper'). + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # 'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'parserator.tex', u'parserator Documentation', - u'Cathy Deng, Forest Gregg', 'manual'), + ( + "index", + "parserator.tex", + "parserator Documentation", + "Cathy Deng, Forest Gregg", + "manual", + ), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- @@ -226,12 +226,11 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'parserator', u'parserator Documentation', - [u'Cathy Deng, Forest Gregg'], 1) + ("index", "parserator", "parserator Documentation", ["Cathy Deng, Forest Gregg"], 1) ] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -240,19 +239,25 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'parserator', u'parserator Documentation', - u'Cathy Deng, Forest Gregg', 'parserator', 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "parserator", + "parserator Documentation", + "Cathy Deng, Forest Gregg", + "parserator", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False diff --git a/parserator/data_prep_utils.py b/parserator/data_prep_utils.py index 5a17f41..8ae9f1b 100644 --- a/parserator/data_prep_utils.py +++ b/parserator/data_prep_utils.py @@ -1,14 +1,8 @@ #!/usr/bin/python -import os -import sys - from lxml import etree -if sys.version < "3": - from backports import csv -else: - import csv +import csv class TrainingData: diff --git a/parserator/main.py b/parserator/main.py index 44cd815..df224c7 100644 --- a/parserator/main.py +++ b/parserator/main.py @@ -146,8 +146,8 @@ def file_type(arg): try: f = open(arg, "rb") except OSError as e: - message = _("can't open '%s': %s") - raise ArgumentTypeError(message % (arg, e)) + message = "can't open '%s': %s" + raise argparse.ArgumentTypeError(message % (arg, e)) else: detector = chardet.universaldetector.UniversalDetector() @@ -216,7 +216,9 @@ def __call__(self, parser, namespace, model_file, option_string): msg = """ Invalid --modelfile argument Models available: %s""" - raise argparse.ArgumentTypeError(text.dedent(msg) % module.MODEL_FILES) + raise argparse.ArgumentTypeError( + textwrap.dedent(msg) % module.MODEL_FILES + ) else: raise argparse.ArgumentError( self, "This parser does not allow for multiple models" diff --git a/parserator/manual_labeling.py b/parserator/manual_labeling.py index 25f2d9d..c71e595 100644 --- a/parserator/manual_labeling.py +++ b/parserator/manual_labeling.py @@ -2,22 +2,12 @@ import csv -import io import os.path -import re import sys -from argparse import ArgumentParser from collections import OrderedDict -from lxml import etree - from . import data_prep_utils -if sys.version < "3": - from backports import csv -else: - import csv - def consoleLabel(raw_strings, labels, module): print("\nStart console labeling!\n") diff --git a/parserator/parser_template.py b/parserator/parser_template.py index 889ebd5..6b97e8b 100644 --- a/parserator/parser_template.py +++ b/parserator/parser_template.py @@ -16,9 +16,9 @@ def init_template(): # _____________________ # |1. CONFIGURE LABELS! | -# |_____________________| -# (\\__/) || -# (•ㅅ•) || +# |_____________________| +# (\\__/) || +# (•ㅅ•) || # /   づ LABELS = [] # The labels should be a list of strings @@ -65,9 +65,9 @@ def tag(raw_string) : # _____________________ # |2. CONFIGURE TOKENS! | -# |_____________________| -# (\\__/) || -# (•ㅅ•) || +# |_____________________| +# (\\__/) || +# (•ㅅ•) || # /   づ def tokenize(raw_string): # this determines how any given string is split into its tokens @@ -78,7 +78,7 @@ def tokenize(raw_string): raw_string = str(raw_string, encoding='utf-8') except: raw_string = str(raw_string) - + re_tokens = # re.compile( [REGEX HERE], re.VERBOSE | re.UNICODE) tokens = re_tokens.findall(raw_string) @@ -89,14 +89,14 @@ def tokenize(raw_string): # _______________________ # |3. CONFIGURE FEATURES! | -# |_______________________| -# (\\__/) || -# (•ㅅ•) || +# |_______________________| +# (\\__/) || +# (•ㅅ•) || # /   づ def tokens2features(tokens): # this should call tokenFeatures to get features for individual tokens, # as well as define any features that are dependent upon tokens before/after - + feature_sequence = [tokenFeatures(tokens[0])] previous_features = feature_sequence[-1].copy() @@ -107,11 +107,11 @@ def tokens2features(tokens): # features for the features of adjacent tokens feature_sequence[-1]['next'] = current_features - token_features['previous'] = previous_features - + token_features['previous'] = previous_features + # DEFINE ANY OTHER FEATURES THAT ARE DEPENDENT UPON TOKENS BEFORE/AFTER # for example, a feature for whether a certain character has appeared previously in the token sequence - + feature_sequence.append(token_features) previous_features = current_features @@ -122,7 +122,7 @@ def tokens2features(tokens): feature_sequence[1]['previous']['rawstring.start'] = True feature_sequence[-2]['next']['rawstring.end'] = True - else : + else : # a singleton feature, for if there is only one token in a string feature_sequence[0]['singleton'] = True @@ -143,7 +143,7 @@ def casing(token) : if token.isupper() : return 'upper' elif token.islower() : - return 'lower' + return 'lower' elif token.istitle() : return 'title' elif token.isalpha() : @@ -203,7 +203,7 @@ class TestTokenizing(unittest.TestCase) : def test_split_on_punc(self) : assert tokenize('foo,bar') == ['foo,', 'bar'] - + def test_spaces(self) : assert tokenize('foo bar') == ['foo', 'bar'] @@ -212,7 +212,7 @@ def test_spaces(self) : assert tokenize(' foo bar') == ['foo', 'bar'] if __name__ == '__main__' : - unittest.main() + unittest.main() """ % module_name ) diff --git a/parserator/utils.py b/parserator/utils.py deleted file mode 100644 index 8b26aa6..0000000 --- a/parserator/utils.py +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/python - - -import pycrfsuite -from sklearn.base import BaseEstimator -from sklearn.grid_search import GridSearchCV -from sklearn.metrics import f1_score - - -def f1_with_flattening(estimator, X, y): - """ - Calculate F1 score by flattening the predictions of the - estimator across all sequences. For example, given the following - address sequences as input - ['1 My str', '2 Your blvd'], - the predictions of the model will be flattened like so: - ['AddressNumber', 'StreetName', 'StreetNamePostType', 'AddressNumber', 'StreetName', 'StreetNamePostType'] - and compared to a similarly flattened gold standard labels. This calculates the overall - quality of the model across all sequences as opposed to how well it does - at any particular sequence. - :param X: list of sequences to tag - :param y: list of gold standard tuples - """ - predicted = estimator.predict(X) - flat_pred, flat_gold = [], [] - for a, b in zip(predicted, y): - if len(a) == len(b): - flat_pred.extend(a) - flat_gold.extend(b) - return f1_score(flat_gold, flat_pred) - - -def get_data_sklearn_format(train_file_list, module): - """ - Parses the specified data files and returns it in sklearn format. - :param path: - :return: tuple of: - 1) list of training sequences, each of which is a string - 2) list of gold standard labels, each of which is a tuple - of strings, one for each token in the corresponding training - sequence - """ - data = list(readTrainingData(train_file_list, module.GROUP_LABEL)) - random.shuffle(data) - - x, y = [], [] - for raw_string, components in data: - tokens, labels = zip(*components) - x.append(raw_string) - y.append(labels) - return x, y - - -class SequenceEstimator(BaseEstimator): - """ - A sklearn-compatible wrapper for a parser trainer - """ - - def __init__(self, c1=1, c2=1, feature_minfreq=0): - """ - :param c1: L1 regularisation coefficient - :param c2: L2 regularisation coefficient - :param feature_minfreq: minimum feature frequency - :return: - """ - self.c1 = c1 - self.c2 = c2 - self.feature_minfreq = feature_minfreq - - def fit(self, X, y, model_path, **params): - # sklearn requires parameters to be declared as fields of the estimator, - # an we can't have a full stop there. Replace with an underscore - params = {k.replace("_", "."): v for k, v in self.__dict__.items()} - trainer = pycrfsuite.Trainer(verbose=False, params=params) - for raw_text, labels in zip(X, y): - tokens = tokenize(raw_text) - trainer.append(tokens2features(tokens), labels) - trainer.train(model_path) - reload(parserator) - - def predict(self, X): - reload(parserator) # tagger object is defined at the module level, update now - predictions = [] - for sequence in X: - predictions.append([foo[1] for foo in parserator.parse(sequence)]) - return predictions - - -if __name__ == "__main__": - # refer to http://www.chokkan.org/software/crfsuite/manual.html - # for description of parameters - cv = GridSearchCV( - SequenceEstimator(), - { - "c1": [10**x for x in range(-2, 2)], - "c2": [10**x for x in range(-2, 4)], - "feature_minfreq": [0, 3, 5], - }, - scoring=f1_with_flattening, - verbose=5, - ) - X, y = get_data_sklearn_format() - cv.fit(X, y) - print(cv.best_params_) - for foo in cv.grid_scores_: - print(foo) diff --git a/tests/test_xml.py b/tests/test_xml.py index 031acd7..32cb47e 100644 --- a/tests/test_xml.py +++ b/tests/test_xml.py @@ -6,35 +6,52 @@ from lxml import etree import unittest + class Mock(object): pass + class TestList2XML(unittest.TestCase): def setUp(self): mock_module = Mock() - mock_module.GROUP_LABEL = 'Collection' - mock_module.PARENT_LABEL = 'TokenSequence' + mock_module.GROUP_LABEL = "Collection" + mock_module.PARENT_LABEL = "TokenSequence" self.training_data = data_prep_utils.TrainingData(None, mock_module) def test_xml(self): - self.XMLequals( [('#', 'foo'), ('1', 'foo'), ('Pinto', 'foo')], '# 1 Pinto') - self.XMLequals( [('&', 'foo'), ('1', 'foo'), ('Pinto', 'foo')], '& 1 Pinto') - + self.XMLequals( + [("#", "foo"), ("1", "foo"), ("Pinto", "foo")], + "# 1 Pinto", + ) + self.XMLequals( + [("&", "foo"), ("1", "foo"), ("Pinto", "foo")], + "& 1 Pinto", + ) def test_none_tag(self): - self.XMLequals( [('Box', 'foo'), ('#', 'Null'), ('1', 'foo'), ('Pinto', 'foo')], 'Box # 1 Pinto') - self.XMLequals( [('#', 'Null'), ('1', 'foo'), ('Pinto', 'foo')], '# 1 Pinto') + self.XMLequals( + [("Box", "foo"), ("#", "Null"), ("1", "foo"), ("Pinto", "foo")], + "Box # 1 Pinto", + ) + self.XMLequals( + [("#", "Null"), ("1", "foo"), ("Pinto", "foo")], + "# 1 Pinto", + ) def test_ampersand(self): - assert self.training_data._xml_to_sequence(self.training_data._sequence_to_xml([('&', 'foo')])) == (('&', 'foo'),) - + assert self.training_data._xml_to_sequence( + self.training_data._sequence_to_xml([("&", "foo")]) + ) == (("&", "foo"),) + def XMLequals(self, labeled_sequence, xml): - correct_xml = '' + xml + '' - generated_xml = etree.tostring(self.training_data._sequence_to_xml(labeled_sequence)).decode() - print('Correct: %s' %correct_xml) - print('Generated: %s' %generated_xml) + correct_xml = "" + xml + "" + generated_xml = etree.tostring( + self.training_data._sequence_to_xml(labeled_sequence) + ).decode() + print("Correct: %s" % correct_xml) + print("Generated: %s" % generated_xml) assert correct_xml == generated_xml -if __name__ == '__main__' : - unittest.main() +if __name__ == "__main__": + unittest.main()