tidying up

datamade · Sep 26, 2024 · 06a5dfd · 06a5dfd
1 parent efec453
commit 06a5dfd
Show file tree

Hide file tree

Showing 10 changed files with 957 additions and 430 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+include training/*
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,67 @@
+[project]
+name = "usaddress"
+version = "0.5.10"
+description = "Parse US addresses using conditional random fields"
+readme = "README.md"
+license = {text = "MIT License", url = "http://www.opensource.org/licenses/mit-license.php"}
+requires-python = ">=3.7"
+dependencies = [
+  "python-crfsuite>=0.7",
+  "probableparsing"
+  ]
+classifiers = [
+  "Development Status :: 3 - Alpha",
+  "Intended Audience :: Developers",
+  "Intended Audience :: Science/Research",
+  "License :: OSI Approved :: MIT License",
+  "Natural Language :: English",
+  "Operating System :: MacOS :: MacOS X",
+  "Operating System :: Microsoft :: Windows",
+  "Operating System :: POSIX",
+  "Topic :: Software Development :: Libraries :: Python Modules",
+  "Topic :: Scientific/Engineering",
+  "Topic :: Scientific/Engineering :: Information Analysis",
+]  
+
+[project.urls]
+Homepage = "https://github.com/datamade/usaddress"
+
+[project.optional-dependencies]
+dev = ["pytest",
+       "black",
+       "isort",
+       "mypy",
+       "flake8"       
+]
+
+[build-system]
+requires = ["setuptools>=42", "wheel", "parserator", "probableparsing"]
+build-backend = "setuptools.build_meta"
+
+
+[tool.setuptools.packages.find]
+include = ["usaddress"]
+
+
+[tool.setuptools.package-data]
+usaddress = ['usaddr.crfsuite']
+
+
+[tool.pytest.ini_options]
+addopts = [
+    "--import-mode=importlib",
+]
+testpaths = [
+  "tests",
+  ]
+
+[tool.mypy]
+files = ["usaddress"]
+show_error_codes = true
+ignore_missing_imports = true
+check_untyped_defs = true
+implicit_reexport = false
+
+[tool.isort]
+profile = "black"
+src_paths = ["usaddress", "tests"]
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,4 @@
-nose
+pytest
 parserator>=0.3
-requests
-coverage
 flake8
+pytest
diff --git a/setup.cfg b/setup.cfg
diff --git a/setup.py b/setup.py
@@ -1,51 +1,38 @@
-try:
-    from setuptools import setup
-except ImportError:
-    raise ImportError(
-        "setuptools module required, please go to "
-        "https://pypi.python.org/pypi/setuptools and follow the instructions "
-        "for installing setuptools"
-    )
+import os
+import subprocess
+from distutils.cmd import Command
 
+from setuptools import setup
+from setuptools.command.build_py import build_py as _build_py
+
+
+class TrainModel(Command):
+    """Custom setup command that trains the model via the parserator CLI.
+
+    Registered under the name ``train_model`` in the ``cmdclass`` mapping
+    passed to ``setup()``.
+    """
+
+    description = "Training the model before building the package"
+    user_options = []
+
+    def initialize_options(self):
+        """Required hook of the Command interface; this command has no options."""
+        pass
+
+    def finalize_options(self):
+        """Required hook of the Command interface; nothing to validate."""
+        pass
+
+    def run(self):
+        """Run ``parserator train training/labeled.xml usaddress``.
+
+        Raises:
+            subprocess.CalledProcessError: if the training process exits
+                with a non-zero status.
+        """
+        subprocess.run(
+            ["parserator", "train", "training/labeled.xml", "usaddress"],
+            # PYTHONPATH="." -- presumably so the child process can import
+            # the local usaddress package; confirm against parserator.
+            env=dict(os.environ, PYTHONPATH="."),
+            # Without check=True a failed training run is silently ignored
+            # and the build carries on as if training had succeeded.
+            check=True,
+        )
+
+
+class build_py(_build_py):
+    """``build_py`` override that runs ``train_model`` before the stock build."""
+
+    def run(self):
+        # Train first, then build -- presumably so the trained model file
+        # exists by the time package data is collected (TODO confirm).
+        self.run_command("train_model")  # Run the custom command
+        super().run()
+
+
+# Standard setup configuration
 setup(
-    version='0.5.10',
-    url='https://github.com/datamade/usaddress',
-    description='Parse US addresses using conditional random fields',
-    name='usaddress',
-    packages=['usaddress'],
-    package_data={'usaddress': ['usaddr.crfsuite']},
-    license='The MIT License: http://www.opensource.org/licenses/mit-license.php',
-    install_requires=['python-crfsuite>=0.7',
-                      'future>=0.14',
-                      'probableparsing'],
-    classifiers=[
-        'Development Status :: 3 - Alpha',
-        'Intended Audience :: Developers',
-        'Intended Audience :: Science/Research',
-        'License :: OSI Approved :: MIT License',
-        'Natural Language :: English',
-        'Operating System :: MacOS :: MacOS X',
-        'Operating System :: Microsoft :: Windows',
-        'Operating System :: POSIX',
-        'Programming Language :: Python :: 2.7',
-        'Programming Language :: Python :: 2 :: Only',
-        'Topic :: Software Development :: Libraries :: Python Modules',
-        'Topic :: Scientific/Engineering',
-        'Topic :: Scientific/Engineering :: Information Analysis'],
-    long_description="""
-    usaddress is a python library for parsing unstructured address strings into
-    address components, using advanced NLP methods.
-
-    From the python interpreter:
-
-    >>> import usaddress
-    >>> usaddress.parse('123 Main St. Suite 100 Chicago, IL')
-    [('123', 'AddressNumber'),
-     ('Main', 'StreetName'),
-     ('St.', 'StreetNamePostType'),
-     ('Suite', 'OccupancyType'),
-     ('100', 'OccupancyIdentifier'),
-     ('Chicago,', 'PlaceName'),
-     ('IL', 'StateName')]
-    """
+    cmdclass={
+        "build_py": build_py,  # Override build_py
+        "train_model": TrainModel,  # Register custom command
+    },
 )
diff --git a/tests/test_labeling.py b/tests/test_labeling.py
@@ -1,94 +1,77 @@
-from __future__ import print_function
-from builtins import zip
-from builtins import object
-from usaddress import parse, GROUP_LABEL
+import pytest
 from parserator.training import readTrainingData
-import unittest
 
+from usaddress import GROUP_LABEL, parse
 
-class TestPerformance(object):  # for test generators, must inherit from object
 
-    # these are simple address patterns
-    def test_simple_addresses(self):
-        test_file = 'measure_performance/test_data/simple_address_patterns.xml'
-        data = list(readTrainingData([test_file], GROUP_LABEL))
+# these are simple address patterns
+@pytest.mark.parametrize(
+    "address_text,components",
+    readTrainingData(
+        ["measure_performance/test_data/simple_address_patterns.xml"], GROUP_LABEL
+    ),
+)
+def test_simple_addresses(address_text, components):
 
-        for labeled_address in data:
-            address_text, components = labeled_address
-            _, labels_true = list(zip(*components))
-            _, labels_pred = list(zip(*parse(address_text)))
-            yield equals, address_text, labels_pred, labels_true
-
-    # for making sure that performance isn't degrading
-    # from now on, labeled examples of new address formats
-    # should go both in training data & test data
-    def test_all(self):
-        test_file = 'measure_performance/test_data/labeled.xml'
-        data = list(readTrainingData([test_file], GROUP_LABEL))
+    _, labels_true = list(zip(*components))
+    _, labels_pred = list(zip(*parse(address_text)))
+    assert labels_pred == labels_true
 
-        for labeled_address in data:
-            address_text, components = labeled_address
-            _, labels_true = list(zip(*components))
-            _, labels_pred = list(zip(*parse(address_text)))
-            yield equals, address_text, labels_pred, labels_true
 
+# Regression test for making sure that performance isn't degrading.
+# From now on, labeled examples of new address formats
+# should go in both the training data and the test data.
+# This test is parametrized over the examples in labeled.xml.
+@pytest.mark.parametrize(
+    "address_text,components",
+    readTrainingData(["measure_performance/test_data/labeled.xml"], GROUP_LABEL),
+)
+def test_all(address_text, components):
 
-class TestPerformanceOld(object):  # some old tests for usaddress
+    _, labels_true = list(zip(*components))
+    _, labels_pred = list(zip(*parse(address_text)))
+    assert labels_pred == labels_true
 
-    def test_synthetic_addresses(self):
-        test_file = 'measure_performance/test_data/synthetic_osm_data.xml'
-        data = list(readTrainingData([test_file], GROUP_LABEL))
 
-        for labeled_address in data:
-            address_text, components = labeled_address
-            _, labels_true = list(zip(*components))
-            _, labels_pred = list(zip(*parse(address_text)))
-            yield equals, address_text, labels_pred, labels_true
+@pytest.mark.parametrize(
+    "address_text,components",
+    readTrainingData(
+        ["measure_performance/test_data/synthetic_osm_data.xml"], GROUP_LABEL
+    ),
+)
+def test_synthetic_addresses(address_text, components):
 
-    def test_us50(self):
-        test_file = 'measure_performance/test_data/us50_test_tagged.xml'
-        data = list(readTrainingData([test_file], GROUP_LABEL))
+    _, labels_true = list(zip(*components))
+    _, labels_pred = list(zip(*parse(address_text)))
+    assert labels_pred == labels_true
 
-        for labeled_address in data:
-            address_text, components = labeled_address
-            _, labels_true = list(zip(*components))
-            _, labels_pred = list(zip(*parse(address_text)))
-            yield fuzzyEquals, address_text, labels_pred, labels_true
 
+@pytest.mark.parametrize(
+    "address_text,components",
+    readTrainingData(
+        ["measure_performance/test_data/us50_test_tagged.xml"], GROUP_LABEL
+    ),
+)
+def test_us50(address_text, components):
 
-def equals(addr,
-           labels_pred,
-           labels_true):
-    prettyPrint(addr, labels_pred, labels_true)
-    assert labels_pred == labels_true
+    _, labels_true = list(zip(*components))
+    _, labels_pred = list(zip(*parse(address_text)))
+    fuzzyEquals(labels_pred, labels_true)
 
 
-def fuzzyEquals(addr,
-                labels_pred,
-                labels_true):
+def fuzzyEquals(labels_pred, labels_true):
     labels = []
     fuzzy_labels = []
     for label in labels_pred:
-        if label.startswith('StreetName'):
-            fuzzy_labels.append('StreetName')
-        elif label.startswith('AddressNumber'):
-            fuzzy_labels.append('AddressNumber')
-        elif label == ('Null'):
-            fuzzy_labels.append('NotAddress')
+        if label.startswith("StreetName"):
+            fuzzy_labels.append("StreetName")
+        elif label.startswith("AddressNumber"):
+            fuzzy_labels.append("AddressNumber")
+        elif label == ("Null"):
+            fuzzy_labels.append("NotAddress")
         else:
             fuzzy_labels.append(label)
     for label in labels_true:
         labels.append(label)
-    prettyPrint(addr, fuzzy_labels, labels)
 
     assert fuzzy_labels == labels
-
-
-def prettyPrint(addr, predicted, true):
-    print("ADDRESS:    ", addr)
-    print("pred:       ", predicted)
-    print("true:       ", true)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/test_tagging.py b/tests/test_tagging.py
@@ -1,9 +1,9 @@
 import unittest
+
 import usaddress
 
 
 class TestTagging(unittest.TestCase):
-
     def test_broadway(self):
-        s1 = '1775 Broadway And 57th, Newyork NY'
+        s1 = "1775 Broadway And 57th, Newyork NY"
         usaddress.tag(s1)
diff --git a/tests/test_token_features.py b/tests/test_token_features.py
@@ -1,14 +1,13 @@
-# -*- coding: utf-8 -*-
-from usaddress import tokenFeatures
 import unittest
 
+from usaddress import tokenFeatures
+
 
 class TestTokenFeatures(unittest.TestCase):
-
     def test_unicode(self):
-        features = tokenFeatures(u'å')
-        assert features['endsinpunc'] is False
+        features = tokenFeatures("å")
+        assert features["endsinpunc"] is False
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()