Skip to content

Commit

Permalink
tidying up
Browse files Browse the repository at this point in the history
  • Loading branch information
fgregg committed Sep 26, 2024
1 parent efec453 commit 06a5dfd
Show file tree
Hide file tree
Showing 10 changed files with 957 additions and 430 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include training/*
67 changes: 67 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Core distribution metadata for the usaddress package.
[project]
name = "usaddress"
version = "0.5.10"
description = "Parse US addresses using conditional random fields"
readme = "README.md"
# NOTE(review): the `[project]` license table is documented with `file` or
# `text` keys only; the extra `url` key is probably ignored by the build
# backend -- confirm, and consider exposing the link via [project.urls].
license = {text = "MIT License", url = "http://www.opensource.org/licenses/mit-license.php"}
requires-python = ">=3.7"
# Runtime dependencies installed together with the package.
dependencies = [
"python-crfsuite>=0.7",
"probableparsing"
]
# Trove classifiers shown on the package index.
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
"Operating System :: MacOS :: MacOS X",
"Operating System :: Microsoft :: Windows",
"Operating System :: POSIX",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Scientific/Engineering",
"Topic :: Scientific/Engineering :: Information Analysis",
]

[project.urls]
Homepage = "https://github.com/datamade/usaddress"

# Development tooling, installed only when the `dev` extra is requested.
[project.optional-dependencies]
dev = ["pytest",
"black",
"isort",
"mypy",
"flake8"
]

# Build-time requirements.
# - setuptools only reads `[project]` metadata from pyproject.toml starting
#   with release 61, so older releases must be excluded; with them the
#   metadata declared in this file would not be applied.
# - parserator is needed at build time because setup.py's `train_model`
#   command invokes the `parserator` command-line tool.
[build-system]
requires = ["setuptools>=61", "wheel", "parserator", "probableparsing"]
build-backend = "setuptools.build_meta"


# Package discovery: only the `usaddress` package is included in the build.
[tool.setuptools.packages.find]
include = ["usaddress"]


# Non-Python files shipped inside the `usaddress` package.
# NOTE(review): `usaddr.crfsuite` is presumably the model file written by the
# build-time training step in setup.py -- confirm.
[tool.setuptools.package-data]
usaddress = ['usaddr.crfsuite']


# pytest: collect tests from `tests/` and import test modules with the
# `importlib` import mode.
[tool.pytest.ini_options]
addopts = [
"--import-mode=importlib",
]
testpaths = [
"tests",
]

# mypy: type-check the `usaddress` package; imports without type information
# are ignored, and bodies of unannotated functions are still checked.
[tool.mypy]
files = ["usaddress"]
show_error_codes = true
ignore_missing_imports = true
check_untyped_defs = true
implicit_reexport = false

# isort: use the black-compatible profile for both source and tests.
[tool.isort]
profile = "black"
src_paths = ["usaddress", "tests"]
5 changes: 2 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
nose
pytest
parserator>=0.3
requests
coverage
flake8
pytest
2 changes: 0 additions & 2 deletions setup.cfg

This file was deleted.

83 changes: 35 additions & 48 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,51 +1,38 @@
try:
from setuptools import setup
except ImportError:
raise ImportError(
"setuptools module required, please go to "
"https://pypi.python.org/pypi/setuptools and follow the instructions "
"for installing setuptools"
)
import os
import subprocess
from distutils.cmd import Command

from setuptools import setup
from setuptools.command.build_py import build_py as _build_py


class TrainModel(Command):
    """Custom build command that trains the model before packaging.

    The command shells out to the ``parserator`` command-line tool, passing
    ``training/labeled.xml`` as the training data and ``usaddress`` as the
    target module name.
    """

    description = "Training the model before building the package"
    user_options = []  # the command accepts no command-line options

    def initialize_options(self):
        """Required by the ``Command`` interface; there is nothing to set."""

    def finalize_options(self):
        """Required by the ``Command`` interface; there is nothing to check."""

    def run(self):
        """Run ``parserator train`` and fail the build if training fails.

        Raises:
            subprocess.CalledProcessError: if ``parserator`` exits with a
                non-zero status.
        """
        # PYTHONPATH="." puts the current directory on the child process's
        # import path; every other environment variable is inherited.
        subprocess.run(
            ["parserator", "train", "training/labeled.xml", "usaddress"],
            env=dict(os.environ, PYTHONPATH="."),
            # Without check=True a failed training run was silently ignored
            # and the build carried on regardless.
            check=True,
        )


class build_py(_build_py):
    """``build_py`` variant that runs the ``train_model`` command first."""

    def run(self):
        # Training must happen before the standard build_py step runs.
        self.run_command("train_model")  # Run the custom command
        super().run()


# Standard setup configuration
setup(
version='0.5.10',
url='https://github.com/datamade/usaddress',
description='Parse US addresses using conditional random fields',
name='usaddress',
packages=['usaddress'],
package_data={'usaddress': ['usaddr.crfsuite']},
license='The MIT License: http://www.opensource.org/licenses/mit-license.php',
install_requires=['python-crfsuite>=0.7',
'future>=0.14',
'probableparsing'],
classifiers=[
'Development Status :: 3 - Alpha',
'Intended Audience :: Developers',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: MIT License',
'Natural Language :: English',
'Operating System :: MacOS :: MacOS X',
'Operating System :: Microsoft :: Windows',
'Operating System :: POSIX',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 2 :: Only',
'Topic :: Software Development :: Libraries :: Python Modules',
'Topic :: Scientific/Engineering',
'Topic :: Scientific/Engineering :: Information Analysis'],
long_description="""
usaddress is a python library for parsing unstructured address strings into
address components, using advanced NLP methods.
From the python interpreter:
>>> import usaddress
>>> usaddress.parse('123 Main St. Suite 100 Chicago, IL')
[('123', 'AddressNumber'),
('Main', 'StreetName'),
('St.', 'StreetNamePostType'),
('Suite', 'OccupancyType'),
('100', 'OccupancyIdentifier'),
('Chicago,', 'PlaceName'),
('IL', 'StateName')]
"""
cmdclass={
"build_py": build_py, # Override build_py
"train_model": TrainModel, # Register custom command
},
)
121 changes: 52 additions & 69 deletions tests/test_labeling.py
Original file line number Diff line number Diff line change
@@ -1,94 +1,77 @@
from __future__ import print_function
from builtins import zip
from builtins import object
from usaddress import parse, GROUP_LABEL
import pytest
from parserator.training import readTrainingData
import unittest

from usaddress import GROUP_LABEL, parse

class TestPerformance(object): # for test generators, must inherit from object

# these are simple address patterns
def test_simple_addresses(self):
test_file = 'measure_performance/test_data/simple_address_patterns.xml'
data = list(readTrainingData([test_file], GROUP_LABEL))
# these are simple address patterns
@pytest.mark.parametrize(
"address_text,components",
readTrainingData(
["measure_performance/test_data/simple_address_patterns.xml"], GROUP_LABEL
),
)
def test_simple_addresses(address_text, components):

for labeled_address in data:
address_text, components = labeled_address
_, labels_true = list(zip(*components))
_, labels_pred = list(zip(*parse(address_text)))
yield equals, address_text, labels_pred, labels_true

# for making sure that performance isn't degrading
# from now on, labeled examples of new address formats
# should go both in training data & test data
def test_all(self):
test_file = 'measure_performance/test_data/labeled.xml'
data = list(readTrainingData([test_file], GROUP_LABEL))
_, labels_true = list(zip(*components))
_, labels_pred = list(zip(*parse(address_text)))
assert labels_pred == labels_true

for labeled_address in data:
address_text, components = labeled_address
_, labels_true = list(zip(*components))
_, labels_pred = list(zip(*parse(address_text)))
yield equals, address_text, labels_pred, labels_true

# for making sure that performance isn't degrading
# from now on, labeled examples of new address formats
# should go both in training data & test data
# these are the full set of labeled addresses (labeled.xml)
@pytest.mark.parametrize(
"address_text,components",
readTrainingData(["measure_performance/test_data/labeled.xml"], GROUP_LABEL),
)
def test_all(address_text, components):

class TestPerformanceOld(object): # some old tests for usaddress
_, labels_true = list(zip(*components))
_, labels_pred = list(zip(*parse(address_text)))
assert labels_pred == labels_true

def test_synthetic_addresses(self):
test_file = 'measure_performance/test_data/synthetic_osm_data.xml'
data = list(readTrainingData([test_file], GROUP_LABEL))

for labeled_address in data:
address_text, components = labeled_address
_, labels_true = list(zip(*components))
_, labels_pred = list(zip(*parse(address_text)))
yield equals, address_text, labels_pred, labels_true
@pytest.mark.parametrize(
"address_text,components",
readTrainingData(
["measure_performance/test_data/synthetic_osm_data.xml"], GROUP_LABEL
),
)
def test_synthetic_addresses(address_text, components):

def test_us50(self):
test_file = 'measure_performance/test_data/us50_test_tagged.xml'
data = list(readTrainingData([test_file], GROUP_LABEL))
_, labels_true = list(zip(*components))
_, labels_pred = list(zip(*parse(address_text)))
assert labels_pred == labels_true

for labeled_address in data:
address_text, components = labeled_address
_, labels_true = list(zip(*components))
_, labels_pred = list(zip(*parse(address_text)))
yield fuzzyEquals, address_text, labels_pred, labels_true

@pytest.mark.parametrize(
"address_text,components",
readTrainingData(
["measure_performance/test_data/us50_test_tagged.xml"], GROUP_LABEL
),
)
def test_us50(address_text, components):

def equals(addr,
labels_pred,
labels_true):
prettyPrint(addr, labels_pred, labels_true)
assert labels_pred == labels_true
_, labels_true = list(zip(*components))
_, labels_pred = list(zip(*parse(address_text)))
fuzzyEquals(labels_pred, labels_true)


def fuzzyEquals(addr,
labels_pred,
labels_true):
def fuzzyEquals(labels_pred, labels_true):
labels = []
fuzzy_labels = []
for label in labels_pred:
if label.startswith('StreetName'):
fuzzy_labels.append('StreetName')
elif label.startswith('AddressNumber'):
fuzzy_labels.append('AddressNumber')
elif label == ('Null'):
fuzzy_labels.append('NotAddress')
if label.startswith("StreetName"):
fuzzy_labels.append("StreetName")
elif label.startswith("AddressNumber"):
fuzzy_labels.append("AddressNumber")
elif label == ("Null"):
fuzzy_labels.append("NotAddress")
else:
fuzzy_labels.append(label)
for label in labels_true:
labels.append(label)
prettyPrint(addr, fuzzy_labels, labels)

assert fuzzy_labels == labels


def prettyPrint(addr, predicted, true):
print("ADDRESS: ", addr)
print("pred: ", predicted)
print("true: ", true)


if __name__ == "__main__":
unittest.main()
4 changes: 2 additions & 2 deletions tests/test_tagging.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import unittest

import usaddress


class TestTagging(unittest.TestCase):

def test_broadway(self):
s1 = '1775 Broadway And 57th, Newyork NY'
s1 = "1775 Broadway And 57th, Newyork NY"
usaddress.tag(s1)
11 changes: 5 additions & 6 deletions tests/test_token_features.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
# -*- coding: utf-8 -*-
from usaddress import tokenFeatures
import unittest

from usaddress import tokenFeatures


class TestTokenFeatures(unittest.TestCase):

def test_unicode(self):
features = tokenFeatures(u'å')
assert features['endsinpunc'] is False
features = tokenFeatures("å")
assert features["endsinpunc"] is False


if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
Loading

0 comments on commit 06a5dfd

Please sign in to comment.