From bea210c65e792aa1f50f812e8a597c98e2090049 Mon Sep 17 00:00:00 2001
From: Forest Gregg <fgregg@datamade.us>
Date: Thu, 26 Sep 2024 13:44:23 -0400
Subject: [PATCH] tidy some other scripts

---
 .pre-commit-config.yaml            |   4 +-
 docs/conf.py                       |  51 +++++-----
 parse_scripts/import_osm.py        |  28 +++---
 parse_scripts/parse.py             | 144 +++++++++++++++++------------
 parse_scripts/parse_openaddress.py |  13 ++-
 5 files changed, 140 insertions(+), 100 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4896e74..47f2463 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/psf/black
-    rev: 24.4.2 
+    rev: 24.8.0 
     hooks:
     - id: black
   - repo: https://github.com/pycqa/isort
@@ -9,7 +9,7 @@ repos:
       - id: isort
         name: isort (python)
   - repo: https://github.com/pycqa/flake8
-    rev: "7.1.0"
+    rev: "7.1.1"
     hooks:
       - id: flake8
         args: [--config=.flake8]
diff --git a/docs/conf.py b/docs/conf.py
index 07e11aa..8abb930 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -28,29 +28,29 @@
 extensions = []
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # The suffix of source filenames.
-source_suffix = '.rst'
+source_suffix = ".rst"
 
 # The encoding of source files.
 # source_encoding = 'utf-8-sig'
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"
 
 # General information about the project.
-project = u'usaddress'
-copyright = u'2014, Cathy Deng, Forest Gregg'
+project = "usaddress"
+copyright = "2014, Cathy Deng, Forest Gregg"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The short X.Y version.
-version = '0.5.4'
+version = "0.5.4"
 # The full version, including alpha/beta/rc tags.
-release = '0.5.4'
+release = "0.5.4"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -64,7 +64,7 @@
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ["_build"]
 
 # The reST default role (used for this markup: `text`) to use for all
 # documents.
@@ -82,7 +82,7 @@
 # show_authors = False
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
 
 # A list of ignored prefixes for module index sorting.
 # modindex_common_prefix = []
@@ -95,7 +95,7 @@
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-html_theme = 'default'
+html_theme = "default"
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
@@ -124,7 +124,7 @@
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
 # Add any extra paths that contain custom files (such as robots.txt or
 # .htaccess) here, relative to this directory. These files are copied
@@ -173,7 +173,7 @@
 # html_file_suffix = None
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'usaddressdoc'
+htmlhelp_basename = "usaddressdoc"
 
 
 # -- Options for LaTeX output ---------------------------------------------
@@ -181,10 +181,8 @@
 latex_elements = {
     # The paper size ('letterpaper' or 'a4paper').
     # 'papersize': 'letterpaper',
-
     # The font size ('10pt', '11pt' or '12pt').
     # 'pointsize': '10pt',
-
     # Additional stuff for the LaTeX preamble.
     # 'preamble': '',
 }
@@ -193,8 +191,13 @@
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    ('index', 'usaddress.tex', u'usaddress Documentation',
-     u'Cathy Deng, Forest Gregg', 'manual'),
+    (
+        "index",
+        "usaddress.tex",
+        "usaddress Documentation",
+        "Cathy Deng, Forest Gregg",
+        "manual",
+    ),
 ]
 
 # The name of an image file (relative to this directory) to place at the top of
@@ -223,8 +226,7 @@
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    ('index', 'usaddress', u'usaddress Documentation',
-     [u'Cathy Deng, Forest Gregg'], 1)
+    ("index", "usaddress", "usaddress Documentation", ["Cathy Deng, Forest Gregg"], 1)
 ]
 
 # If true, show URL addresses after external links.
@@ -237,10 +239,15 @@
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    ('index', 'usaddress', u'usaddress Documentation',
-     u'Cathy Deng, Forest Gregg', 'usaddress',
-     'One line description of project.',
-     'Miscellaneous'),
+    (
+        "index",
+        "usaddress",
+        "usaddress Documentation",
+        "Cathy Deng, Forest Gregg",
+        "usaddress",
+        "One line description of project.",
+        "Miscellaneous",
+    ),
 ]
 
 # Documents to append as an appendix to all manuals.
diff --git a/parse_scripts/import_osm.py b/parse_scripts/import_osm.py
index 14c6bcd..aa5b5a2 100644
--- a/parse_scripts/import_osm.py
+++ b/parse_scripts/import_osm.py
@@ -1,6 +1,7 @@
-import requests
 import codecs
 
+import requests
+
 query1 = """<union>
 <query type="way">
     <has-kv k="addr:housenumber"/>
@@ -46,11 +47,13 @@
 </query>
 </union>
 
-<print/>""" % ((-70.000000, 50.000000, 25.000000, -125.000000) * 6)
-r1 = requests.post('http://overpass-api.de/api/interpreter/', data=query1)
-r1.encoding = 'utf-8'
+<print/>""" % (
+    (-70.000000, 50.000000, 25.000000, -125.000000) * 6
+)
+r1 = requests.post("http://overpass-api.de/api/interpreter/", data=query1)
+r1.encoding = "utf-8"
 
-f = codecs.open('data/osm_data.xml', encoding='utf-8', mode='w+')
+f = codecs.open("data/osm_data.xml", encoding="utf-8", mode="w+")
 f.write(r1.text)
 
 
@@ -71,8 +74,9 @@
 </query>
 </union>
 
-<print/>""" % ((-87.61309146881104, 41.890042371392965, 41.87234107841773,
-                -87.64235973358154) * 2)
+<print/>""" % (
+    (-87.61309146881104, 41.890042371392965, 41.87234107841773, -87.64235973358154) * 2
+)
 # r2 = requests.post('http://overpass-api.de/api/interpreter/', data=query2)
 
 # f = codecs.open("data/osm_data_street.xml", "wb", "utf-8")
@@ -92,11 +96,13 @@
 </union>
 
 <print/>
-""" % ((-70.000000, 50.000000, 25.000000, -125.000000) * 2)
+""" % (
+    (-70.000000, 50.000000, 25.000000, -125.000000) * 2
+)
 
-if __name__ == '__main__':
-    r3 = requests.post('http://overpass-api.de/api/interpreter/', data=query3)
+if __name__ == "__main__":
+    r3 = requests.post("http://overpass-api.de/api/interpreter/", data=query3)
 
     f = codecs.open("data/osm_data_full_addr.xml", "wb", "utf-8")
-    r3.encoding = 'utf-8'
+    r3.encoding = "utf-8"
     f.write(r3.text)
diff --git a/parse_scripts/parse.py b/parse_scripts/parse.py
index 4a4a62a..62cf303 100644
--- a/parse_scripts/parse.py
+++ b/parse_scripts/parse.py
@@ -1,8 +1,9 @@
-from builtins import str
-from lxml import etree
 import ast
-import re
 import random
+import re
+from builtins import str
+
+from lxml import etree
 
 
 def xmlToAddrList(xml_file):
@@ -11,11 +12,11 @@ def xmlToAddrList(xml_file):
     root = tree.getroot()
     addr_list = []
     for element in root:
-        if element.tag == 'node' or element.tag == 'way':
+        if element.tag == "node" or element.tag == "way":
             address = {}
-            for x in element.iter('tag'):
+            for x in element.iter("tag"):
                 addr = ast.literal_eval(str(x.attrib))
-                address[addr['k']] = addr['v']
+                address[addr["k"]] = addr["v"]
             addr_list.append(address)
     return addr_list
 
@@ -23,10 +24,9 @@ def xmlToAddrList(xml_file):
 def osmNaturalToTraining(xml_file):
     # natural addresses (in addr:full from osm xml data) -> training file (xml)
     address_list = xmlToAddrList(xml_file)
-    train_addr_list = etree.Element('AddressCollection')
-    trainFileName = '../training_data/' + \
-        re.sub(r'\W+', '_', xml_file) + '.xml'
-    punc_list = ',.'
+    train_addr_list = etree.Element("AddressCollection")
+    trainFileName = "../training_data/" + re.sub(r"\W+", "_", xml_file) + ".xml"
+    punc_list = ",."
     # only the osm tags below will end up in training data; others will be
     # ignored
     osm_tags_to_addr_tags = {
@@ -36,18 +36,23 @@ def osmNaturalToTraining(xml_file):
         "addr:street:type": "StreetNamePostType",
         "addr:city": "PlaceName",
         "addr:state": "StateName",
-        "addr:postcode": "ZipCode"}
+        "addr:postcode": "ZipCode",
+    }
     for address in address_list:
-        addr_tokens = address['addr:full'].split()
-        train_addr = etree.Element('AddressString')
+        addr_tokens = address["addr:full"].split()
+        train_addr = etree.Element("AddressString")
         is_addr_taggable = True
         # loop through tokens & find tags for each
         for token in addr_tokens:
             is_token_taggable = False
             for key, value in list(address.items()):
-                if all([key in list(osm_tags_to_addr_tags.keys()),
-                        key != 'addr:full',
-                        token in value.split()]):
+                if all(
+                    [
+                        key in list(osm_tags_to_addr_tags.keys()),
+                        key != "addr:full",
+                        token in value.split(),
+                    ]
+                ):
                     token_xml = etree.Element(osm_tags_to_addr_tags[key])
                     # check for punctuation
                     token_xml.text = token
@@ -60,7 +65,7 @@ def osmNaturalToTraining(xml_file):
         if is_addr_taggable is True:
             train_addr_list.append(train_addr)
     output = etree.tostring(train_addr_list, pretty_print=True)
-    with open(trainFileName, 'w') as f:
+    with open(trainFileName, "w") as f:
         f.write(output)
 
 
@@ -69,40 +74,45 @@ def osmSyntheticToTraining(xml_file):
     address_list = xmlToAddrList(xml_file)
     train_addr_list = []
 
-    trainFileName = 'training/training_data/synthetic_' + \
-        re.sub(r'\W+', '_', re.sub(r'.*/', '', xml_file)) + '.xml'
-    testFileName = 'training/test_data/synthetic_' + \
-        re.sub(r'\W+', '_', re.sub(r'.*/', '', xml_file)) + '.xml'
+    trainFileName = (
+        "training/training_data/synthetic_"
+        + re.sub(r"\W+", "_", re.sub(r".*/", "", xml_file))
+        + ".xml"
+    )
+    testFileName = (
+        "training/test_data/synthetic_"
+        + re.sub(r"\W+", "_", re.sub(r".*/", "", xml_file))
+        + ".xml"
+    )
 
     synthetic_order = [
-        ('addr:housenumber', 'AddressNumber', 'Street'),
-        ('addr:street:prefix', 'StreetNamePreDirectional', 'Street'),
-        ('addr:street:name', 'StreetName', 'Street'),
-        ('addr:street:type', 'StreetNamePostType', 'Street'),
-        ('addr:city', 'PlaceName', 'City'),
-        ('addr:state', 'StateName', 'Area'),
-        ('addr:postcode', 'ZipCode', 'Area')]
+        ("addr:housenumber", "AddressNumber", "Street"),
+        ("addr:street:prefix", "StreetNamePreDirectional", "Street"),
+        ("addr:street:name", "StreetName", "Street"),
+        ("addr:street:type", "StreetNamePostType", "Street"),
+        ("addr:city", "PlaceName", "City"),
+        ("addr:state", "StateName", "Area"),
+        ("addr:postcode", "ZipCode", "Area"),
+    ]
 
     for address in address_list:
-        train_addr = etree.Element('AddressString')
-        components = {'Street': [], 'City': [], 'Area': []}
+        train_addr = etree.Element("AddressString")
+        components = {"Street": [], "City": [], "Area": []}
         for source_tag, target_tag, tag_type in synthetic_order:
             if source_tag in list(address.keys()):
                 words = address[source_tag].split()
                 for word in words:
                     token_xml = etree.Element(target_tag)
                     token_xml.text = word
-                    token_xml.tail = ' '
+                    token_xml.tail = " "
                     components[tag_type].append(token_xml)
 
-        for tag_type in ('Street', 'City', 'Area'):
-            l = components[tag_type]
-            if l:
-                l[-1].text += ','
+        for tag_type in ("Street", "City", "Area"):
+            label = components[tag_type]
+            if label:
+                label[-1].text += ","
 
-        address_xml = (components['Street'] +
-                       components['City'] +
-                       components['Area'])
+        address_xml = components["Street"] + components["City"] + components["Area"]
 
         address_xml[-1].text = address_xml[-1].text[:-1]
         address_xml[-1].tail = None
@@ -115,41 +125,55 @@ def osmSyntheticToTraining(xml_file):
     random.shuffle(train_addr_list)
     percent_20 = int(len(train_addr_list) * 0.2)
 
-    test_data = etree.Element('AddressCollection')
+    test_data = etree.Element("AddressCollection")
     test_data.extend(train_addr_list[:percent_20])
 
-    train_data = etree.Element('AddressCollection')
+    train_data = etree.Element("AddressCollection")
     train_data.extend(train_addr_list[percent_20:])
 
-    with open(trainFileName, 'w') as f:
+    with open(trainFileName, "w") as f:
         f.write(etree.tostring(train_data, pretty_print=True))
 
-    with open(testFileName, 'w') as f:
+    with open(testFileName, "w") as f:
         f.write(etree.tostring(test_data, pretty_print=True))
 
 
 def trainFileFromLines(addr_file, is_train=True):
     # us50 data -> training or test file (xml)
-    lines = open(addr_file, 'r')
+    lines = open(addr_file, "r")
     if is_train is True:
-        outputFileName = 'training/training_data/' + \
-            re.sub(r'\W+', '_', re.sub(r'.*/', '', addr_file)) + '.xml'
+        outputFileName = (
+            "training/training_data/"
+            + re.sub(r"\W+", "_", re.sub(r".*/", "", addr_file))
+            + ".xml"
+        )
     else:
-        outputFileName = 'training/test_data/' + \
-            re.sub(r'\W+', '_', re.sub(r'.*/', '', addr_file)) + '.xml'
-
-    tag_list = [None, 'AddressNumber', 'USPSBox', 'StreetName',
-                'StreetNamePostType', 'PlaceName', 'StateName', 'ZipCode',
-                'suffix']
-    addr_list = etree.Element('AddressCollection')
-    addr = etree.Element('AddressString')
+        outputFileName = (
+            "training/test_data/"
+            + re.sub(r"\W+", "_", re.sub(r".*/", "", addr_file))
+            + ".xml"
+        )
+
+    tag_list = [
+        None,
+        "AddressNumber",
+        "USPSBox",
+        "StreetName",
+        "StreetNamePostType",
+        "PlaceName",
+        "StateName",
+        "ZipCode",
+        "suffix",
+    ]
+    addr_list = etree.Element("AddressCollection")
+    addr = etree.Element("AddressString")
     for line in lines:
-        if line == '\n':  # add addr to list & reset addr
+        if line == "\n":  # add addr to list & reset addr
             addr[-1].tail = None
             addr_list.append(addr)
-            addr = etree.Element('AddressString')
+            addr = etree.Element("AddressString")
         else:
-            split = line.split(' |')
+            split = line.split(" |")
             addr_line = split[0]
             addr_tokens = addr_line.split()
             token_num = int(split[1].rstrip())
@@ -157,15 +181,15 @@ def trainFileFromLines(addr_file, is_train=True):
             for token in addr_tokens:
                 token_xml = etree.Element(token_tag)
                 token_xml.text = token
-                token_xml.tail = ' '
+                token_xml.tail = " "
                 addr.append(token_xml)
 
     output = etree.tostring(addr_list, pretty_print=True)
-    with open(outputFileName, 'w') as f:
+    with open(outputFileName, "w") as f:
         f.write(output)
 
 
-if __name__ == '__main__':
-    osmSyntheticToTraining('training/data/osm_data.xml')
+if __name__ == "__main__":
+    osmSyntheticToTraining("training/data/osm_data.xml")
     # trainFileFromLines('training/data/us50.train.tagged')
     # trainFileFromLines('training/data/us50.test.tagged', False)
diff --git a/parse_scripts/parse_openaddress.py b/parse_scripts/parse_openaddress.py
index 2b07859..d145469 100644
--- a/parse_scripts/parse_openaddress.py
+++ b/parse_scripts/parse_openaddress.py
@@ -1,5 +1,7 @@
 import json
+
 from lxml import etree
+
 from usaddress import tokenize
 
 
@@ -24,22 +26,22 @@ def json2addrlist(data, tagmapping):
 
 
 def list2xml(addr_list, outfile):
-    xml_addr_list = etree.Element('AddressCollection')
+    xml_addr_list = etree.Element("AddressCollection")
     for addr in addr_list:
-        xml_addr = etree.Element('AddressString')
+        xml_addr = etree.Element("AddressString")
         # handle commas?
         for component in addr:
             if component[1]:
                 for token in tokenize(component[1]):
                     token_xml = etree.Element(component[0])
                     token_xml.text = token
-                    token_xml.tail = ' '
+                    token_xml.tail = " "
                     xml_addr.append(token_xml)
         xml_addr[-1].tail = None
         xml_addr_list.append(xml_addr)
 
     output = etree.tostring(xml_addr_list, pretty_print=True)
-    with open(outfile, 'w') as f:
+    with open(outfile, "w") as f:
         f.write(output)
 
 
@@ -57,7 +59,8 @@ def list2xml(addr_list, outfile):
     ["OccupancyIdentifier", True, "UNITNO"],
     ["PlaceName", True, "CITY"],
     ["StateName", False, "IA"],
-    ["ZipCode", True, "ZIP"]]
+    ["ZipCode", True, "ZIP"],
+]
 
 
 infile = "../data/openaddresses/us-ia-linn.json"