From bea210c65e792aa1f50f812e8a597c98e2090049 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 26 Sep 2024 13:44:23 -0400 Subject: [PATCH] tidy some other scripts --- .pre-commit-config.yaml | 4 +- docs/conf.py | 51 +++++----- parse_scripts/import_osm.py | 28 +++--- parse_scripts/parse.py | 144 +++++++++++++++++------------ parse_scripts/parse_openaddress.py | 13 ++- 5 files changed, 140 insertions(+), 100 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4896e74..47f2463 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/psf/black - rev: 24.4.2 + rev: 24.8.0 hooks: - id: black - repo: https://github.com/pycqa/isort @@ -9,7 +9,7 @@ repos: - id: isort name: isort (python) - repo: https://github.com/pycqa/flake8 - rev: "7.1.0" + rev: "7.1.1" hooks: - id: flake8 args: [--config=.flake8] diff --git a/docs/conf.py b/docs/conf.py index 07e11aa..8abb930 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -28,29 +28,29 @@ extensions = [] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. # source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'usaddress' -copyright = u'2014, Cathy Deng, Forest Gregg' +project = "usaddress" +copyright = "2014, Cathy Deng, Forest Gregg" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '0.5.4' +version = "0.5.4" # The full version, including alpha/beta/rc tags. -release = '0.5.4' +release = "0.5.4" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -64,7 +64,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all # documents. @@ -82,7 +82,7 @@ # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] @@ -95,7 +95,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'default' +html_theme = "default" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -124,7 +124,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied @@ -173,7 +173,7 @@ # html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'usaddressdoc' +htmlhelp_basename = "usaddressdoc" # -- Options for LaTeX output --------------------------------------------- @@ -181,10 +181,8 @@ latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # 'preamble': '', } @@ -193,8 +191,13 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'usaddress.tex', u'usaddress Documentation', - u'Cathy Deng, Forest Gregg', 'manual'), + ( + "index", + "usaddress.tex", + "usaddress Documentation", + "Cathy Deng, Forest Gregg", + "manual", + ), ] # The name of an image file (relative to this directory) to place at the top of @@ -223,8 +226,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'usaddress', u'usaddress Documentation', - [u'Cathy Deng, Forest Gregg'], 1) + ("index", "usaddress", "usaddress Documentation", ["Cathy Deng, Forest Gregg"], 1) ] # If true, show URL addresses after external links. @@ -237,10 +239,15 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'usaddress', u'usaddress Documentation', - u'Cathy Deng, Forest Gregg', 'usaddress', - 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "usaddress", + "usaddress Documentation", + "Cathy Deng, Forest Gregg", + "usaddress", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. diff --git a/parse_scripts/import_osm.py b/parse_scripts/import_osm.py index 14c6bcd..aa5b5a2 100644 --- a/parse_scripts/import_osm.py +++ b/parse_scripts/import_osm.py @@ -1,6 +1,7 @@ -import requests import codecs +import requests + query1 = """ @@ -46,11 +47,13 @@ -""" % ((-70.000000, 50.000000, 25.000000, -125.000000) * 6) -r1 = requests.post('http://overpass-api.de/api/interpreter/', data=query1) -r1.encoding = 'utf-8' +""" % ( + (-70.000000, 50.000000, 25.000000, -125.000000) * 6 +) +r1 = requests.post("http://overpass-api.de/api/interpreter/", data=query1) +r1.encoding = "utf-8" -f = codecs.open('data/osm_data.xml', encoding='utf-8', mode='w+') +f = codecs.open("data/osm_data.xml", encoding="utf-8", mode="w+") f.write(r1.text) @@ -71,8 +74,9 @@ -""" % ((-87.61309146881104, 41.890042371392965, 41.87234107841773, - -87.64235973358154) * 2) +""" % ( + (-87.61309146881104, 41.890042371392965, 41.87234107841773, -87.64235973358154) * 2 +) # r2 = requests.post('http://overpass-api.de/api/interpreter/', data=query2) # f = codecs.open("data/osm_data_street.xml", "wb", "utf-8") @@ -92,11 +96,13 @@ -""" % ((-70.000000, 50.000000, 25.000000, -125.000000) * 2) +""" % ( + (-70.000000, 50.000000, 25.000000, -125.000000) * 2 +) -if __name__ == '__main__': - r3 = requests.post('http://overpass-api.de/api/interpreter/', data=query3) +if __name__ == "__main__": + r3 = requests.post("http://overpass-api.de/api/interpreter/", data=query3) f = codecs.open("data/osm_data_full_addr.xml", "wb", "utf-8") - r3.encoding = 'utf-8' + r3.encoding = "utf-8" f.write(r3.text) diff --git a/parse_scripts/parse.py b/parse_scripts/parse.py index 4a4a62a..62cf303 100644 --- a/parse_scripts/parse.py +++ b/parse_scripts/parse.py @@ -1,8 +1,9 @@ -from builtins import str -from lxml import etree import ast -import re import random +import re +from builtins import str + +from lxml import etree def xmlToAddrList(xml_file): @@ -11,11 +12,11 @@ def xmlToAddrList(xml_file): root = tree.getroot() addr_list = [] for element in root: - if element.tag == 'node' or element.tag == 'way': + if element.tag == "node" or element.tag == "way": address = {} - for x in element.iter('tag'): + for x in element.iter("tag"): addr = ast.literal_eval(str(x.attrib)) - address[addr['k']] = addr['v'] + address[addr["k"]] = addr["v"] addr_list.append(address) return addr_list @@ -23,10 +24,9 @@ def xmlToAddrList(xml_file): def osmNaturalToTraining(xml_file): # natural addresses (in addr:full from osm xml data) -> training file (xml) address_list = xmlToAddrList(xml_file) - train_addr_list = etree.Element('AddressCollection') - trainFileName = '../training_data/' + \ - re.sub(r'\W+', '_', xml_file) + '.xml' - punc_list = ',.' + train_addr_list = etree.Element("AddressCollection") + trainFileName = "../training_data/" + re.sub(r"\W+", "_", xml_file) + ".xml" + punc_list = ",." # only the osm tags below will end up in training data; others will be # ignored osm_tags_to_addr_tags = { @@ -36,18 +36,23 @@ def osmNaturalToTraining(xml_file): "addr:street:type": "StreetNamePostType", "addr:city": "PlaceName", "addr:state": "StateName", - "addr:postcode": "ZipCode"} + "addr:postcode": "ZipCode", + } for address in address_list: - addr_tokens = address['addr:full'].split() - train_addr = etree.Element('AddressString') + addr_tokens = address["addr:full"].split() + train_addr = etree.Element("AddressString") is_addr_taggable = True # loop through tokens & find tags for each for token in addr_tokens: is_token_taggable = False for key, value in list(address.items()): - if all([key in list(osm_tags_to_addr_tags.keys()), - key != 'addr:full', - token in value.split()]): + if all( + [ + key in list(osm_tags_to_addr_tags.keys()), + key != "addr:full", + token in value.split(), + ] + ): token_xml = etree.Element(osm_tags_to_addr_tags[key]) # check for punctuation token_xml.text = token @@ -60,7 +65,7 @@ def osmNaturalToTraining(xml_file): if is_addr_taggable is True: train_addr_list.append(train_addr) output = etree.tostring(train_addr_list, pretty_print=True) - with open(trainFileName, 'w') as f: + with open(trainFileName, "w") as f: f.write(output) @@ -69,40 +74,45 @@ def osmSyntheticToTraining(xml_file): address_list = xmlToAddrList(xml_file) train_addr_list = [] - trainFileName = 'training/training_data/synthetic_' + \ - re.sub(r'\W+', '_', re.sub(r'.*/', '', xml_file)) + '.xml' - testFileName = 'training/test_data/synthetic_' + \ - re.sub(r'\W+', '_', re.sub(r'.*/', '', xml_file)) + '.xml' + trainFileName = ( + "training/training_data/synthetic_" + + re.sub(r"\W+", "_", re.sub(r".*/", "", xml_file)) + + ".xml" + ) + testFileName = ( + "training/test_data/synthetic_" + + re.sub(r"\W+", "_", re.sub(r".*/", "", xml_file)) + + ".xml" + ) synthetic_order = [ - ('addr:housenumber', 'AddressNumber', 'Street'), - ('addr:street:prefix', 'StreetNamePreDirectional', 'Street'), - ('addr:street:name', 'StreetName', 'Street'), - ('addr:street:type', 'StreetNamePostType', 'Street'), - ('addr:city', 'PlaceName', 'City'), - ('addr:state', 'StateName', 'Area'), - ('addr:postcode', 'ZipCode', 'Area')] + ("addr:housenumber", "AddressNumber", "Street"), + ("addr:street:prefix", "StreetNamePreDirectional", "Street"), + ("addr:street:name", "StreetName", "Street"), + ("addr:street:type", "StreetNamePostType", "Street"), + ("addr:city", "PlaceName", "City"), + ("addr:state", "StateName", "Area"), + ("addr:postcode", "ZipCode", "Area"), + ] for address in address_list: - train_addr = etree.Element('AddressString') - components = {'Street': [], 'City': [], 'Area': []} + train_addr = etree.Element("AddressString") + components = {"Street": [], "City": [], "Area": []} for source_tag, target_tag, tag_type in synthetic_order: if source_tag in list(address.keys()): words = address[source_tag].split() for word in words: token_xml = etree.Element(target_tag) token_xml.text = word - token_xml.tail = ' ' + token_xml.tail = " " components[tag_type].append(token_xml) - for tag_type in ('Street', 'City', 'Area'): - l = components[tag_type] - if l: - l[-1].text += ',' + for tag_type in ("Street", "City", "Area"): + label = components[tag_type] + if label: + label[-1].text += "," - address_xml = (components['Street'] + - components['City'] + - components['Area']) + address_xml = components["Street"] + components["City"] + components["Area"] address_xml[-1].text = address_xml[-1].text[:-1] address_xml[-1].tail = None @@ -115,41 +125,55 @@ def osmSyntheticToTraining(xml_file): random.shuffle(train_addr_list) percent_20 = int(len(train_addr_list) * 0.2) - test_data = etree.Element('AddressCollection') + test_data = etree.Element("AddressCollection") test_data.extend(train_addr_list[:percent_20]) - train_data = etree.Element('AddressCollection') + train_data = etree.Element("AddressCollection") train_data.extend(train_addr_list[percent_20:]) - with open(trainFileName, 'w') as f: + with open(trainFileName, "w") as f: f.write(etree.tostring(train_data, pretty_print=True)) - with open(testFileName, 'w') as f: + with open(testFileName, "w") as f: f.write(etree.tostring(test_data, pretty_print=True)) def trainFileFromLines(addr_file, is_train=True): # us50 data -> training or test file (xml) - lines = open(addr_file, 'r') + lines = open(addr_file, "r") if is_train is True: - outputFileName = 'training/training_data/' + \ - re.sub(r'\W+', '_', re.sub(r'.*/', '', addr_file)) + '.xml' + outputFileName = ( + "training/training_data/" + + re.sub(r"\W+", "_", re.sub(r".*/", "", addr_file)) + + ".xml" + ) else: - outputFileName = 'training/test_data/' + \ - re.sub(r'\W+', '_', re.sub(r'.*/', '', addr_file)) + '.xml' - - tag_list = [None, 'AddressNumber', 'USPSBox', 'StreetName', - 'StreetNamePostType', 'PlaceName', 'StateName', 'ZipCode', - 'suffix'] - addr_list = etree.Element('AddressCollection') - addr = etree.Element('AddressString') + outputFileName = ( + "training/test_data/" + + re.sub(r"\W+", "_", re.sub(r".*/", "", addr_file)) + + ".xml" + ) + + tag_list = [ + None, + "AddressNumber", + "USPSBox", + "StreetName", + "StreetNamePostType", + "PlaceName", + "StateName", + "ZipCode", + "suffix", + ] + addr_list = etree.Element("AddressCollection") + addr = etree.Element("AddressString") for line in lines: - if line == '\n': # add addr to list & reset addr + if line == "\n": # add addr to list & reset addr addr[-1].tail = None addr_list.append(addr) - addr = etree.Element('AddressString') + addr = etree.Element("AddressString") else: - split = line.split(' |') + split = line.split(" |") addr_line = split[0] addr_tokens = addr_line.split() token_num = int(split[1].rstrip()) @@ -157,15 +181,15 @@ def trainFileFromLines(addr_file, is_train=True): for token in addr_tokens: token_xml = etree.Element(token_tag) token_xml.text = token - token_xml.tail = ' ' + token_xml.tail = " " addr.append(token_xml) output = etree.tostring(addr_list, pretty_print=True) - with open(outputFileName, 'w') as f: + with open(outputFileName, "w") as f: f.write(output) -if __name__ == '__main__': - osmSyntheticToTraining('training/data/osm_data.xml') +if __name__ == "__main__": + osmSyntheticToTraining("training/data/osm_data.xml") # trainFileFromLines('training/data/us50.train.tagged') # trainFileFromLines('training/data/us50.test.tagged', False) diff --git a/parse_scripts/parse_openaddress.py b/parse_scripts/parse_openaddress.py index 2b07859..d145469 100644 --- a/parse_scripts/parse_openaddress.py +++ b/parse_scripts/parse_openaddress.py @@ -1,5 +1,7 @@ import json + from lxml import etree + from usaddress import tokenize @@ -24,22 +26,22 @@ def json2addrlist(data, tagmapping): def list2xml(addr_list, outfile): - xml_addr_list = etree.Element('AddressCollection') + xml_addr_list = etree.Element("AddressCollection") for addr in addr_list: - xml_addr = etree.Element('AddressString') + xml_addr = etree.Element("AddressString") # handle commas? for component in addr: if component[1]: for token in tokenize(component[1]): token_xml = etree.Element(component[0]) token_xml.text = token - token_xml.tail = ' ' + token_xml.tail = " " xml_addr.append(token_xml) xml_addr[-1].tail = None xml_addr_list.append(xml_addr) output = etree.tostring(xml_addr_list, pretty_print=True) - with open(outfile, 'w') as f: + with open(outfile, "w") as f: f.write(output) @@ -57,7 +59,8 @@ def list2xml(addr_list, outfile): ["OccupancyIdentifier", True, "UNITNO"], ["PlaceName", True, "CITY"], ["StateName", False, "IA"], - ["ZipCode", True, "ZIP"]] + ["ZipCode", True, "ZIP"], +] infile = "../data/openaddresses/us-ia-linn.json"