create_schema.py

# -*- coding: utf-8 -*-
"""PSQL Schema.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1ml1yZjvYiKXw1_qZ7kULXaoarpVzFRdt

Imports and set up the schema key
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
from pprint import pprint
import math

schemas = []

# Translate beteween EPA data types and postgresql
key = {
  "Char": "character varying",
  "VarChar2": "character varying",
  "Number": "numeric",
  "NUMBER": "numeric",
  "Num": "numeric",
  "NUM": "numeric",
  "Date": "date",
}

"""read in EEW Tracker Google Sheet

"""

url = "https://docs.google.com/spreadsheets/d/1Z2rBoGqb_SXW6oAu12A6TCWEJGV1pk0YxL13P_Z5Wlw/edit#gid=2049992364"
r = requests.get(url)
if r.status_code != 200:
  raise(ValueError(f"Web site could not be retrieved. Status Code: {r.status_code}"))
else:
  url = url.replace('/edit#gid=', '/export?format=csv&gid=')
  sheet = pd.read_csv(url)
  display(sheet)

"""loop through each metadata source, pulling out only the relevant metadata tables, and create Postgresql schema out of them"""

metas = list(sheet["META_URL"].unique())
metas = [m for m in metas if str(m) != 'nan'] # remove any unlisted metadata sources

for url in metas:

  contents = requests.get(url, timeout=120).content.decode() # Decode the url's HTML # Handle the request so that it doesn't hang
  contents = BeautifulSoup(contents, 'lxml')
  body = contents.find('body')
  [s.extract() for s in body('sup')] # remove all footnotes
  body

  tables = {} # tables = a dict of Pandas Dataframes of EPA HTML

  # Unique case of https://echo.epa.gov/tools/data-downloads/air-emissions-download-summary
  if url == "https://echo.epa.gov/tools/data-downloads/air-emissions-download-summary":
    for header in body.find_all("h2"):
      t = header.next_sibling # move to table
      if t.name == "table": # make sure it's a table
        #print(t)
        t = pd.read_html(t.prettify()) # need to prettify
        tables["POLL_RPT_COMBINED_EMISSIONS"] = t
      elif t.next_sibling.name == "table": # try the next next element
        t = pd.read_html(t.next_sibling.prettify()) # need to prettify
        tables["POLL_RPT_COMBINED_EMISSIONS"] = t
      else:
        print("error making table: POLL_RPT_COMBINED_EMISSIONS")

    for entity, table in tables.items():
      #print(entity)
      fields = ""
      for index, row in table[0].iterrows():
        #print(index,row)
        name = row["Element Name"] if "Element Name" in row else row["Element"] #RCRA = Element
        this = "    "
        this += '"'+name+'"'
        this += " "
        if "(" in row["Data Type and Length"]:
          s = row["Data Type and Length"]
          print(s[s.find("(")+1:s.find(")")])
          length = s[s.find("(")+1:s.find(")")] # What's in the ()
          this += "character varying("+length+")"
        else:
          this += "numeric"
        if index < table[0].shape[0] - 1:
          this += ", \n" 
        else:
          this += "\n" # no comma at the end
        fields += this

      t_name = entity
      schema = ''\
      '--\n'\
      "-- Name: "+t_name+"; Type: TABLE; Schema: public; Owner: echoepa \n"\
      '--'\
      '\n'\
      "CREATE UNLOGGED TABLE public.\""+t_name+"\" ( \n"\
      ''+fields+''\
      ');'\
      '\n'\
      '\n'\
      "ALTER TABLE public.\""+t_name+"\" OWNER TO echoepa;"\
      '\n'
      schemas.append(schema)
  
  else:
    # Go through each header, check it against our list of tables, and scrape if there's a match
    for header in body.find_all("h3"): # for each header on the page
      for table in list(sheet['CSV FILE'].unique()): # check if it's in the list of tables we're interested in.
        if table in header.text: # if the table is listed, or if we're looking at DMRs...
          print(table)
          t = header.next_sibling # move to table
          if t.name == "table": # make sure it's a table
            #print(t)
            t = pd.read_html(t.prettify()) # need to prettify
            tables[table] = t
          elif t.next_sibling.name == "table": # try the next next element
            t = pd.read_html(t.next_sibling.prettify()) # need to prettify
            tables[table] = t
          else:
            print("error making table: " + table)
        elif header.text=="NPDES DMR (NPDES_DMR_FYxxxx.csv)": # an exception where EPA spells it out as FYXXXX instead of each FY2020, FY2021, etc.
          print("DMRs")
          t = header.next_sibling # move to table
          if t.name == "table": # make sure it's a table
            #print(t)
            t = pd.read_html(t.prettify()) # need to prettify
            tables["NPDES_DMRS_FY2021"] = t
            tables["NPDES_DMRS_FY2020"] = t
          elif t.next_sibling.name == "table": # try the next next element
            t = pd.read_html(t.next_sibling.prettify()) # need to prettify
            tables["NPDES_DMRS_FY2021"] = t
            tables["NPDES_DMRS_FY2020"] = t
          else:
            print("error making table: DMRs")

    # Work through each scraped dataframe/table
    for entity, table in tables.items():
      #print(entity)
      fields = ""
      for index, row in table[0].iterrows():
        #print(index,row)
        name = row["Element Name"] if "Element Name" in row else row["Element"] #RCRA = Element
        this = "    "
        this += '"'+name+'"'
        this += " "
        try:
          this += key[row["Data Type"]]
        except:
          this += "character varying" # some NaN data types. handle better
        if row["Data Type"] != "Date": # don't do length for date data types
          if math.isnan(row["Length"]): # Test for NaN
            this += "(10)" # If no lenght provided (NaN) set a random one (10)
          else:
            this += "("+str(int(row["Length"]))+")"  # need to handle floats here
        if index < table[0].shape[0] - 1:
          this += ", \n" 
        else:
          this += "\n" # no comma at the end
        fields += this

      t_name = entity
      schema = ''\
      '--\n'\
      "-- Name: "+t_name+"; Type: TABLE; Schema: public; Owner: echoepa \n"\
      '--'\
      '\n'\
      "CREATE UNLOGGED TABLE public.\""+t_name+"\" ( \n"\
      ''+fields+''\
      ');'\
      '\n'\
      '\n'\
      "ALTER TABLE public.\""+t_name+"\" OWNER TO echoepa;"\
      '\n'
      schemas.append(schema)
    
print("Done!")

"""Add additional schemas"""

# Add Last Modified table
schema = ''\
    '--\n'\
    "-- Name: Last-Modified; Type: TABLE; Schema: public; Owner: echoepa \n"\
    '--'\
    '\n'\
    "CREATE UNLOGGED TABLE public.\""+'Last-Modified'+"\" ( \n"\
    '    name character varying(40),' + '\n'\
    '    zip character varying(40),' + '\n'\
    '    modified date,' + '\n'\
    '    csv_count integer,' + '\n'\
    '    sql_count integer' + '\n'\
    ');'\
    '\n'\
    '\n'\
    "ALTER TABLE public.\""+'Last-Modified'+"\" OWNER TO echoepa;"\
    '\n'
schemas.append(schema)

# Add program lookup table
schema = ''\
    '--\n'\
    "-- Name: EXP_PGM; Type: TABLE; Schema: public; Owner: echoepa \n"\
    '--'\
    '\n'\
    "CREATE UNLOGGED TABLE public.\""+'EXP_PGM'+"\" ( \n"\
    '    "PGM" text,' + '\n'\
    '    "REGISTRY_ID" character varying(20),' + '\n'\
    '    "PGM_ID" text' + '\n'\
    ');'\
    '\n'\
    '\n'\
    "ALTER TABLE public.\""+'EXP_PGM'+"\" OWNER TO echoepa;"\
    '\n'
schemas.append(schema)

# Add ECHO_EXPORTER manually since the metadata are in a CSV not HTML
#.....

# Add views manually (figure out a way to auto-update)

"""print the schema"""

for schema in schemas:
  print(schema)