Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bill relationships resolution improved #128

Merged
merged 3 commits into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 34 additions & 3 deletions openstates/cli/relationships.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,44 @@
import click
import logging
import logging.config
from typing import Union
from openstates.utils import abbr_to_jid
from ..utils.django import init_django
from ..utils import transformers
from ..exceptions import InternalError
from .. import settings


# Attempt to fix bill identifiers in the DB that were NOT normalized when saved the first time
# non-normalized bill identifiers will never be matchable to a bill.identifier value
def fix_abnormal_related_bill_identifiers(jurisdiction_id: str) -> None:
    """Normalize identifiers on unresolved RelatedBill rows of a jurisdiction.

    Selects RelatedBill rows attached to bills in ``jurisdiction_id`` that
    have no ``related_bill`` set and whose ``identifier`` contains no space
    (e.g. "A1675" rather than "A 1675"), runs each identifier through
    ``transformers.fix_bill_id`` and saves the row only when the value
    actually changed.

    Args:
        jurisdiction_id: jurisdiction whose related-bill rows are examined.
    """
    # import of model has to be after django_init
    from ..data.models import RelatedBill

    abnormal_unresolved_rb = RelatedBill.objects.filter(
        bill__legislative_session__jurisdiction_id=jurisdiction_id,
        related_bill=None,
    ).exclude(identifier__contains=' ')
    for rb in abnormal_unresolved_rb:
        new_identifier = transformers.fix_bill_id(rb.identifier)
        # Compare by value, not identity: `is not` is true for any two distinct
        # string objects even when their text is equal, which would re-save
        # rows whose identifier did not change.
        if new_identifier != rb.identifier:
            # update this related bill row with normalized identifier
            rb.identifier = new_identifier
            rb.save()


@click.command(help="Resolve unresolved relationships between entities")
@click.argument("jurisdiction_abbreviation")
@click.option(
"--log_level",
help="Set the level of logging to output.",
default="INFO"
)
def main(jurisdiction_abbreviation: str, log_level: str) -> None:
@click.option(
"--session",
help="Session identifier, used to restrict resolution to within a specific session",
default=None
)
def main(jurisdiction_abbreviation: str, log_level: str, session: Union[str, None]) -> None:
# set up logging
logger = logging.getLogger("openstates")
handler_level = log_level
Expand All @@ -26,10 +50,17 @@ def main(jurisdiction_abbreviation: str, log_level: str) -> None:
init_django()
from openstates.importers import resolve_related_bills

logger.info(f"Beginning resolution of bill relationships for {jurisdiction_abbreviation}")
logger.info(f"Beginning resolution of bill relationships for {jurisdiction_abbreviation}, session: {session}")
jurisdiction_id = abbr_to_jid(jurisdiction_abbreviation)

# Prep: resolve any non-normalized bill identifiers in related bill data
# ie if RelatedBill has an identifier like "A1675" instead of "A 1675", then it can't be matched to a real bill
# (this was a historical problem only fixed in mid 2024)
fix_abnormal_related_bill_identifiers(jurisdiction_id)

# Run the resolution logic
try:
resolve_related_bills(jurisdiction_id, logger)
resolve_related_bills(jurisdiction_id, session, logger)
except InternalError as e:
logger.error(f"Error during bill relationship resolution for {jurisdiction_abbreviation}: {e}")

Expand Down
56 changes: 38 additions & 18 deletions openstates/importers/bills.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Union
from .base import BaseImporter
from ._types import _JsonDict, Model
from ..exceptions import InternalError
from ..data.models import (
Bill,
RelatedBill,
Expand All @@ -20,33 +20,53 @@
from .organizations import OrganizationImporter


def resolve_related_bills(jurisdiction_id, logger) -> None:
def resolve_related_bills(jurisdiction_id: str, session: Union[str, None], logger) -> None:
# go through all RelatedBill objs that are attached to a bill in this jurisdiction and
# are currently unresolved
related_bills = RelatedBill.objects.filter(
bill__legislative_session__jurisdiction_id=jurisdiction_id,
related_bill=None,
)
logger.info(f"Found {len(related_bills)} unresolved bill relationships")
matches_found = 0
if session is not None:
session_log = f"-{session}"
related_bills = RelatedBill.objects.filter(
bill__legislative_session__jurisdiction_id=jurisdiction_id,
bill__legislative_session__identifier=session,
related_bill=None,
)
else:
session_log = ""
related_bills = RelatedBill.objects.filter(
bill__legislative_session__jurisdiction_id=jurisdiction_id,
related_bill=None,
)
logger.info(f"Found {len(related_bills)} unresolved bill relationships in {jurisdiction_id}{session_log}")

# go session-by-session and see if we can find matching candidates
# we do this to reduce the number of SELECT queries we run in cases where there are many relations unresolved
sessions = {}
for rb in related_bills:
if rb.legislative_session not in sessions:
sessions[rb.legislative_session] = [rb.identifier]
else:
sessions[rb.legislative_session].append(rb.identifier)

session_candidate_bills = {}
for session in dict.keys(sessions):
candidates = list(
Bill.objects.filter(
legislative_session__identifier=rb.legislative_session,
identifier__in=sessions[session],
legislative_session__identifier=session,
legislative_session__jurisdiction_id=jurisdiction_id,
identifier=rb.identifier,
)
)
if len(candidates) == 1:
rb.related_bill = candidates[0]
session_candidate_bills[session] = {}
for bill in candidates:
session_candidate_bills[session][bill.identifier] = bill

matches_found = 0
for rb in related_bills:
if rb.identifier in session_candidate_bills[rb.legislative_session]:
rb.related_bill = session_candidate_bills[rb.legislative_session][rb.identifier]
rb.save()
matches_found += 1
logger.debug(f"Resolved {rb.legislative_session} {rb.bill.identifier}")
elif len(candidates) > 1: # pragma: no cover
# if we ever see this, we need to add additional fields on the relation
raise InternalError(
"multiple related_bill candidates found for {}".format(rb)
)
else:
logger.debug(f"FAILED to resolve {rb.legislative_session} {rb.bill.identifier}")

Expand Down Expand Up @@ -139,7 +159,7 @@ def prepare_for_db(self, data: _JsonDict) -> _JsonDict:
return data

def postimport(self) -> None:
resolve_related_bills(self.jurisdiction_id, self.logger)
resolve_related_bills(self.jurisdiction_id, None, self.logger)

def update_computed_fields(self, obj: Model) -> None:
    """Refresh the computed fields of ``obj`` via ``update_bill_fields``.

    The helper is called with ``save=False``, so this method does not ask
    it to save the object.
    NOTE(review): presumably the importer saves ``obj`` afterwards; confirm
    against the base importer.
    """
    update_bill_fields(obj, save=False)
5 changes: 4 additions & 1 deletion openstates/scrape/bill.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import warnings
from ..utils import _make_pseudo_id
from ..utils import _make_pseudo_id, transformers
from .popolo import pseudo_organization
from .base import BaseModel, SourceMixin, AssociatedLinkMixin, cleanup_list
from .schemas.bill import schema
Expand Down Expand Up @@ -90,6 +90,9 @@ def add_citation(
)

def add_related_bill(self, identifier, legislative_session, relation_type):
# Normalize identifier before saving
identifier = transformers.fix_bill_id(identifier)

# will we need jurisdiction, organization?
self.related_bills.append(
{
Expand Down
Loading