NOTE(review): line breaks and indentation below were rebuilt from a whitespace-collapsed
copy of this patch, guided by the hunk line counts; verify context lines against the
target files before applying. Added lines were edited during review, so the post-image
hashes on the "index" lines no longer describe the result.

diff --git a/openstates/cli/relationships.py b/openstates/cli/relationships.py
index a495f3ebe..64d6852b0 100644
--- a/openstates/cli/relationships.py
+++ b/openstates/cli/relationships.py
@@ -1,12 +1,38 @@
 import click
 import logging
 import logging.config
+from typing import Union
 from openstates.utils import abbr_to_jid
 from ..utils.django import init_django
+from ..utils import transformers
 from ..exceptions import InternalError
 from .. import settings
 
 
+# Attempt to fix bill identifiers in the DB that were NOT normalized when saved the first time
+# non-normalized bill identifiers will never be matchable to a bill.identifier value
+def fix_abnormal_related_bill_identifiers(jurisdiction_id: str) -> None:
+    """Re-normalize identifiers of unresolved RelatedBill rows for one jurisdiction.
+
+    Only rows with no related_bill set and no space in their identifier are
+    examined; a row is saved only when normalization changes its identifier.
+    """
+    # import of model has to be after django_init
+    from ..data.models import RelatedBill
+    abnormal_unresolved_rb = RelatedBill.objects.filter(
+        bill__legislative_session__jurisdiction_id=jurisdiction_id,
+        related_bill=None,
+    ).exclude(identifier__contains=' ')
+    for rb in abnormal_unresolved_rb:
+        new_identifier = transformers.fix_bill_id(rb.identifier)
+        # compare by value: "is not" tests object identity, so an identifier that
+        # came back unchanged (equal, but a different str object) could be re-saved
+        if new_identifier != rb.identifier:
+            # update this related bill row with normalized identifier
+            rb.identifier = new_identifier
+            rb.save()
+
+
 @click.command(help="Resolve unresolved relationships between entities")
 @click.argument("jurisdiction_abbreviation")
 @click.option(
@@ -14,7 +40,12 @@
     help="Set the level of logging to output.",
     default="INFO"
 )
-def main(jurisdiction_abbreviation: str, log_level: str) -> None:
+@click.option(
+    "--session",
+    help="Session identifier, used to restrict resolution to within a specific session",
+    default=None
+)
+def main(jurisdiction_abbreviation: str, log_level: str, session: Union[str, None]) -> None:
     # set up logging
     logger = logging.getLogger("openstates")
     handler_level = log_level
@@ -26,10 +57,17 @@ def main(jurisdiction_abbreviation: str, log_level: str) -> None:
     init_django()
     from openstates.importers import resolve_related_bills
 
-    logger.info(f"Beginning resolution of bill relationships for {jurisdiction_abbreviation}")
+    logger.info(f"Beginning resolution of bill relationships for {jurisdiction_abbreviation}, session: {session}")
     jurisdiction_id = abbr_to_jid(jurisdiction_abbreviation)
+
+    # Prep: resolve any non-normalized bill identifiers in related bill data
+    # i.e. if RelatedBill has an identifier like "A1675" instead of "A 1675", then it can't be matched to a real bill
+    # (this was a historical problem only fixed in mid 2024)
+    fix_abnormal_related_bill_identifiers(jurisdiction_id)
+
+    # Run the resolution logic
     try:
-        resolve_related_bills(jurisdiction_id, logger)
+        resolve_related_bills(jurisdiction_id, session, logger)
     except InternalError as e:
         logger.error(f"Error during bill relationship resolution for {jurisdiction_abbreviation}: {e}")
 
diff --git a/openstates/importers/bills.py b/openstates/importers/bills.py
index 8ace52d00..6b1aad422 100644
--- a/openstates/importers/bills.py
+++ b/openstates/importers/bills.py
@@ -1,6 +1,6 @@
+from typing import Union
 from .base import BaseImporter
 from ._types import _JsonDict, Model
-from ..exceptions import InternalError
 from ..data.models import (
     Bill,
     RelatedBill,
@@ -20,33 +20,59 @@ from .organizations import OrganizationImporter
 
 
 
-def resolve_related_bills(jurisdiction_id, logger) -> None:
+def resolve_related_bills(jurisdiction_id: str, session: Union[str, None], logger) -> None:
+    """Link unresolved RelatedBill rows in a jurisdiction to the Bill they name.
+
+    When session is not None, only relations attached to bills in the legislative
+    session with that identifier are considered. Progress is reported via logger.
+    """
     # go through all RelatedBill objs that are attached to a bill in this jurisdiction and
     # are currently unresolved
-    related_bills = RelatedBill.objects.filter(
-        bill__legislative_session__jurisdiction_id=jurisdiction_id,
-        related_bill=None,
-    )
-    logger.info(f"Found {len(related_bills)} unresolved bill relationships")
-    matches_found = 0
+    if session is not None:
+        session_log = f"-{session}"
+        related_bills = RelatedBill.objects.filter(
+            bill__legislative_session__jurisdiction_id=jurisdiction_id,
+            bill__legislative_session__identifier=session,
+            related_bill=None,
+        )
+    else:
+        session_log = ""
+        related_bills = RelatedBill.objects.filter(
+            bill__legislative_session__jurisdiction_id=jurisdiction_id,
+            related_bill=None,
+        )
+    logger.info(f"Found {len(related_bills)} unresolved bill relationships in {jurisdiction_id}{session_log}")
+
+    # go session-by-session and see if we can find matching candidates
+    # we do this to reduce the number of SELECT queries we run in cases where there are many relations unresolved
+    sessions = {}
     for rb in related_bills:
+        if rb.legislative_session not in sessions:
+            sessions[rb.legislative_session] = [rb.identifier]
+        else:
+            sessions[rb.legislative_session].append(rb.identifier)
+
+    session_candidate_bills = {}
+    for session_identifier, identifiers in sessions.items():
         candidates = list(
             Bill.objects.filter(
-                legislative_session__identifier=rb.legislative_session,
+                identifier__in=identifiers,
+                legislative_session__identifier=session_identifier,
                 legislative_session__jurisdiction_id=jurisdiction_id,
-                identifier=rb.identifier,
             )
         )
-        if len(candidates) == 1:
-            rb.related_bill = candidates[0]
+        session_candidate_bills[session_identifier] = {}
+        for bill in candidates:
+            # a later bill with the same identifier replaces an earlier one here
+            session_candidate_bills[session_identifier][bill.identifier] = bill
+
+    matches_found = 0
+    for rb in related_bills:
+        if rb.identifier in session_candidate_bills[rb.legislative_session]:
+            rb.related_bill = session_candidate_bills[rb.legislative_session][rb.identifier]
             rb.save()
             matches_found += 1
             logger.debug(f"Resolved {rb.legislative_session} {rb.bill.identifier}")
-        elif len(candidates) > 1:  # pragma: no cover
-            # if we ever see this, we need to add additional fields on the relation
-            raise InternalError(
-                "multiple related_bill candidates found for {}".format(rb)
-            )
         else:
             logger.debug(f"FAILED to resolve {rb.legislative_session} {rb.bill.identifier}")
 
@@ -139,7 +165,7 @@ def prepare_for_db(self, data: _JsonDict) -> _JsonDict:
         return data
 
     def postimport(self) -> None:
-        resolve_related_bills(self.jurisdiction_id, self.logger)
+        resolve_related_bills(self.jurisdiction_id, None, self.logger)
 
     def update_computed_fields(self, obj: Model) -> None:
         update_bill_fields(obj, save=False)
diff --git a/openstates/scrape/bill.py b/openstates/scrape/bill.py
index b230e852d..d1e40d7d6 100644
--- a/openstates/scrape/bill.py
+++ b/openstates/scrape/bill.py
@@ -1,5 +1,5 @@
 import warnings
-from ..utils import _make_pseudo_id
+from ..utils import _make_pseudo_id, transformers
 from .popolo import pseudo_organization
 from .base import BaseModel, SourceMixin, AssociatedLinkMixin, cleanup_list
 from .schemas.bill import schema
@@ -90,6 +90,9 @@ def add_citation(
         )
 
     def add_related_bill(self, identifier, legislative_session, relation_type):
+        # Normalize identifier before saving
+        identifier = transformers.fix_bill_id(identifier)
+
         # will we need jurisdiction, organization?
         self.related_bills.append(
             {