diff --git a/city_scrapers/spiders/example.py b/city_scrapers/spiders/example.py
new file mode 100644
index 0000000..e6c26cc
--- /dev/null
+++ b/city_scrapers/spiders/example.py
@@ -0,0 +1,231 @@
+import re
+from collections import defaultdict
+from datetime import datetime
+
+import scrapy
+from city_scrapers_core.constants import BOARD, COMMITTEE, NOT_CLASSIFIED
+from city_scrapers_core.items import Meeting
+from city_scrapers_core.spiders import CityScrapersSpider
+
+
+class StlPublicServiceSpider(CityScrapersSpider):
+    name = "stl_public_service"
+    agency = "St. Louis Board of Public Service"
+    timezone = "America/Chicago"
+    custom_settings = {"ROBOTSTXT_OBEY": False}
+    start_urls = [
+        (
+            "https://www.stlouis-mo.gov/government/departments/public-service/index.cfm"
+        )
+    ]
+
+    def __init__(self, *args, **kwargs):
+        self.agenda_map = defaultdict(list)
+        super().__init__(*args, **kwargs)
+
+    def parse(self, response):
+        self._parse_links(response)
+        yield from self._parse_meetings_page(response)
+
+    def _parse_meetings_page(self, response):
+        urls = [
+            (
+                "https://www.stlouis-mo.gov/events/"
+                "past-meetings.cfm?span=-30&department=332"
+            ),
+            "https://www.stlouis-mo.gov/events/all-public-meetings.cfm?span=30",
+        ]
+        for url in urls:
+            yield scrapy.Request(
+                url=url, method="GET", callback=self._parse_events_page
+            )
+
+    def _parse_events_page(self, response):
+        for url in self._get_event_urls(response):
+            yield scrapy.Request(url, callback=self._parse_event, dont_filter=True)
+
+    def _get_event_urls(self, response):
+        event_urls = response.css("ul.list-group h4 a::attr(href)").getall()
+        event_sponsors = response.css("ul.list-group li span.small::text").getall()
+        urls = []
+        for url, sponsor in zip(event_urls, event_sponsors):
+            if "aldermen" in sponsor.lower() or "aldermanic" in sponsor.lower():
+                urls.append(response.urljoin(url))
+        return urls
+
+    def _parse_event(self, response):
+        """Parse a single event detail page into a Meeting item."""
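+        # The meeting's start date (formatted mm-dd-yy) doubles as the key for
+        # agenda links collected by _parse_links, attached to board meetings below.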
+ """ + start = self._parse_start(response) + links_key = datetime.strftime(start, "%m-%d-%y") + + meeting = Meeting( + title=self._parse_title(response), + description=self._parse_description(response), + classification=self._parse_classification(response), + start=start, + end=self._parse_end(response), + all_day=self._parse_all_day(response), + location=self._parse_location(response), + source=response.url, + ) + + if meeting["classification"] == BOARD: + if links_key in self.agenda_map.keys(): + meeting["links"] = self.agenda_map[links_key] + else: + meeting["links"] = [] + + meeting["status"] = self._get_status(meeting) + meeting["id"] = self._get_id(meeting) + return meeting + + def _parse_title(self, response): + """Parse or generate meeting title.""" + title = response.css("div.page-title-row h1::text").get() + title = title.replace("Meeting", "").replace("Metting", "") + title = title.replace("-", "- ") + title = title.replace("(Canceled)", "Cancelled") + return title.replace(" ", " ").strip() + + def _parse_description(self, response): + """Parse or generate meeting description.""" + description = response.css( + "div#EventDisplayBlock div.col-md-8 h4 strong::text" + ).getall() + i = 0 + while i < len(description) - 1: + if "following:" in description[i]: + return description[i + 1].replace("\xa0", "") + elif "will" in description[i]: + return description[i].replace("\xa0", "") + else: + i += 1 + else: + return "" + + def _parse_classification(self, response): + """Parse or generate classification from allowed options.""" + title = response.css("div.page-title-row h1::text").get() + if "committee" in title.lower(): + return COMMITTEE + elif "board" in title.lower(): + return BOARD + else: + return NOT_CLASSIFIED + + def _parse_start(self, response): + """Parse start datetime as a naive datetime object.""" + date = response.css("div.page-title-row p.page-summary::text").get() + pattern = r"(?P\d{2}/\d{2}/\d{2}), (?P