feat(main.py): accept multiple agencies as arg
nitaibezerra committed Feb 4, 2025
1 parent e1a3f87 commit b095f22
Showing 3 changed files with 40 additions and 16 deletions.
.github/workflows/scraper.yaml (10 additions, 4 deletions)

@@ -6,8 +6,8 @@ on:
     #   - cron: '0 4 * * *'
   workflow_dispatch:
     inputs:
-      agency:
-        description: 'Agency to scrape (leave empty to scrape all)'
+      agencies:
+        description: 'Comma-separated list of agencies to scrape (leave empty to scrape all)'
         required: false
         default: ''

@@ -33,9 +33,15 @@ jobs:
           # Ensure working directory is correct
           cd /app
+          # Construct agencies argument if provided
+          AGENCIES_ARG=""
+          if [ ! -z "${{ inputs.agencies }}" ]; then
+            AGENCIES_ARG="--agencies ${{ inputs.agencies }}"
+          fi
           # Run scraper without re-installing dependencies
           python src/main.py scrape \
-            --min-date 2024-01-01 \
+            --min-date $YESTERDAY \
             --sequential \
             --allow-update \
-            ${{ inputs.agency && '--agency ' }}${{ inputs.agency }}
+            $AGENCIES_ARG
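
With this change, a manual workflow_dispatch run that supplies, say, agencies=gestao,saude (illustrative keys borrowed from the CLI help text in src/main.py below) ends up invoking roughly the following command; $YESTERDAY is presumably computed in an earlier workflow step outside this hunk:

    python src/main.py scrape \
        --min-date $YESTERDAY \
        --sequential \
        --allow-update \
        --agencies gestao,saude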
src/main.py (11 additions, 7 deletions)

@@ -21,8 +21,12 @@ def run_scraper(args):
     """
     dataset_manager = DatasetManager()
     scrape_manager = ScrapeManager(dataset_manager)
+
+    # Convert agency input into a list (comma-separated values)
+    agencies = args.agencies.split(",") if args.agencies else None
+
     scrape_manager.run_scraper(
-        args.agency, args.min_date, args.max_date, args.sequential, args.allow_update
+        agencies, args.min_date, args.max_date, args.sequential, args.allow_update
     )
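
For reference, a minimal sketch of the conversion added above; note that str.split(",") does not strip whitespace, so a value like "gestao, saude" would keep the leading space in the second name:

    agencies_value = "gestao,saude"  # hypothetical --agencies value
    agencies = agencies_value.split(",") if agencies_value else None
    print(agencies)                    # ['gestao', 'saude']
    print("gestao, saude".split(","))  # ['gestao', ' saude'] -- space kept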


@@ -42,9 +46,9 @@ def run_augment(args):

     augmentation_manager = AugmentationManager()

-    # Pass agency (if provided), along with the date range
+    # Pass agencies (if provided), along with the date range
     augmentation_manager.classify_and_update_dataset(
-        min_date=args.min_date, max_date=args.max_date, agency=args.agency
+        min_date=args.min_date, max_date=args.max_date, agency=args.agencies
     )


@@ -75,8 +79,8 @@ def main():
         help="The maximum date for scraping news (format: YYYY-MM-DD).",
     )
     scraper_parser.add_argument(
-        "--agency",
-        help="Scrape news for a specific agency (key in the YAML).",
+        "--agencies",
+        help="Scrape news for specific agencies (comma-separated, e.g., 'gestao,saude'). Leave empty to scrape all.",
     )
     scraper_parser.add_argument(
         "--sequential",
@@ -112,10 +116,10 @@ def main():
         help="Maximum date to process files up to (format: 'YYYY-MM-DD').",
     )
     augment_parser.add_argument(
-        "--agency",
+        "--agencies",
         type=str,
         default=None,
-        help="Agency to filter the files by.",
+        help="Agencies to filter the files by (comma-separated list).",
     )

     # Parse the command-line arguments and dispatch
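Taken together, the renamed flag would be used along these lines (dates are illustrative, and the agency keys 'gestao' and 'saude' are borrowed from the help text above; the exact subcommand names assume parser wiring not shown in this diff):

    python src/main.py scrape --agencies gestao,saude --min-date 2025-02-01
    python src/main.py augment --agencies gestao --min-date 2025-02-01

Note that run_augment forwards args.agencies without splitting it, so classify_and_update_dataset receives the raw comma-separated string through its agency parameter rather than a list.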
src/scraper/scrape_manager.py (19 additions, 5 deletions)

@@ -58,25 +58,39 @@ def _load_urls_from_yaml(self, file_name: str, agency: str = None) -> List[str]:

     def run_scraper(
         self,
-        agency: str,
+        agencies: List[str],
         min_date: str,
         max_date: str,
         sequential: bool,
         allow_update: bool = False,
     ):
         """
-        Executes the web scraping process for the given agency (or agencies), date range,
+        Executes the web scraping process for the given agencies, date range,
         and whether the scraping should happen sequentially or in bulk.
-        :param agency: The agency to scrape news from.
+        :param agencies: A list of agency names to scrape news from. If None, all agencies are scraped.
         :param min_date: The minimum date for filtering news.
         :param max_date: The maximum date for filtering news.
         :param sequential: Whether to scrape sequentially (True) or in bulk (False).
         :param allow_update: If True, overwrite existing entries in the dataset.
         """
         try:
-            urls = self._load_urls_from_yaml("site_urls.yaml", agency)
-            webscrapers = [WebScraper(min_date, url, max_date=max_date) for url in urls]
+            all_urls = []
+            # Load URLs for each agency in the list
+            if agencies:
+                for agency in agencies:
+                    try:
+                        urls = self._load_urls_from_yaml("site_urls.yaml", agency)
+                        all_urls.extend(urls)
+                    except ValueError as e:
+                        logging.warning(f"Skipping agency '{agency}': {e}")
+            else:
+                # Load all agency URLs if agencies list is None or empty
+                all_urls = self._load_urls_from_yaml("site_urls.yaml")
+
+            webscrapers = [
+                WebScraper(min_date, url, max_date=max_date) for url in all_urls
+            ]

             if sequential:
                 for scraper in webscrapers:
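
The per-agency loading pattern above can be exercised in isolation. A minimal, self-contained sketch, where load_urls and the SITE_URLS mapping are hypothetical stand-ins for _load_urls_from_yaml and the contents of site_urls.yaml:

    import logging
    from typing import Dict, List, Optional

    # Hypothetical stand-in for the site_urls.yaml contents.
    SITE_URLS: Dict[str, List[str]] = {
        "gestao": ["https://example.com/gestao/news"],
        "saude": ["https://example.com/saude/news"],
    }

    def load_urls(agency: Optional[str] = None) -> List[str]:
        """Stand-in for _load_urls_from_yaml: one agency, or all when None."""
        if agency is None:
            return [url for urls in SITE_URLS.values() for url in urls]
        if agency not in SITE_URLS:
            raise ValueError(f"No URLs configured for agency: {agency}")
        return SITE_URLS[agency]

    def collect_urls(agencies: Optional[List[str]]) -> List[str]:
        """Mirrors the new run_scraper logic: skip unknown keys, fall back to all."""
        all_urls: List[str] = []
        if agencies:
            for agency in agencies:
                try:
                    all_urls.extend(load_urls(agency))
                except ValueError as e:
                    logging.warning(f"Skipping agency '{agency}': {e}")
        else:
            all_urls = load_urls()
        return all_urls

    print(collect_urls(["gestao", "unknown"]))  # 'unknown' is skipped with a warning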
