feat(main.py): accept multiple agencies as arg
nitaibezerra committed Feb 4, 2025
1 parent e1a3f87 commit b095f22
Showing 3 changed files with 40 additions and 16 deletions.
.github/workflows/scraper.yaml (10 additions, 4 deletions)

@@ -6,8 +6,8 @@ on:
     #   - cron: '0 4 * * *'
   workflow_dispatch:
     inputs:
-      agency:
-        description: 'Agency to scrape (leave empty to scrape all)'
+      agencies:
+        description: 'Comma-separated list of agencies to scrape (leave empty to scrape all)'
         required: false
         default: ''

@@ -33,9 +33,15 @@ jobs:
           # Ensure working directory is correct
           cd /app
+          # Construct agencies argument if provided
+          AGENCIES_ARG=""
+          if [ ! -z "${{ inputs.agencies }}" ]; then
+            AGENCIES_ARG="--agencies ${{ inputs.agencies }}"
+          fi
           # Run scraper without re-installing dependencies
           python src/main.py scrape \
-            --min-date 2024-01-01 \
+            --min-date $YESTERDAY \
             --sequential \
             --allow-update \
-            ${{ inputs.agency && '--agency ' }}${{ inputs.agency }}
+            $AGENCIES_ARG
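
With this change, a manual workflow_dispatch run that supplies, say, agencies=gestao,saude (illustrative keys borrowed from the CLI help text in src/main.py below) ends up invoking roughly the following command; $YESTERDAY is presumably computed in an earlier workflow step outside this hunk:

    python src/main.py scrape \
        --min-date $YESTERDAY \
        --sequential \
        --allow-update \
        --agencies gestao,saude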
src/main.py (11 additions, 7 deletions)

@@ -21,8 +21,12 @@ def run_scraper(args):
     """
     dataset_manager = DatasetManager()
     scrape_manager = ScrapeManager(dataset_manager)
+
+    # Convert agency input into a list (comma-separated values)
+    agencies = args.agencies.split(",") if args.agencies else None
+
     scrape_manager.run_scraper(
-        args.agency, args.min_date, args.max_date, args.sequential, args.allow_update
+        agencies, args.min_date, args.max_date, args.sequential, args.allow_update
     )
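
For reference, a minimal sketch of the conversion added above; note that str.split(",") does not strip whitespace, so a value like "gestao, saude" would keep the leading space in the second name:

    agencies_value = "gestao,saude"  # hypothetical --agencies value
    agencies = agencies_value.split(",") if agencies_value else None
    print(agencies)                    # ['gestao', 'saude']
    print("gestao, saude".split(","))  # ['gestao', ' saude'] -- space kept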


@@ -42,9 +46,9 @@ def run_augment(args):

     augmentation_manager = AugmentationManager()

-    # Pass agency (if provided), along with the date range
+    # Pass agencies (if provided), along with the date range
     augmentation_manager.classify_and_update_dataset(
-        min_date=args.min_date, max_date=args.max_date, agency=args.agency
+        min_date=args.min_date, max_date=args.max_date, agency=args.agencies
     )


@@ -75,8 +79,8 @@ def main():
         help="The maximum date for scraping news (format: YYYY-MM-DD).",
     )
     scraper_parser.add_argument(
-        "--agency",
-        help="Scrape news for a specific agency (key in the YAML).",
+        "--agencies",
+        help="Scrape news for specific agencies (comma-separated, e.g., 'gestao,saude'). Leave empty to scrape all.",
     )
     scraper_parser.add_argument(
         "--sequential",
@@ -112,10 +116,10 @@ def main():
         help="Maximum date to process files up to (format: 'YYYY-MM-DD').",
     )
     augment_parser.add_argument(
-        "--agency",
+        "--agencies",
         type=str,
         default=None,
-        help="Agency to filter the files by.",
+        help="Agencies to filter the files by (comma-separated list).",
     )

     # Parse the command-line arguments and dispatch
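Taken together, the renamed flag would be used along these lines (dates are illustrative, and the agency keys 'gestao' and 'saude' are borrowed from the help text above; the exact subcommand names assume parser wiring not shown in this diff):

    python src/main.py scrape --agencies gestao,saude --min-date 2025-02-01
    python src/main.py augment --agencies gestao --min-date 2025-02-01

Note that run_augment forwards args.agencies without splitting it, so classify_and_update_dataset receives the raw comma-separated string through its agency parameter rather than a list.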
src/scraper/scrape_manager.py (19 additions, 5 deletions)

@@ -58,25 +58,39 @@ def _load_urls_from_yaml(self, file_name: str, agency: str = None) -> List[str]:

     def run_scraper(
         self,
-        agency: str,
+        agencies: List[str],
         min_date: str,
         max_date: str,
         sequential: bool,
         allow_update: bool = False,
     ):
         """
-        Executes the web scraping process for the given agency (or agencies), date range,
+        Executes the web scraping process for the given agencies, date range,
         and whether the scraping should happen sequentially or in bulk.
-        :param agency: The agency to scrape news from.
+        :param agencies: A list of agency names to scrape news from. If None, all agencies are scraped.
         :param min_date: The minimum date for filtering news.
         :param max_date: The maximum date for filtering news.
         :param sequential: Whether to scrape sequentially (True) or in bulk (False).
         :param allow_update: If True, overwrite existing entries in the dataset.
         """
         try:
-            urls = self._load_urls_from_yaml("site_urls.yaml", agency)
-            webscrapers = [WebScraper(min_date, url, max_date=max_date) for url in urls]
+            all_urls = []
+            # Load URLs for each agency in the list
+            if agencies:
+                for agency in agencies:
+                    try:
+                        urls = self._load_urls_from_yaml("site_urls.yaml", agency)
+                        all_urls.extend(urls)
+                    except ValueError as e:
+                        logging.warning(f"Skipping agency '{agency}': {e}")
+            else:
+                # Load all agency URLs if agencies list is None or empty
+                all_urls = self._load_urls_from_yaml("site_urls.yaml")
+
+            webscrapers = [
+                WebScraper(min_date, url, max_date=max_date) for url in all_urls
+            ]

             if sequential:
                 for scraper in webscrapers:
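
The per-agency loading pattern above can be exercised in isolation. A minimal, self-contained sketch, where load_urls and the SITE_URLS mapping are hypothetical stand-ins for _load_urls_from_yaml and the contents of site_urls.yaml:

    import logging
    from typing import Dict, List, Optional

    # Hypothetical stand-in for the site_urls.yaml contents.
    SITE_URLS: Dict[str, List[str]] = {
        "gestao": ["https://example.com/gestao/news"],
        "saude": ["https://example.com/saude/news"],
    }

    def load_urls(agency: Optional[str] = None) -> List[str]:
        """Stand-in for _load_urls_from_yaml: one agency, or all when None."""
        if agency is None:
            return [url for urls in SITE_URLS.values() for url in urls]
        if agency not in SITE_URLS:
            raise ValueError(f"No URLs configured for agency: {agency}")
        return SITE_URLS[agency]

    def collect_urls(agencies: Optional[List[str]]) -> List[str]:
        """Mirrors the new run_scraper logic: skip unknown keys, fall back to all."""
        all_urls: List[str] = []
        if agencies:
            for agency in agencies:
                try:
                    all_urls.extend(load_urls(agency))
                except ValueError as e:
                    logging.warning(f"Skipping agency '{agency}': {e}")
        else:
            all_urls = load_urls()
        return all_urls

    print(collect_urls(["gestao", "unknown"]))  # 'unknown' is skipped with a warning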
