Run Scraper #158
name: Run Scraper

on:
  # schedule:
  #   # Runs every day at 4 AM UTC (1 AM in Brasília)
  #   - cron: '0 4 * * *'
  workflow_dispatch:
    inputs:
      agencies:
        description: 'Comma-separated list of agencies to scrape (leave empty to scrape all)'
        required: false
        default: ''
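
# The agencies input can also be supplied from the command line. A minimal sketch
# using the GitHub CLI, assuming it is installed and authenticated; the agency
# slugs below are placeholders:
#
#   gh workflow run "Run Scraper" --field agencies="agencia-a,agencia-b"
#
# Omitting --field scrapes all agencies, matching the empty default above.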

jobs:
  run-scraper:
    runs-on: ubuntu-latest
    # Use the published Docker image as the job container
    container:
      image: ghcr.io/nitaibezerra/govbrnews-scraper:latest
      options: --workdir /app  # Ensures the container starts in the correct directory
    steps:
      # Step 1: Compute yesterday's date and export it via $GITHUB_ENV for the next step
      - name: Set date variable for yesterday
        run: |
          echo "YESTERDAY=$(date -d 'yesterday' +'%Y-%m-%d')" >> $GITHUB_ENV

      # Step 2: Run the scraper (inside the Docker container)
      - name: Run the scraper
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # Ensure working directory is correct
          cd /app

          # Construct agencies argument if provided
          AGENCIES_ARG=""
          if [ ! -z "${{ inputs.agencies }}" ]; then
            AGENCIES_ARG="--agencies ${{ inputs.agencies }}"
          fi

          # Run scraper without re-installing dependencies
          python src/main.py scrape \
            --min-date $YESTERDAY \
            --sequential \
            --allow-update \
            $AGENCIES_ARG
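
# For local debugging, a rough equivalent of the scrape step can be run with Docker.
# A minimal sketch, assuming the image has no conflicting ENTRYPOINT and that
# HF_TOKEN is exported in the local shell; the --min-date value is a placeholder:
#
#   docker run --rm -e HF_TOKEN -w /app ghcr.io/nitaibezerra/govbrnews-scraper:latest \
#     python src/main.py scrape --min-date 2025-01-01 --sequential --allow-update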