# NOTE(review): the lines below are GitHub web-UI page chrome captured when
# this workflow was copied from the "Workflow file for this run" view.
# Kept as comments so the file remains valid YAML.
# Skip to content
# Run Scraper
# Run Scraper #158
# Workflow file for this run

name: Run Scraper

on:
  # schedule:
  #   # Runs every day at 4AM UTC (midnight in Brasília)
  #   - cron: '0 4 * * *'
  workflow_dispatch:
    inputs:
      agencies:
        description: 'Comma-separated list of agencies to scrape (leave empty to scrape all)'
        required: false
        default: ''

jobs:
  run-scraper:
    runs-on: ubuntu-latest
    # Run the job inside the published scraper image so dependencies are
    # already installed — no pip install step is needed below.
    container:
      image: ghcr.io/nitaibezerra/govbrnews-scraper:latest
      options: --workdir /app  # start the container in the app directory
    steps:
      # Step 1: compute yesterday's date once and expose it to later steps
      # via GITHUB_ENV (evaluated in the container's shell, UTC clock).
      - name: Set date variable for yesterday
        run: |
          echo "YESTERDAY=$(date -d 'yesterday' +'%Y-%m-%d')" >> "$GITHUB_ENV"

      # Step 2: run the scraper inside the Docker container.
      - name: Run the scraper
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          # SECURITY: pass the user-supplied input through an env var instead
          # of interpolating ${{ inputs.agencies }} directly into the script —
          # direct interpolation allows shell command injection.
          AGENCIES_INPUT: ${{ inputs.agencies }}
        run: |
          # Ensure working directory is correct
          cd /app
          # Construct the optional --agencies argument if input was provided.
          AGENCIES_ARG=""
          if [ -n "$AGENCIES_INPUT" ]; then
            AGENCIES_ARG="--agencies $AGENCIES_INPUT"
          fi
          # Run scraper without re-installing dependencies.
          # $AGENCIES_ARG is intentionally unquoted so it expands to
          # two words (flag + value) or to nothing when empty.
          python src/main.py scrape \
            --min-date "$YESTERDAY" \
            --sequential \
            --allow-update \
            $AGENCIES_ARG