# Workflow file for the "Run Scraper" workflow
# (copied from the GitHub Actions page of run #172).

name: Run Scraper

# Triggers. The workflow is currently started manually only; the scheduled
# trigger below is kept for reference but disabled.
on:
  # NOTE(review): 04:00 UTC is 01:00 in Brasília (UTC-3), not midnight.
  # Midnight in Brasília would be '0 3 * * *' — confirm the intended time
  # before re-enabling the schedule.
  # schedule:
  #   - cron: '0 4 * * *'
  workflow_dispatch:
    # All inputs are optional free-text strings; an empty string means
    # "not provided" and is resolved to a default by the job's steps.
    inputs:
      agencies:
        description: 'Comma-separated list of agencies to scrape (leave empty to scrape all)'
        required: false
        default: ''
      min-date:
        description: 'Start date for scraping (format: YYYY-MM-DD, defaults to one week ago if not provided)'
        required: false
        default: ''
      max-date:
        description: 'End date for scraping (format: YYYY-MM-DD, leave empty to scrape up to today)'
        required: false
        default: ''
      sequential:
        # Compared against the literal string "true" by the scraper step;
        # any other value leaves sequential mode off.
        description: 'Process each agency sequentially? (true or false)'
        required: false
        default: 'false'
jobs:
  # Runs the scraper inside its published container image.
  run-scraper:
    runs-on: ubuntu-latest
    container:
      image: ghcr.io/nitaibezerra/govbrnews-scraper:latest
      options: --workdir /app
    steps:
      # Step 1: Resolve the date range and export it for the later steps.
      # Exports ONE_WEEK_AGO, MIN_DATE and MAX_DATE_ARG through GITHUB_ENV.
      - name: Set date variables
        env:
          # Inputs are passed through the environment instead of being
          # interpolated into the script text, so their content can never
          # be interpreted as shell code.
          MIN_DATE_INPUT: ${{ inputs.min-date }}
          MAX_DATE_INPUT: ${{ inputs.max-date }}
        run: |
          # Values appended to GITHUB_ENV only become visible in *later*
          # steps, so keep the date in a local shell variable as well.
          ONE_WEEK_AGO="$(date -d '7 days ago' +'%Y-%m-%d')"
          echo "ONE_WEEK_AGO=$ONE_WEEK_AGO" >> "$GITHUB_ENV"

          # If min-date is not set, use one week ago
          if [ -z "$MIN_DATE_INPUT" ]; then
            echo "MIN_DATE=$ONE_WEEK_AGO" >> "$GITHUB_ENV"
          else
            echo "MIN_DATE=$MIN_DATE_INPUT" >> "$GITHUB_ENV"
          fi

          # If max-date is set, export the complete CLI argument;
          # otherwise export an empty value so the argument is omitted.
          if [ -n "$MAX_DATE_INPUT" ]; then
            echo "MAX_DATE_ARG=--max-date $MAX_DATE_INPUT" >> "$GITHUB_ENV"
          else
            echo "MAX_DATE_ARG=" >> "$GITHUB_ENV"
          fi

      # Step 2: Print input values before running the scraper
      - name: Print input values
        env:
          MAX_DATE_INPUT: ${{ inputs.max-date }}
          AGENCIES_INPUT: ${{ inputs.agencies }}
          SEQUENTIAL_INPUT: ${{ inputs.sequential }}
        run: |
          # MIN_DATE is the resolved value exported by the previous step.
          echo "📅 Min Date: $MIN_DATE"
          echo "📅 Max Date: $MAX_DATE_INPUT"
          echo "🏛 Agencies: $AGENCIES_INPUT"
          echo "🔁 Sequential? $SEQUENTIAL_INPUT"

      # Step 3: Run the scraper
      - name: Run the scraper
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          AGENCIES_INPUT: ${{ inputs.agencies }}
          SEQUENTIAL_INPUT: ${{ inputs.sequential }}
        run: |
          cd /app

          # If agencies input is provided, convert it to an argument
          AGENCIES_ARG=""
          if [ -n "$AGENCIES_INPUT" ]; then
            AGENCIES_ARG="--agencies $AGENCIES_INPUT"
          fi

          # If sequential == "true", add --sequential
          SEQUENTIAL_ARG=""
          if [ "$SEQUENTIAL_INPUT" = "true" ]; then
            SEQUENTIAL_ARG="--sequential"
          fi

          # The *_ARG variables are left unquoted on purpose: each holds
          # either nothing or "--flag value" and must be word-split into
          # separate arguments (an empty one must disappear entirely).
          python src/main.py scrape \
            --min-date "$MIN_DATE" \
            --allow-update \
            $SEQUENTIAL_ARG \
            $MAX_DATE_ARG \
            $AGENCIES_ARG