Run Scraper #172
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Run Scraper

# Triggers. The workflow currently runs only when dispatched manually;
# the scheduled trigger below is kept commented out (disabled).
on:
  # schedule:
  #   # Runs every day at 04:00 UTC, which is 01:00 in Brasília (UTC-3).
  #   # NOTE(review): if midnight in Brasília is intended, use '0 3 * * *'.
  #   - cron: '0 4 * * *'
  workflow_dispatch:
    # All inputs are optional free-form strings; an empty string means
    # "not provided" and is resolved by the job's steps.
    inputs:
      agencies:
        description: 'Comma-separated list of agencies to scrape (leave empty to scrape all)'
        required: false
        default: ''
      min-date:
        description: 'Start date for scraping (format: YYYY-MM-DD, defaults to one week ago if not provided)'
        required: false
        default: ''
      max-date:
        description: 'End date for scraping (format: YYYY-MM-DD, leave empty to scrape up to today)'
        required: false
        default: ''
      sequential:
        # Compared against the literal string "true" by the job.
        description: 'Process each agency sequentially? (true or false)'
        required: false
        default: 'false'
jobs:
  # Runs the scraper inside its published container image, working from /app.
  run-scraper:
    runs-on: ubuntu-latest
    container:
      image: ghcr.io/nitaibezerra/govbrnews-scraper:latest
      options: --workdir /app
    steps:
      # Step 1: Resolve the date range and export it for the later steps.
      # Exports: ONE_WEEK_AGO, MIN_DATE, MAX_DATE_ARG (via $GITHUB_ENV).
      - name: Set date variables
        env:
          # Inputs are passed through the environment instead of being
          # interpolated into the script text, so their content can never
          # be executed as shell code.
          MIN_DATE_INPUT: ${{ inputs.min-date }}
          MAX_DATE_INPUT: ${{ inputs.max-date }}
        run: |
          # Values appended to $GITHUB_ENV only become visible in *later*
          # steps, so keep the computed date in a local shell variable too.
          ONE_WEEK_AGO="$(date -d '7 days ago' +'%Y-%m-%d')"
          echo "ONE_WEEK_AGO=$ONE_WEEK_AGO" >> "$GITHUB_ENV"

          # If min-date is not set, use one week ago
          if [ -z "$MIN_DATE_INPUT" ]; then
            echo "MIN_DATE=$ONE_WEEK_AGO" >> "$GITHUB_ENV"
          else
            echo "MIN_DATE=$MIN_DATE_INPUT" >> "$GITHUB_ENV"
          fi

          # If max-date is set, use it, otherwise default to empty
          if [ -n "$MAX_DATE_INPUT" ]; then
            echo "MAX_DATE_ARG=--max-date $MAX_DATE_INPUT" >> "$GITHUB_ENV"
          else
            echo "MAX_DATE_ARG=" >> "$GITHUB_ENV"
          fi

      # Step 2: Print input values before running the scraper
      - name: Print input values
        env:
          MAX_DATE_INPUT: ${{ inputs.max-date }}
          AGENCIES_INPUT: ${{ inputs.agencies }}
          SEQUENTIAL_INPUT: ${{ inputs.sequential }}
        run: |
          echo "📅 Min Date: $MIN_DATE"
          echo "📅 Max Date: $MAX_DATE_INPUT"
          echo "🏛 Agencies: $AGENCIES_INPUT"
          echo "🔁 Sequential? $SEQUENTIAL_INPUT"

      # Step 3: Run the scraper
      - name: Run the scraper
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          AGENCIES_INPUT: ${{ inputs.agencies }}
          SEQUENTIAL_INPUT: ${{ inputs.sequential }}
        run: |
          cd /app

          # If agencies input is provided, convert it to an argument
          AGENCIES_ARG=""
          if [ -n "$AGENCIES_INPUT" ]; then
            AGENCIES_ARG="--agencies $AGENCIES_INPUT"
          fi

          # If sequential == "true", add --sequential
          SEQUENTIAL_ARG=""
          if [ "$SEQUENTIAL_INPUT" = "true" ]; then
            SEQUENTIAL_ARG="--sequential"
          fi

          # The *_ARG variables are left unquoted on purpose: each holds
          # either nothing or "--flag value" and relies on word splitting
          # to expand into zero or more separate arguments.
          python src/main.py scrape \
            --min-date "$MIN_DATE" \
            --allow-update \
            $SEQUENTIAL_ARG \
            $MAX_DATE_ARG \
            $AGENCIES_ARG