Commit: Add linters and run linting

jmelot committed Feb 27, 2024
1 parent d6f41c7 commit 6e18184

Showing 9 changed files with 236 additions and 42 deletions.
5 changes: 5 additions & 0 deletions .flake8
@@ -0,0 +1,5 @@
[flake8]
ignore = E203, E266, E501, W503, F403, F401
max-line-length = 120
max-complexity = 20
select = B,C,E,F,W,T4,B9
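
The ignore list drops checks that clash with black's formatting (E203, W503, and the block-comment rule E266) along with the line-length warning E501 and the star-import warnings F403/F401, while the select line keeps the pyflakes (F), pycodestyle (E/W), complexity (C), and bugbear-style (B, B9) families active. As a rough illustration, a minimal sketch of what this configuration still reports versus what it now tolerates (module and function names are invented for the example):

import os  # unused import: F401 is explicitly ignored, so flake8 stays quiet here


def summarize(values):
    running_total = 0  # assigned but never used: F841 is still selected and still reported
    return max(values)
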
76 changes: 76 additions & 0 deletions .github/workflows/main.yml
@@ -0,0 +1,76 @@
name: Python application

on: [pull_request]

jobs:
build:
name: tests-pass
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- name: Set up Python 3.7
uses: actions/setup-python@v1
with:
python-version: 3.7
# - name: Install dependencies
# run: |
# python -m pip install --upgrade pip
# pip install -r requirements.txt
# - name: Test with pytest
# run: |
# coverage run -m pytest tests
# coverage xml -o coverage/python.xml
# - name: Report python coverage
# uses: orgoro/coverage@v3
# with:
# coverageFile: coverage/python.xml
# token: ${{ secrets.GITHUB_TOKEN }}
# The next few steps only apply if you have javascript files
# - name: Setup node
# uses: actions/setup-node@v3
# with:
# node-version: '18'
# - name: Test with jest
# shell: bash
# run: |
# npm install
# npm test -- --coverage --coverageReporters="json-summary" --coverageReporters="text" | tee ./coverage.txt
# shell: bash
# - name: Report javascript coverage
# uses: MishaKav/jest-coverage-comment@v1.0.20
# with:
# title: "JavaScript Coverage"
# summary-title: "Summary"
# coverage-title: "Modified Files"
# github-token: ${{ secrets.GITHUB_TOKEN }}
# report-only-changed-files: true
# coverage-path: ./JS-FOLDER-NAME/coverage.txt
# coverage-summary-path: ./JS-FOLDER-NAME/coverage/coverage-summary.json
# coverage-path-prefix: JS-FOLDER-NAME/src/
# - name: Build output files
# run: |
# npm run build
# - name: Check links in built files
# id: link_check
# run: |
# find public -name "*.js" -exec grep -Eo "(http|https):\/\/[^]\{\}\"'\\\(\)\> ]+" {} \; | sort -u > linklist.txt
# printf '%s\n%s\n%s\n' "# LinkChecker URL list" "# <meta charset=\"UTF-8\">" "$(cat linklist.txt)" > linklist.txt
# linkchecker linklist.txt --check-extern --ignore-url="https://.*\.fastly\.net/.*" --ignore-url="https://.*\.mapbox\..*" --ignore-url=".*//a\W.*" --ignore-url="http://(a|x|тест)" -o failures > output.txt || true
# cat output.txt
# echo "num_links=$(wc -l < output.txt | sed 's/^ *//g')" >> $GITHUB_OUTPUT
# echo "links<<EOFdelimiter" >> $GITHUB_OUTPUT
# echo "$(cat output.txt)" >> $GITHUB_OUTPUT
# echo "EOFdelimiter" >> $GITHUB_OUTPUT
# - name: Edit PR comment about link checking
# if: steps.link_check.outputs.num_links > 0
# uses: thollander/actions-comment-pull-request@v2
# with:
# message: |
# There are ${{ steps.link_check.outputs.num_links }} broken links. Check the code for these links:
# ${{ steps.link_check.outputs.links }}
# comment_tag: link_check_msg
- name: Run linting
run: |
pip install pre-commit
pre-commit run --all-files
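
Most of this workflow is optional scaffolding left commented out for later (dependency install, pytest with coverage reporting, and the JavaScript and link-checking steps); the steps that actually run check out the code, set up Python 3.7, install pre-commit, and execute every configured hook against the whole repository. A minimal sketch of reproducing the same check locally, assuming pre-commit is already installed in the active environment:

# Run the same hooks CI runs and propagate the failure code.
import subprocess
import sys

sys.exit(subprocess.run(["pre-commit", "run", "--all-files"]).returncode)
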
42 changes: 42 additions & 0 deletions .github/workflows/rebase-reminder.yml
@@ -0,0 +1,42 @@
name: Rebase reminder
on: [pull_request, pull_request_review]

jobs:
build:
name: rebuild-reminder
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Find behind count
id: behind_count
run: |
echo "behind_count=$(git rev-list --count ${{ github.event.pull_request.head.sha }}..${{ github.event.pull_request.base.sha }})" >> $GITHUB_OUTPUT
- name: Find ahead count
id: ahead_count
run: |
echo "ahead_count=$(git rev-list --count ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }})" >> $GITHUB_OUTPUT
- name: Find combined count
id: combined_count
run: |
echo "combined_count=$(expr ${{steps.behind_count.outputs.behind_count}} + ${{steps.ahead_count.outputs.ahead_count}})" >> $GITHUB_OUTPUT
- name: Edit PR comment - rebasing
if: steps.behind_count.outputs.behind_count > 0 && steps.combined_count.outputs.combined_count > 3
uses: thollander/actions-comment-pull-request@v1
with:
message: |
Needs rebasing :bangbang:
behind_count is ${{ steps.behind_count.outputs.behind_count }}
ahead_count is ${{ steps.ahead_count.outputs.ahead_count }}
comment_includes: 'rebasing'
- name: Edit PR comment - no rebasing
if: steps.behind_count.outputs.behind_count == 0 || steps.combined_count.outputs.combined_count <= 3
uses: thollander/actions-comment-pull-request@v1
with:
message: |
No need for rebasing :+1:
behind_count is ${{ steps.behind_count.outputs.behind_count }}
ahead_count is ${{ steps.ahead_count.outputs.ahead_count }}
comment_includes: 'rebasing'
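
The reminder logic counts commits in each direction with git rev-list --count: head..base yields how far the branch is behind its base, base..head how far it is ahead, and a rebase is only requested when the branch is behind at all and the combined count exceeds three. A small local sketch of the same arithmetic in Python (the base ref name here is a placeholder, not something taken from this repo):

# Approximate the workflow's behind/ahead calculation against a local base branch.
import subprocess


def rev_count(exclude_ref: str, include_ref: str) -> int:
    """Commits reachable from include_ref but not from exclude_ref."""
    out = subprocess.run(
        ["git", "rev-list", "--count", f"{exclude_ref}..{include_ref}"],
        capture_output=True,
        text=True,
        check=True,
    )
    return int(out.stdout.strip())


behind = rev_count("HEAD", "origin/main")  # base commits missing from this branch
ahead = rev_count("origin/main", "HEAD")  # branch commits not yet in the base
print(f"behind={behind}, ahead={ahead}, needs rebase: {behind > 0 and behind + ahead > 3}")
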
37 changes: 37 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,37 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.1.0
hooks:
- id: trailing-whitespace
exclude: "__snapshots__"
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- id: check-json
# The next step only applies if you have javascript files.
# There should be a package.json that installs eslint
# (or eslint-config-react-app if you are using gatsby).
# - repo: https://github.com/pre-commit/mirrors-eslint
# rev: v8.24.0
# hooks:
# - id: eslint
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
hooks:
- id: isort
- repo: https://github.com/ambv/black
rev: 22.3.0
hooks:
- id: black
language_version: python3
- repo: https://github.com/PyCQA/flake8
rev: 4.0.1
hooks:
- id: flake8
#- repo: https://github.com/sqlfluff/sqlfluff
# rev: 0.10.1
# hooks:
# - id: sqlfluff-lint
# - id: sqlfluff-fix
14 changes: 14 additions & 0 deletions pyproject.toml
@@ -0,0 +1,14 @@
[tool.black]
py36 = true
include = '\.pyi?$'
exclude = '''
/(
\.git
| \.venv
| build
| dist
)/
'''

[tool.isort]
profile = "black"
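
Together, the pre-commit hooks above and these settings account for the purely mechanical churn in the diffs that follow: isort's black profile rewrites long import lines as parenthesized, trailing-comma blocks, and black explodes over-long calls one argument per line. A tiny, self-contained sketch of the resulting style (toy names, not code from this repo):

# Roughly the shape black and isort (profile="black") settle on for long calls.
def build_dataset_name(prefix: str, base: str) -> str:
    return prefix + base  # spaces around operators, as black formats them


staging = build_dataset_name(
    prefix="staging_",
    base="gcp_cset_ror",
)
print(staging)
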
82 changes: 48 additions & 34 deletions ror_dag.py
@@ -1,57 +1,61 @@
import json
from datetime import datetime

from airflow import DAG
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator, BigQueryCheckOperator
from airflow.providers.google.cloud.transfers.bigquery_to_bigquery import BigQueryToBigQueryOperator
from airflow.operators.python import PythonOperator
from airflow.providers.google.cloud.operators.bigquery import (
BigQueryCheckOperator,
BigQueryInsertJobOperator,
)
from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator
from airflow.providers.google.cloud.operators.kubernetes_engine import (
GKEStartPodOperator,
)
from airflow.operators.python import PythonOperator
from datetime import datetime

from airflow.providers.google.cloud.transfers.bigquery_to_bigquery import (
BigQueryToBigQueryOperator,
)
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import (
GCSToBigQueryOperator,
)
from dataloader.airflow_utils.defaults import (
DATA_BUCKET,
DAGS_DIR,
DATA_BUCKET,
GCP_ZONE,
PROJECT_ID,
get_default_args,
get_post_success,
)
from dataloader.scripts.populate_documentation import update_table_descriptions


args = get_default_args(pocs=["Jennifer"])
args["retries"] = 1
args["on_failure_callback"] = None


with DAG("ror_updater",
default_args=args,
description="Links articles across our scholarly lit holdings.",
schedule_interval="0 0 * * 5",
catchup=False
) as dag:
with DAG(
"ror_updater",
default_args=args,
description="Links articles across our scholarly lit holdings.",
schedule_interval="0 0 * * 5",
catchup=False,
) as dag:
gcs_folder = "ror"
tmp_dir = f"{gcs_folder}/tmp"
raw_data_dir = f"{gcs_folder}/data"
schema_dir = f"{gcs_folder}/schemas"
sql_dir = f"sql/{gcs_folder}"
production_dataset = "gcp_cset_ror"
staging_dataset = "staging_"+production_dataset
backup_dataset = production_dataset+"_backups"
staging_dataset = "staging_" + production_dataset
backup_dataset = production_dataset + "_backups"

# We keep several intermediate outputs in a tmp dir on gcs, so clean it out at the start of each run. We clean at
# the start of the run so if the run fails we can examine the failed data
clear_tmp_dir = GCSDeleteObjectsOperator(
task_id="clear_tmp_gcs_dir",
bucket_name=DATA_BUCKET,
prefix=tmp_dir + "/"
task_id="clear_tmp_gcs_dir", bucket_name=DATA_BUCKET, prefix=tmp_dir + "/"
)

# Retrieve and expand the data
json_loc = tmp_dir+"/ror.jsonl"
json_loc = tmp_dir + "/ror.jsonl"
working_dir = "ror_working_dir"
setup_commands = f"rm -rf {working_dir};" + " && ".join(
[
@@ -113,31 +117,33 @@
destination_project_dataset_table=f"{staging_dataset}.ror",
source_format="NEWLINE_DELIMITED_JSON",
create_disposition="CREATE_IF_NEEDED",
write_disposition="WRITE_TRUNCATE"
write_disposition="WRITE_TRUNCATE",
)

# Check that the number of ids is >= what we have in production and that the ids are unique
checks = [
BigQueryCheckOperator(
task_id="check_unique_ids",
sql=(f"select count(distinct(id)) = count(id) from {staging_dataset}.ror"),
use_legacy_sql=False
use_legacy_sql=False,
),
BigQueryCheckOperator(
task_id="check_monotonic_increase",
sql=(f"select (select count(0) from {staging_dataset}.ror) >= "
f"(select count(0) from {production_dataset}.ror)"),
use_legacy_sql=False
)
sql=(
f"select (select count(0) from {staging_dataset}.ror) >= "
f"(select count(0) from {production_dataset}.ror)"
),
use_legacy_sql=False,
),
]

# Load into production
load_production = BigQueryToBigQueryOperator(
task_id=f"load_production",
task_id="load_production",
source_project_dataset_tables=[f"{staging_dataset}.ror"],
destination_project_dataset_table=f"{production_dataset}.ror",
create_disposition="CREATE_IF_NEEDED",
write_disposition="WRITE_TRUNCATE"
write_disposition="WRITE_TRUNCATE",
)

# Update descriptions
@@ -148,23 +154,31 @@
op_kwargs={
"input_schema": f"{DAGS_DIR}/schemas/{gcs_folder}/ror.json",
"table_name": f"{production_dataset}.ror",
"table_description": table_desc["ror"]
"table_description": table_desc["ror"],
},
python_callable=update_table_descriptions
python_callable=update_table_descriptions,
)

# Copy to backups
curr_date = datetime.now().strftime("%Y%m%d")
backup = BigQueryToBigQueryOperator(
task_id=f"snapshot_ror",
task_id="snapshot_ror",
source_project_dataset_tables=[f"{production_dataset}.ror"],
destination_project_dataset_table=f"{backup_dataset}.ror_{curr_date}",
create_disposition="CREATE_IF_NEEDED",
write_disposition="WRITE_TRUNCATE"
write_disposition="WRITE_TRUNCATE",
)

# Declare victory
success_alert = get_post_success("ROR update succeeded!", dag)

(clear_tmp_dir >> download_data >> load_staging >> checks >> load_production >> pop_descriptions >> backup >>
success_alert)
(
clear_tmp_dir
>> download_data
>> load_staging
>> checks
>> load_production
>> pop_descriptions
>> backup
>> success_alert
)
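
The last hunk only reformats the dependency chain; the task graph is unchanged, since Airflow's >> operator builds the same edges whether the expression sits on one line or is wrapped in parentheses, and passing a list (here checks) fans the chain out to both check tasks and back in at load_production. A minimal, self-contained sketch of that list-chaining pattern, assuming a recent Airflow 2.x where EmptyOperator is available (toy task names, not the real DAG):

# Both check tasks run after load_staging and must succeed before load_production.
from datetime import datetime

from airflow import DAG
from airflow.operators.empty import EmptyOperator

with DAG("chain_demo", start_date=datetime(2024, 1, 1), schedule_interval=None) as dag:
    load_staging = EmptyOperator(task_id="load_staging")
    checks = [EmptyOperator(task_id="check_a"), EmptyOperator(task_id="check_b")]
    load_production = EmptyOperator(task_id="load_production")

    load_staging >> checks >> load_production
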
18 changes: 12 additions & 6 deletions ror_scripts/fetch.py
@@ -1,11 +1,11 @@
import argparse
import json
import os
import requests
import tempfile
from zipfile import ZipFile

import requests
from google.cloud import storage
from zipfile import ZipFile


def fetch(output_bucket: str, output_loc: str) -> None:
@@ -16,7 +16,9 @@ def fetch(output_bucket: str, output_loc: str) -> None:
:param output_loc: blob name where data should be written on GCS
:return: None
"""
resp = requests.get("https://zenodo.org/api/records/?communities=ror-data&sort=mostrecent")
resp = requests.get(
"https://zenodo.org/api/records/?communities=ror-data&sort=mostrecent"
)
dataset_js = resp.json()
latest_delivery = dataset_js["hits"]["hits"][0]["files"][0]["links"]["self"]
zip_resp = requests.get(latest_delivery)
@@ -34,7 +36,7 @@ def fetch(output_bucket: str, output_loc: str) -> None:
js = json.loads(f.read())
with open(output_file, mode="w") as out:
for elt in js:
out.write(json.dumps(elt)+"\n")
out.write(json.dumps(elt) + "\n")
storage_client = storage.Client()
bucket = storage_client.bucket(output_bucket)
blob = bucket.blob(output_loc)
@@ -43,8 +45,12 @@ def fetch(output_bucket: str, output_loc: str) -> None:

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--output_bucket", help="GCS bucket where data should be written", required=True)
parser.add_argument("--output_loc", help="Blob name where data shuld be written", required=True)
parser.add_argument(
"--output_bucket", help="GCS bucket where data should be written", required=True
)
parser.add_argument(
"--output_loc", help="Blob name where data shuld be written", required=True
)
args = parser.parse_args()

fetch(args.output_bucket, args.output_loc)
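
fetch() looks up the most recent ROR data release on Zenodo, unzips it, rewrites the JSON dump as newline-delimited JSON, and uploads the result to the given GCS location. A hedged usage sketch, assuming google-cloud-storage credentials are configured and using placeholder bucket/blob values rather than anything defined in this repo:

# Hypothetical direct call; the CLI equivalent uses the argparse flags defined above:
#   python ror_scripts/fetch.py --output_bucket my-scratch-bucket --output_loc ror/tmp/ror.jsonl
from ror_scripts.fetch import fetch

fetch(output_bucket="my-scratch-bucket", output_loc="ror/tmp/ror.jsonl")
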
2 changes: 1 addition & 1 deletion schemas/ror.json
@@ -461,4 +461,4 @@
"name": "external_ids",
"type": "RECORD"
}
]
]