Skip to content

Commit

Permalink
add some tests
Browse files Browse the repository at this point in the history
  • Loading branch information
sborms committed Oct 8, 2023
1 parent cf9ead5 commit 12955b4
Show file tree
Hide file tree
Showing 10 changed files with 104 additions and 17 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/cicd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ name: Run CI/CD pipeline

on:
push:
# branches: [ "main" ]
branches-ignore:
- "main"
branches: [ "main" ]
# branches-ignore:
# - "main"
pull_request:
branches: [ "main" ]
# allow manual trigger
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ test:
python -m pytest -vv

scrape:
@echo ">>> Scraping data from PCS"
@echo ">>> Scraping data from LZV Cup"
python ./scraper/main.py

push:
Expand Down
18 changes: 11 additions & 7 deletions scraper/main.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import os
import sys

import pandas as pd
from base import DataStorage
from logger import Logger
from parsers.lzvcup import LZVCupParser
from utils import ymd

DIR_SCRIPT = os.path.dirname(os.path.abspath(__file__))
DIR_LOGS = f"{DIR_SCRIPT}/logs/{ymd()}/" # create a new log directory for each day
sys.path.append(os.path.dirname(DIR_SCRIPT)) # so we can keep main.py in scraper/

from scraper.parsers.lzvcup import LZVCupParser
from scraper.utils.base import DataStorage
from scraper.utils.logger import Logger
from scraper.utils.utils import ymd

DIR_LOGS = f"{DIR_SCRIPT}/logs/{ymd()}/" # subdivide logs by day of script execution


def scrape(config, log_main):
Expand Down Expand Up @@ -45,7 +49,7 @@ def scrape(config, log_main):
)
dict_regions[region] = competitions

# get current competition standings and player statistics per team
# get current competition standings, player statistics, and team palmares
df_standings, df_stats, df_palmares = parser.parse_standings_and_stats(
dict_competitions=competitions["competitions"], region=region
)
Expand All @@ -68,7 +72,7 @@ def scrape(config, log_main):
df_palmares_all = pd.concat(list_palmares, axis=0)
df_sportshalls_all = pd.concat(list_sportshalls, axis=0)

# get historical player statistics
# get historical player statistics if enabled in config
if config["steps"]["historical_players"] is True:
log_main.info(f"Processing all historical player statistics")
df_stats_historical_players = LZVCupParser.parse_player_stats_history(
Expand Down
File renamed without changes.
12 changes: 9 additions & 3 deletions scraper/parsers/lzvcup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,19 @@
import numpy as np
import pandas as pd
import requests
from base import BaseScraper
from utils import add_columns_to_df, chunks

from scraper.utils.base import BaseScraper
from scraper.utils.utils import add_columns_to_df, chunks


class LZVCupParser(BaseScraper):
def __init__(self, config, **kwargs) -> None:
"""Config should minimally include: 'base_url', 'area_name', and 'area_url'."""
"""
The config dictionary should include 'base_url', 'area_name', and 'area_url'.
Keys from the config will be stored with a _ prefix as attributes of the class.
Additional keyword arguments won't have such prefix, except for the optional
logger= argument, which will be stored as self._logger.
"""
super().__init__(config, **kwargs)

# complete area url
Expand Down
Empty file added scraper/utils/__init__.py
Empty file.
7 changes: 5 additions & 2 deletions scraper/base.py → scraper/utils/base.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import json
import logging

import pandas as pd
import requests
import structlog
from bs4 import BeautifulSoup


Expand Down Expand Up @@ -38,13 +38,16 @@ def load_csv(dir):


class BaseScraper(DataStorage):
def __init__(self, config={}, logger=logging.getLogger()) -> None:
def __init__(self, config=None, logger=structlog.getLogger(), **kwargs) -> None:
    """Assigns config keys as separate attributes prefixed with _.

    Args:
        config: Mapping of settings; each key/value pair is stored on the
            instance as ``self._<key>``. Defaults to an empty mapping.
        logger: Logger stored as ``self._logger``.
        **kwargs: Extra attributes stored on the instance WITHOUT the _ prefix.
    """
    # Use a None sentinel instead of a mutable {} default: a literal dict
    # default would be a single object shared across every instantiation.
    config = config if config is not None else {}
    for name, value in config.items():
        setattr(self, f"_{name}", value)

    self._logger = logger

    for name, value in kwargs.items():
        setattr(self, name, value)

def convert_to_full_url(self, url_end):
"""Joins 'url_end' with self._base_url."""
if url_end[0] == "/":
Expand Down
2 changes: 1 addition & 1 deletion scraper/logger.py → scraper/utils/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def __init__(self, log_name, log_file, level) -> None:
# define basic configuration
structlog.configure(
processors=[
structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S"),
structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S", utc=False),
structlog.stdlib.add_log_level,
structlog.stdlib.add_logger_name,
structlog.processors.StackInfoRenderer(),
Expand Down
File renamed without changes.
74 changes: 74 additions & 0 deletions tests/test_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import pytest

from scraper.parsers.lzvcup import LZVCupParser

# Minimal config for the Vlaams-Brabant area (see LZVCupParser: it requires
# 'base_url', 'area_name', and 'area_url').
config_ = {
    "base_url": "https://www.lzvcup.be",
    "area_name": "VLAAMS BRABANT",
    "area_url": "results/5",  # presumably joined onto base_url — confirm in parser
}
# Module-level parser shared by all tests below; the extra region= keyword is
# stored as parser.region (kwargs become un-prefixed attributes on the parser).
# NOTE(review): these tests scrape the live lzvcup.be site, so they need
# network access and depend on current site content — confirm this is intended
# for the CI pipeline.
parser = LZVCupParser(config_, region="Regio Ring Oost")


def test_parse_region_cards_and_competitions_from_region_card():
    """Check the scraped region names and the keys of one region's competitions."""
    expected_regions = [
        "Regio Dames Oost-Brabant",
        "Regio Hageland",
        "Regio Leuven",
        "Regio Leuven Studenten",
        "Regio Pajottenland",
        "Regio Ring Noord",
        "Regio Ring Oost",
    ]

    region_cards = parser.parse_region_cards()
    assert list(region_cards.keys()) == expected_regions

    card = region_cards[parser.region]
    competitions = parser.parse_competitions_from_region_card(card)
    assert competitions.keys() == {"competitions", "sportshalls"}


def test_parse_standings_and_stats():
    """Scrape two competitions and check that all three result frames are non-empty."""
    klasse_1 = {
        "url": "https://www.lzvcup.be/results/5/16/1",
        "teams": {
            "ZVC Vollentip": "https://www.lzvcup.be/teams/detail/365",
            "Oppem Boys": "https://www.lzvcup.be/teams/detail/1242",
            "The Crows": "https://www.lzvcup.be/teams/detail/1971",
            "Eppegem City": "https://www.lzvcup.be/teams/detail/1970",
            "Tervuren United": "https://www.lzvcup.be/teams/detail/551",
        },
    }
    klasse_2 = {
        "url": "https://www.lzvcup.be/results/5/16/2",
        "teams": {
            "Aston Birra": "https://www.lzvcup.be/teams/detail/2001",
            "ZVC Copains": "https://www.lzvcup.be/teams/detail/1525",
            "Chiro Mik Mak": "https://www.lzvcup.be/teams/detail/1526",
            "FC Degradé": "https://www.lzvcup.be/teams/detail/2002",
            "The Blinders": "https://www.lzvcup.be/teams/detail/1605",
        },
    }
    dict_competitions = {"1e Klasse": klasse_1, "2e Klasse": klasse_2}

    standings, stats, palmares = parser.parse_standings_and_stats(
        dict_competitions=dict_competitions, region=parser.region
    )

    # each scrape must have produced at least one row
    for frame in (standings, stats, palmares):
        assert len(frame) > 0


@pytest.mark.parametrize(
    "url, region",
    [("https://www.lzvcup.be/sportshalls/16", "Regio Ring Oost")],
)
def test_parse_sporthalls(url, region):
    """Scrape one region's sportshalls page and check the result is non-empty."""
    sportshalls = parser.parse_sporthalls(url_sportshalls=url, region=region)

    assert len(sportshalls) > 0

0 comments on commit 12955b4

Please sign in to comment.