Skip to content

Commit

Permalink
add some tests
Browse files Browse the repository at this point in the history
  • Loading branch information
sborms committed Oct 8, 2023
1 parent cf9ead5 commit 12955b4
Show file tree
Hide file tree
Showing 10 changed files with 104 additions and 17 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/cicd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ name: Run CI/CD pipeline

on:
push:
# branches: [ "main" ]
branches-ignore:
- "main"
branches: [ "main" ]
# branches-ignore:
# - "main"
pull_request:
branches: [ "main" ]
# allow manual trigger
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ test:
python -m pytest -vv

scrape:
@echo ">>> Scraping data from PCS"
@echo ">>> Scraping data from LZV Cup"
python ./scraper/main.py

push:
Expand Down
18 changes: 11 additions & 7 deletions scraper/main.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import os
import sys

import pandas as pd
from base import DataStorage
from logger import Logger
from parsers.lzvcup import LZVCupParser
from utils import ymd

DIR_SCRIPT = os.path.dirname(os.path.abspath(__file__))
DIR_LOGS = f"{DIR_SCRIPT}/logs/{ymd()}/" # create a new log directory for each day
sys.path.append(os.path.dirname(DIR_SCRIPT)) # so we can keep main.py in scraper/

from scraper.parsers.lzvcup import LZVCupParser
from scraper.utils.base import DataStorage
from scraper.utils.logger import Logger
from scraper.utils.utils import ymd

DIR_LOGS = f"{DIR_SCRIPT}/logs/{ymd()}/" # subdivide logs by day of script execution


def scrape(config, log_main):
Expand Down Expand Up @@ -45,7 +49,7 @@ def scrape(config, log_main):
)
dict_regions[region] = competitions

# get current competition standings and player statistics per team
# get current competition standings, player statistics, and team palmares
df_standings, df_stats, df_palmares = parser.parse_standings_and_stats(
dict_competitions=competitions["competitions"], region=region
)
Expand All @@ -68,7 +72,7 @@ def scrape(config, log_main):
df_palmares_all = pd.concat(list_palmares, axis=0)
df_sportshalls_all = pd.concat(list_sportshalls, axis=0)

# get historical player statistics
# get historical player statistics if enabled in config
if config["steps"]["historical_players"] is True:
log_main.info(f"Processing all historical player statistics")
df_stats_historical_players = LZVCupParser.parse_player_stats_history(
Expand Down
File renamed without changes.
12 changes: 9 additions & 3 deletions scraper/parsers/lzvcup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,19 @@
import numpy as np
import pandas as pd
import requests
from base import BaseScraper
from utils import add_columns_to_df, chunks

from scraper.utils.base import BaseScraper
from scraper.utils.utils import add_columns_to_df, chunks


class LZVCupParser(BaseScraper):
def __init__(self, config, **kwargs) -> None:
"""Config should minimally include: 'base_url', 'area_name', and 'area_url'."""
"""
The config dictionary should include 'base_url', 'area_name', and 'area_url'.
Keys from the config will be stored with a _ prefix as attributes of the class.
Additional keyword arguments won't have such prefix, except for the optional
logger= argument, which will be stored as self._logger.
"""
super().__init__(config, **kwargs)

# complete area url
Expand Down
Empty file added scraper/utils/__init__.py
Empty file.
7 changes: 5 additions & 2 deletions scraper/base.py → scraper/utils/base.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import json
import logging

import pandas as pd
import requests
import structlog
from bs4 import BeautifulSoup


Expand Down Expand Up @@ -38,13 +38,16 @@ def load_csv(dir):


class BaseScraper(DataStorage):
def __init__(self, config={}, logger=logging.getLogger()) -> None:
def __init__(self, config=None, logger=structlog.getLogger(), **kwargs) -> None:
    """Assigns config keys as separate attributes prefixed with _.

    Args:
        config: Mapping of settings; each key/value pair is stored on the
            instance as ``self._<key>``. Defaults to an empty mapping.
        logger: Logger stored as ``self._logger``.
        **kwargs: Extra attributes stored on the instance WITHOUT the _ prefix.
    """
    # Use a None sentinel instead of a mutable {} default: a literal dict
    # default would be a single object shared across every instantiation.
    config = config if config is not None else {}
    for name, value in config.items():
        setattr(self, f"_{name}", value)

    self._logger = logger

    for name, value in kwargs.items():
        setattr(self, name, value)

def convert_to_full_url(self, url_end):
"""Joins 'url_end' with self._base_url."""
if url_end[0] == "/":
Expand Down
2 changes: 1 addition & 1 deletion scraper/logger.py → scraper/utils/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def __init__(self, log_name, log_file, level) -> None:
# define basic configuration
structlog.configure(
processors=[
structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S"),
structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S", utc=False),
structlog.stdlib.add_log_level,
structlog.stdlib.add_logger_name,
structlog.processors.StackInfoRenderer(),
Expand Down
File renamed without changes.
74 changes: 74 additions & 0 deletions tests/test_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import pytest

from scraper.parsers.lzvcup import LZVCupParser

# Minimal config for the Vlaams-Brabant area (see LZVCupParser: it requires
# 'base_url', 'area_name', and 'area_url').
config_ = {
    "base_url": "https://www.lzvcup.be",
    "area_name": "VLAAMS BRABANT",
    "area_url": "results/5",  # presumably joined onto base_url — confirm in parser
}
# Module-level parser shared by all tests below; the extra region= keyword is
# stored as parser.region (kwargs become un-prefixed attributes on the parser).
# NOTE(review): these tests scrape the live lzvcup.be site, so they need
# network access and depend on current site content — confirm this is intended
# for the CI pipeline.
parser = LZVCupParser(config_, region="Regio Ring Oost")


def test_parse_region_cards_and_competitions_from_region_card():
    """Check the scraped region names and the keys of one region's competitions."""
    expected_regions = [
        "Regio Dames Oost-Brabant",
        "Regio Hageland",
        "Regio Leuven",
        "Regio Leuven Studenten",
        "Regio Pajottenland",
        "Regio Ring Noord",
        "Regio Ring Oost",
    ]

    region_cards = parser.parse_region_cards()
    assert list(region_cards.keys()) == expected_regions

    card = region_cards[parser.region]
    competitions = parser.parse_competitions_from_region_card(card)
    assert competitions.keys() == {"competitions", "sportshalls"}


def test_parse_standings_and_stats():
    """Scrape two competitions and check that all three result frames are non-empty."""
    klasse_1 = {
        "url": "https://www.lzvcup.be/results/5/16/1",
        "teams": {
            "ZVC Vollentip": "https://www.lzvcup.be/teams/detail/365",
            "Oppem Boys": "https://www.lzvcup.be/teams/detail/1242",
            "The Crows": "https://www.lzvcup.be/teams/detail/1971",
            "Eppegem City": "https://www.lzvcup.be/teams/detail/1970",
            "Tervuren United": "https://www.lzvcup.be/teams/detail/551",
        },
    }
    klasse_2 = {
        "url": "https://www.lzvcup.be/results/5/16/2",
        "teams": {
            "Aston Birra": "https://www.lzvcup.be/teams/detail/2001",
            "ZVC Copains": "https://www.lzvcup.be/teams/detail/1525",
            "Chiro Mik Mak": "https://www.lzvcup.be/teams/detail/1526",
            "FC Degradé": "https://www.lzvcup.be/teams/detail/2002",
            "The Blinders": "https://www.lzvcup.be/teams/detail/1605",
        },
    }
    dict_competitions = {"1e Klasse": klasse_1, "2e Klasse": klasse_2}

    standings, stats, palmares = parser.parse_standings_and_stats(
        dict_competitions=dict_competitions, region=parser.region
    )

    # each scrape must have produced at least one row
    for frame in (standings, stats, palmares):
        assert len(frame) > 0


@pytest.mark.parametrize(
    "url, region",
    [("https://www.lzvcup.be/sportshalls/16", "Regio Ring Oost")],
)
def test_parse_sporthalls(url, region):
    """Scrape one region's sportshalls page and check the result is non-empty."""
    sportshalls = parser.parse_sporthalls(url_sportshalls=url, region=region)

    assert len(sportshalls) > 0

0 comments on commit 12955b4

Please sign in to comment.