diff --git a/README.md b/README.md index 1f9f94e..85b26f2 100644 --- a/README.md +++ b/README.md @@ -32,5 +32,4 @@ Leave any comments or suggestions in [an issue](https://github.com/SummitCode/py `pyball` is licensed under the [MIT license](https://github.com/SummitCode/pyball/blob/master/LICENSE) ## To-do -- I think the cache is broken? Or the lookup is slow, investigate. - Would like to make a base class of shared functions (_get_soup(), _find_table(), ...) but I kinda hate how python classes work. \ No newline at end of file diff --git a/pyball/utils.py b/pyball/utils.py index afc6aba..9a9f495 100644 --- a/pyball/utils.py +++ b/pyball/utils.py @@ -4,56 +4,33 @@ # # Description: File containing various utility functions used in pyball -from functools import lru_cache, wraps -from datetime import datetime, timedelta -from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError +import time +import hashlib +import diskcache from bs4 import BeautifulSoup +from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError -def timed_lru_cache(seconds: int, maxsize: int = 128): - """ - A decorator that combines LRU caching with a time-based expiration. +cache = diskcache.Cache('./.pyball_cache') - Args: - seconds (int): The number of seconds after which the cache should expire. - maxsize (int, optional): The maximum number of function calls to cache. Defaults to 128. - Returns: - function: The decorated function. +def fetch_url_content(url, cache_time=86400): """ - def wrapper_cache(func): - func = lru_cache(maxsize=maxsize)(func) - func.lifetime = timedelta(seconds=seconds) - func.expiration = datetime.now() + func.lifetime - - @wraps(func) - def wrapped_func(*args, **kwargs): - if datetime.now() >= func.expiration: - func.cache_clear() - func.expiration = datetime.now() + func.lifetime - - return func(*args, **kwargs) - - return wrapped_func - - return wrapper_cache - - -@timed_lru_cache(seconds=86400) # Cache for 1 day -def read_url(url): + Function to read a url and return the BeautifulSoup object, using disk cache when available """ - Function to read a url and return the html content using Playwright + # Create a unique key for this URL + url_hash = hashlib.md5(url.encode()).hexdigest() - Parameters - ---------- - url: String - The URL to read. + # Check if we have a valid cached version + cached_data = cache.get(url_hash) + if cached_data is not None: + timestamp, html = cached_data + if time.time() - timestamp < cache_time: + print("Using cached data") + return BeautifulSoup(html, "html.parser") - Returns - ---------- - BeautifulSoup object - Contains the HTML content of the URL. - """ + # If no valid cache, fetch the content + print("Fetching from URL") with sync_playwright() as p: browser = p.chromium.launch(headless=True) page = browser.new_page() @@ -74,12 +51,24 @@ def read_url(url): browser.close() if html: - soup = BeautifulSoup(html, "html.parser") - return soup + # Cache the new content + cache.set(url_hash, (time.time(), html)) + return BeautifulSoup(html, "html.parser") else: return None +def read_url(url): + """ + Function to read a url, using cache when available + """ + try: + return fetch_url_content(url) + except Exception as e: + print(f"Error fetching URL: {e}") + return None + + def make_bbref_player_url(bbref_key): """ Function to generate baseball-reference url from bbref_key @@ -169,6 +158,7 @@ def make_savant_player_url(last, first, key_mlbam): return url + def is_savant_url(url): """ Checks if the given string is a valid Baseball Savant url. @@ -179,4 +169,4 @@ def is_savant_url(url): Returns: bool: True if the URL contains 'baseballsavant', False otherwise. """ - return "baseballsavant" in url \ No newline at end of file + return "baseballsavant" in url diff --git a/pyproject.toml b/pyproject.toml index 45214f2..902b47e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "pyball" -version = "1.4.1" +version = "1.4.2" description = "Python3 library for obtaining baseball information" authors = ["gdifiore"] readme = "README.md" @@ -18,6 +18,7 @@ bs4 = "^0.0.1" requests = "^2.26.0" playwright = "^1.45.0" lxml = "^5.2.2" +diskcache = "^5.6.3" [tool.poetry.group.dev.dependencies] pytest = "^8.3.1"