Skip to content

Commit

Permalink
Merge pull request #7 from gdifiore/investigate-cache
Browse files Browse the repository at this point in the history
Investigate & fix cache
  • Loading branch information
gdifiore authored Jul 26, 2024
2 parents a0d6c5e + b9db05d commit b1b9131
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 46 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,4 @@ Leave any comments or suggestions in [an issue](https://github.com/SummitCode/py
`pyball` is licensed under the [MIT license](https://github.com/SummitCode/pyball/blob/master/LICENSE)

## To-do
- I think the cache is broken? Or the lookup is slow, investigate.
- Would like to make a base class of shared functions (`_get_soup()`, `_find_table()`, ...), but I kinda hate how Python classes work.
78 changes: 34 additions & 44 deletions pyball/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,56 +4,33 @@
#
# Description: File containing various utility functions used in pyball

from functools import lru_cache, wraps
from datetime import datetime, timedelta
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
import time
import hashlib
import diskcache
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError


def timed_lru_cache(seconds: int, maxsize: int = 128):
"""
A decorator that combines LRU caching with a time-based expiration.
cache = diskcache.Cache('./.pyball_cache')

Args:
seconds (int): The number of seconds after which the cache should expire.
maxsize (int, optional): The maximum number of function calls to cache. Defaults to 128.

Returns:
function: The decorated function.
def fetch_url_content(url, cache_time=86400):
"""
def wrapper_cache(func):
func = lru_cache(maxsize=maxsize)(func)
func.lifetime = timedelta(seconds=seconds)
func.expiration = datetime.now() + func.lifetime

@wraps(func)
def wrapped_func(*args, **kwargs):
if datetime.now() >= func.expiration:
func.cache_clear()
func.expiration = datetime.now() + func.lifetime

return func(*args, **kwargs)

return wrapped_func

return wrapper_cache


@timed_lru_cache(seconds=86400) # Cache for 1 day
def read_url(url):
Function to read a url and return the BeautifulSoup object, using disk cache when available
"""
Function to read a url and return the html content using Playwright
# Create a unique key for this URL
url_hash = hashlib.md5(url.encode()).hexdigest()

Parameters
----------
url: String
The URL to read.
# Check if we have a valid cached version
cached_data = cache.get(url_hash)
if cached_data is not None:
timestamp, html = cached_data
if time.time() - timestamp < cache_time:
print("Using cached data")
return BeautifulSoup(html, "html.parser")

Returns
----------
BeautifulSoup object
Contains the HTML content of the URL.
"""
# If no valid cache, fetch the content
print("Fetching from URL")
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
page = browser.new_page()
Expand All @@ -74,12 +51,24 @@ def read_url(url):
browser.close()

if html:
soup = BeautifulSoup(html, "html.parser")
return soup
# Cache the new content
cache.set(url_hash, (time.time(), html))
return BeautifulSoup(html, "html.parser")
else:
return None


def read_url(url):
    """
    Function to read a url by delegating to fetch_url_content

    Parameters
    ----------
    url: String
        The URL to read.

    Returns
    ----------
    Whatever fetch_url_content returns for the url, or None if any
    exception was raised while fetching (the error is printed, not re-raised).
    """
    try:
        result = fetch_url_content(url)
    except Exception as err:
        # Best-effort wrapper: report the failure and fall through to None
        # so callers never see an exception from the fetch.
        print(f"Error fetching URL: {err}")
        return None
    return result


def make_bbref_player_url(bbref_key):
"""
Function to generate baseball-reference url from bbref_key
Expand Down Expand Up @@ -169,6 +158,7 @@ def make_savant_player_url(last, first, key_mlbam):

return url


def is_savant_url(url):
"""
Checks if the given string is a valid Baseball Savant url.
Expand All @@ -179,4 +169,4 @@ def is_savant_url(url):
Returns:
bool: True if the URL contains 'baseballsavant', False otherwise.
"""
return "baseballsavant" in url
return "baseballsavant" in url
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "pyball"
version = "1.4.1"
version = "1.4.2"
description = "Python3 library for obtaining baseball information"
authors = ["gdifiore"]
readme = "README.md"
Expand All @@ -18,6 +18,7 @@ bs4 = "^0.0.1"
requests = "^2.26.0"
playwright = "^1.45.0"
lxml = "^5.2.2"
diskcache = "^5.6.3"

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.1"
Expand Down

0 comments on commit b1b9131

Please sign in to comment.