give the ability to cache specific status_codes
JessyBarrette committed Apr 17, 2024
1 parent 934efb8 commit ae9c3d0
Showing 4 changed files with 62 additions and 29 deletions.
2 changes: 1 addition & 1 deletion harvest_config.sample.yaml
@@ -6,7 +6,7 @@ erddaps:
dataset_ids: []

# set to true to test with caching. Not available in production docker compose file
# defaults to false
# defaults to false or a list of status_codes to cache (e.g. [200, 404])
cache_requests: true

# defaults to 'harvest'
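With this change, `cache_requests` accepts either a boolean or a list of HTTP status codes. A minimal sketch of how the two forms parse, assuming PyYAML is available (the snippet is illustrative, not part of the repo):

```python
# Both value shapes cache_requests now accepts, loaded the way the harvester
# would see them. yaml.safe_load yields exactly the bool-or-list union that
# the caching code checks for.
import yaml

boolean_form = yaml.safe_load("cache_requests: true")
list_form = yaml.safe_load("cache_requests: [200, 404]")

assert boolean_form["cache_requests"] is True
assert list_form["cache_requests"] == [200, 404]
```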
33 changes: 20 additions & 13 deletions harvester/cde_harvester/ERDDAP.py
@@ -97,6 +97,25 @@ def parse_erddap_dates(series):

    def erddap_csv_to_df(self, url, skiprows=[1], dataset=None):
        """If there's an error in the request, it raises up to the dataset loop, so this dataset gets skipped"""

        def _cache_request(url):
            # Reuse a cached response only if caching is enabled outright
            # (True) or the cached status code is in the configured list;
            # otherwise fall through and refresh the cache entry.
            if self.cache_requests and url in self.cache:
                logger.debug("load CACHE")
                response = self.cache[url]
                if self.cache_requests is True:
                    return response
                elif (
                    isinstance(self.cache_requests, list)
                    and response.status_code in self.cache_requests
                ):
                    return response

            response = self.session.get(url, timeout=3600)
            if self.cache_requests:
                logger.debug("save CACHE")
                self.cache[url] = response
            return response

        if dataset:
            erddap_url = dataset.erddap_url
        else:
@@ -106,19 +125,7 @@ def erddap_csv_to_df(self, url, skiprows=[1], dataset=None):

        self.logger.debug(unquote(url_combined))

        response = None
        if self.cache_requests:
            cache = self.cache
            if url_combined in self.cache:
                logger.debug("load CACHE")
                response = cache[url_combined]
            else:
                self.logger.debug("miss CACHE")
                response = self.session.get(url_combined, timeout=3600)
                cache[url_combined] = response
        else:
            response = self.session.get(url_combined, timeout=3600)

        response = _cache_request(url_combined)
        if len(response.content) > MAX_RESPONSE_SIZE:
            raise RuntimeError("Response too big")
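The new `_cache_request` helper changes when a cached response may be reused: always when `cache_requests` is `True`, and only for listed status codes when it is a list; anything else is refetched and the fresh response overwrites the cache entry. A self-contained sketch of that decision (the function name is illustrative, not the project's API):

```python
# Illustrative reimplementation of the reuse decision made by _cache_request.
from typing import List, Union

def should_serve_from_cache(
    cache_requests: Union[bool, List[int]], cached_status: int
) -> bool:
    # True caches everything; a list restricts reuse to those status codes.
    if cache_requests is True:
        return True
    return isinstance(cache_requests, list) and cached_status in cache_requests

assert should_serve_from_cache(True, 500)
assert should_serve_from_cache([200, 404], 404)
assert not should_serve_from_cache([200, 404], 500)  # stale entry is refetched
assert not should_serve_from_cache(False, 200)  # caching disabled
```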

54 changes: 40 additions & 14 deletions harvester/cde_harvester/__main__.py
@@ -67,6 +67,7 @@ def review_standard_names_not_supported(standard_names: list):
        unsupported_standard_names,
    )


def cleanup_datasets_table(datasets):
    logger.info("Cleaning up data")
    datasets = datasets.replace(np.nan, None)
@@ -99,6 +100,7 @@ def cleanup_datasets_table(datasets):
    datasets = datasets.replace(r"\n", " ", regex=True)
    return datasets


@monitor(monitor_slug="main-harvester")
def main(erddaps, cache_requests, folder: Path, max_workers: int):

@@ -173,7 +175,13 @@ def worker():
    )

    # write files to disk
    logger.info("Writing data to files: {}, {}, {}, {}", datasets_file, profiles_file, ckan_file, skipped_datasets_file)
    logger.info(
        "Writing data to files: {}, {}, {}, {}",
        datasets_file,
        profiles_file,
        ckan_file,
        skipped_datasets_file,
    )
    datasets.drop_duplicates(["erddap_url", "dataset_id"]).to_csv(
        datasets_file, index=False
    )
@@ -218,13 +226,22 @@ def load_config(config_file):
default="",
)
@click.option(
"--cache-requests/--no-cache-requests", "--cache/--no-cache", help="Cache requests, for testing only", default=True
"--cache-requests/--no-cache-requests",
"--cache/--no-cache",
help="Cache requests, for testing only",
default=None,
)
@click.option(
"--cache-requests-status-codes",
help="Cache requests with these status codes, comma separated list of integers",
type=str,
default=None,
)
@click.option(
"--folder",
help="Folder to save harvested data to",
default=Path("harvest"),
type=click.Path(dir_okay=True, file_okay=False)
type=click.Path(dir_okay=True, file_okay=False),
)
@click.option(
"--log-level",
@@ -251,19 +268,28 @@ def load_config(config_file):
@logger.catch(reraise=True, message="Harvester failed!!!")
def cli(**kwargs):
    """Harvest ERDDAP datasets and profiles and save to CSV files"""
    config = kwargs.pop("config")
    config = kwargs.pop("config", {})
    cache_requests_status_code = kwargs.pop("cache_requests_status_codes")
    if config:
        config = load_config(config)
        if erddap_urls := kwargs.pop("erddap_urls"):
            config["erddaps"] = [
                {"url": erddap_url} for erddap_url in erddap_urls.split(",")
            ]
        if dataset_ids := kwargs.pop("dataset_ids"):
            dataset_ids = dataset_ids.split(",")
            for id, _ in enumerate(config["erddaps"]):
                config["erddaps"][id]["dataset_ids"] = dataset_ids

        config.update(kwargs)
    else:
        config = {}
    if erddap_urls := kwargs.pop("erddap_urls"):
        config["erddaps"] = [
            {"url": erddap_url} for erddap_url in erddap_urls.split(",")
        ]
    if dataset_ids := kwargs.pop("dataset_ids"):
        dataset_ids = dataset_ids.split(",")
        for id, _ in enumerate(config["erddaps"]):
            config["erddaps"][id]["dataset_ids"] = dataset_ids

    config.update(kwargs)
    cache_requests_status_code = kwargs.pop("cache_requests_status_codes")

    if cache_requests_status_code:
        config["cache_requests"] = [
            int(x) for x in cache_requests_status_code.split(",")
        ]
    setup_logging(config.pop("log_level"))
    main(**config)
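On the CLI side, the new `--cache-requests-status-codes` flag takes a comma-separated string and, when present, replaces the `cache_requests` value in the config with a list of ints. A minimal sketch of that precedence (a hypothetical helper, not the project's code):

```python
# How the comma-separated CLI value is expected to fold into the config:
# an explicit status-code list takes precedence over the on/off flag.
from typing import List, Optional, Union

def resolve_cache_requests(
    cache_requests: Optional[bool], status_codes: Optional[str]
) -> Union[bool, List[int]]:
    if status_codes:
        return [int(x) for x in status_codes.split(",")]
    return bool(cache_requests)

assert resolve_cache_requests(True, None) is True
assert resolve_cache_requests(None, "200,404") == [200, 404]
```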

2 changes: 1 addition & 1 deletion harvester/cde_harvester/harvest_erddap.py
@@ -96,7 +96,7 @@ def harvest_erddap(erddap_conn, result, cache_requests=False):

    hostname = urlparse(erddap_url).hostname
    datasets_to_skip = get_datasets_to_skip().get(hostname, [])

    def skipped_reason(code):
        return [[erddap.domain, dataset_id, code]]

