give the ability to cache specific status_codes
JessyBarrette committed Apr 17, 2024
1 parent 934efb8 commit ae9c3d0
Showing 4 changed files with 62 additions and 29 deletions.
2 changes: 1 addition & 1 deletion harvest_config.sample.yaml
@@ -6,7 +6,7 @@ erddaps:
dataset_ids: []

# set to true to test with caching. Not available in production docker compose file
# defaults to false
# defaults to false or a list of status_codes to cache (e.g. [200, 404])
cache_requests: true

# defaults to 'harvest'
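With this change, `cache_requests` accepts either a boolean or a list of HTTP status codes. A minimal sketch of how the two forms parse, assuming PyYAML is available (the snippet is illustrative, not part of the repo):

```python
# Both value shapes cache_requests now accepts, loaded the way the harvester
# would see them. yaml.safe_load yields exactly the bool-or-list union that
# the caching code checks for.
import yaml

boolean_form = yaml.safe_load("cache_requests: true")
list_form = yaml.safe_load("cache_requests: [200, 404]")

assert boolean_form["cache_requests"] is True
assert list_form["cache_requests"] == [200, 404]
```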
33 changes: 20 additions & 13 deletions harvester/cde_harvester/ERDDAP.py
@@ -97,6 +97,25 @@ def parse_erddap_dates(series):

    def erddap_csv_to_df(self, url, skiprows=[1], dataset=None):
        """If there's an error in the request, it raises up to the dataset loop, so this dataset gets skipped"""

        def _cache_request(url):
            # Reuse a cached response only if caching is enabled outright
            # (True) or the cached status code is in the configured list;
            # otherwise fall through and refresh the cache entry.
            if self.cache_requests and url in self.cache:
                logger.debug("load CACHE")
                response = self.cache[url]
                if self.cache_requests is True:
                    return response
                elif (
                    isinstance(self.cache_requests, list)
                    and response.status_code in self.cache_requests
                ):
                    return response

            response = self.session.get(url, timeout=3600)
            if self.cache_requests:
                logger.debug("save CACHE")
                self.cache[url] = response
            return response

        if dataset:
            erddap_url = dataset.erddap_url
        else:
@@ -106,19 +125,7 @@ def erddap_csv_to_df(self, url, skiprows=[1], dataset=None):

        self.logger.debug(unquote(url_combined))

        response = None
        if self.cache_requests:
            cache = self.cache
            if url_combined in self.cache:
                logger.debug("load CACHE")
                response = cache[url_combined]
            else:
                self.logger.debug("miss CACHE")
                response = self.session.get(url_combined, timeout=3600)
                cache[url_combined] = response
        else:
            response = self.session.get(url_combined, timeout=3600)

        response = _cache_request(url_combined)
        if len(response.content) > MAX_RESPONSE_SIZE:
            raise RuntimeError("Response too big")
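The new `_cache_request` helper changes when a cached response may be reused: always when `cache_requests` is `True`, and only for listed status codes when it is a list; anything else is refetched and the fresh response overwrites the cache entry. A self-contained sketch of that decision (the function name is illustrative, not the project's API):

```python
# Illustrative reimplementation of the reuse decision made by _cache_request.
from typing import List, Union

def should_serve_from_cache(
    cache_requests: Union[bool, List[int]], cached_status: int
) -> bool:
    # True caches everything; a list restricts reuse to those status codes.
    if cache_requests is True:
        return True
    return isinstance(cache_requests, list) and cached_status in cache_requests

assert should_serve_from_cache(True, 500)
assert should_serve_from_cache([200, 404], 404)
assert not should_serve_from_cache([200, 404], 500)  # stale entry is refetched
assert not should_serve_from_cache(False, 200)  # caching disabled
```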

54 changes: 40 additions & 14 deletions harvester/cde_harvester/__main__.py
@@ -67,6 +67,7 @@ def review_standard_names_not_supported(standard_names: list):
        unsupported_standard_names,
    )


def cleanup_datasets_table(datasets):
    logger.info("Cleaning up data")
    datasets = datasets.replace(np.nan, None)
@@ -99,6 +100,7 @@ def cleanup_datasets_table(datasets):
    datasets = datasets.replace(r"\n", " ", regex=True)
    return datasets


@monitor(monitor_slug="main-harvester")
def main(erddaps, cache_requests, folder: Path, max_workers: int):

@@ -173,7 +175,13 @@ def worker():
    )

    # write files to disk
    logger.info("Writing data to files: {}, {}, {}, {}", datasets_file, profiles_file, ckan_file, skipped_datasets_file)
    logger.info(
        "Writing data to files: {}, {}, {}, {}",
        datasets_file,
        profiles_file,
        ckan_file,
        skipped_datasets_file,
    )
    datasets.drop_duplicates(["erddap_url", "dataset_id"]).to_csv(
        datasets_file, index=False
    )
@@ -218,13 +226,22 @@ def load_config(config_file):
default="",
)
@click.option(
"--cache-requests/--no-cache-requests", "--cache/--no-cache", help="Cache requests, for testing only", default=True
"--cache-requests/--no-cache-requests",
"--cache/--no-cache",
help="Cache requests, for testing only",
default=None,
)
@click.option(
"--cache-requests-status-codes",
help="Cache requests with these status codes, comma separated list of integers",
type=str,
default=None,
)
@click.option(
"--folder",
help="Folder to save harvested data to",
default=Path("harvest"),
type=click.Path(dir_okay=True, file_okay=False)
type=click.Path(dir_okay=True, file_okay=False),
)
@click.option(
"--log-level",
@@ -251,19 +268,28 @@ def load_config(config_file):
@logger.catch(reraise=True, message="Harvester failed!!!")
def cli(**kwargs):
    """Harvest ERDDAP datasets and profiles and save to CSV files"""
    config = kwargs.pop("config")
    config = kwargs.pop("config", {})
    cache_requests_status_code = kwargs.pop("cache_requests_status_codes")
    if config:
        config = load_config(config)
        if erddap_urls := kwargs.pop("erddap_urls"):
            config["erddaps"] = [
                {"url": erddap_url} for erddap_url in erddap_urls.split(",")
            ]
        if dataset_ids := kwargs.pop("dataset_ids"):
            dataset_ids = dataset_ids.split(",")
            for id, _ in enumerate(config["erddaps"]):
                config["erddaps"][id]["dataset_ids"] = dataset_ids

        config.update(kwargs)
    else:
        config = {}
    if erddap_urls := kwargs.pop("erddap_urls"):
        config["erddaps"] = [
            {"url": erddap_url} for erddap_url in erddap_urls.split(",")
        ]
    if dataset_ids := kwargs.pop("dataset_ids"):
        dataset_ids = dataset_ids.split(",")
        for id, _ in enumerate(config["erddaps"]):
            config["erddaps"][id]["dataset_ids"] = dataset_ids

    config.update(kwargs)
    cache_requests_status_code = kwargs.pop("cache_requests_status_codes")

    if cache_requests_status_code:
        config["cache_requests"] = [
            int(x) for x in cache_requests_status_code.split(",")
        ]
    setup_logging(config.pop("log_level"))
    main(**config)
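On the CLI side, the new `--cache-requests-status-codes` flag takes a comma-separated string and, when present, replaces the `cache_requests` value in the config with a list of ints. A minimal sketch of that precedence (a hypothetical helper, not the project's code):

```python
# How the comma-separated CLI value is expected to fold into the config:
# an explicit status-code list takes precedence over the on/off flag.
from typing import List, Optional, Union

def resolve_cache_requests(
    cache_requests: Optional[bool], status_codes: Optional[str]
) -> Union[bool, List[int]]:
    if status_codes:
        return [int(x) for x in status_codes.split(",")]
    return bool(cache_requests)

assert resolve_cache_requests(True, None) is True
assert resolve_cache_requests(None, "200,404") == [200, 404]
```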

2 changes: 1 addition & 1 deletion harvester/cde_harvester/harvest_erddap.py
@@ -96,7 +96,7 @@ def harvest_erddap(erddap_conn, result, cache_requests=False):

    hostname = urlparse(erddap_url).hostname
    datasets_to_skip = get_datasets_to_skip().get(hostname, [])

    def skipped_reason(code):
        return [[erddap.domain, dataset_id, code]]

