valid h5 file after downloading #7

Merged (1 commit, Feb 18, 2025)

README.md (1 addition, 1 deletion)

@@ -8,7 +8,7 @@

# gediDB: A toolbox for Global Ecosystem Dynamics Investigation (GEDI) L2A-B and L4A-C data

[![Pipelines](https://github.com/simonbesnard1/gedidb/workflows/CI/badge.svg?branch=main)](https://github.com/simonbesnard1/gedidb/actions?query=workflow%3ACI)
[![Pipelines](https://github.com/simonbesnard1/gedidb/actions/workflows/ci.yaml/badge.svg)](https://github.com/simonbesnard1/gedidb/actions?query=workflow%3ACI)
[![Code coverage](https://codecov.io/gh/simonbesnard1/gedidb/branch/main/graph/badge.svg?flag=unittests)](https://codecov.io/gh/simonbesnard1/gedidb)
[![Docs](https://readthedocs.org/projects/gedidb/badge/?version=latest)](https://gedidb.readthedocs.io/en/latest/)
[![Formatted with black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black)

gedidb/downloader/data_downloader.py (42 additions, 14 deletions)

@@ -16,6 +16,7 @@
from collections import defaultdict
from retry import retry
from urllib3.exceptions import NewConnectionError
import h5py
from requests.exceptions import (
    HTTPError, ConnectionError, ChunkedEncodingError, Timeout, RequestException, ReadTimeout,
)
@@ -161,6 +162,7 @@ def _filter_granules_with_all_products(self, granules: dict) -> dict:

        return filtered_granules


class H5FileDownloader:
    """
    Downloader for HDF5 files from URLs, with resume and retry support,
@@ -169,7 +171,7 @@

    def __init__(self, download_path: str = "."):
        self.download_path = pathlib.Path(download_path)

    @retry(
        (ValueError, TypeError, HTTPError, ConnectionError, ChunkedEncodingError, Timeout, RequestException, OSError),
        tries=10,
@@ -188,9 +190,13 @@ def download(self, granule_key: str, url: str, product: str) -> Tuple[str, Tuple
        temp_path = granule_dir / f"{product.name}.h5.part"
        os.makedirs(granule_dir, exist_ok=True)

        # Check if file already exists
        # Check if file already exists and is valid
        if final_path.exists():
            return granule_key, (product.value, str(final_path))
            if self._is_hdf5_valid(final_path):
                return granule_key, (product.value, str(final_path))
            else:
                logger.warning(f"Corrupt HDF5 file detected: {final_path}. Deleting and retrying.")
                final_path.unlink()

        # Get the size of partially downloaded file
        downloaded_size = temp_path.stat().st_size if temp_path.exists() else 0
@@ -201,12 +207,17 @@ def download(self, granule_key: str, url: str, product: str) -> Tuple[str, Tuple

        try:
            partial_response = requests.get(url, headers=headers, stream=True, timeout=30)
            partial_response.raise_for_status() # Ensure HTTP errors are caught
            partial_response.raise_for_status()
            if "Content-Range" in partial_response.headers:
                total_size = int(partial_response.headers["Content-Range"].split("/")[-1])
                if downloaded_size == total_size:
                    temp_path.rename(final_path)
                    return granule_key, (product.value, str(final_path))
                    if self._is_hdf5_valid(final_path):
                        return granule_key, (product.value, str(final_path))
                    else:
                        logger.warning(f"Downloaded file {final_path} is corrupt. Deleting and retrying.")
                        final_path.unlink()
                        raise ValueError("Invalid HDF5 file after download.")
                headers["Range"] = f"bytes={downloaded_size}-"
            else:
                headers = {} # Server doesn't support Range requests
@@ -232,23 +243,40 @@ def download(self, granule_key: str, url: str, product: str) -> Tuple[str, Tuple
            # Validate final size
            final_downloaded_size = temp_path.stat().st_size
            if total_size is not None and final_downloaded_size != total_size:
                temp_path.unlink(missing_ok=True) # Clean up corrupt file
                raise ValueError("Downloaded final size mismatch with expected size")
                temp_path.unlink(missing_ok=True)
                raise ValueError("Downloaded final size mismatch with expected size.")

            # Rename to final name upon successful download
            temp_path.rename(final_path)

            # Validate the HDF5 file
            if not self._is_hdf5_valid(final_path):
                logger.warning(f"Downloaded file {final_path} is corrupt. Deleting and retrying.")
                final_path.unlink()
                raise ValueError("Invalid HDF5 file after download.")

            return granule_key, (product.value, str(final_path))

        except (HTTPError, ConnectionError, ChunkedEncodingError, ReadTimeout, OSError, ValueError) as e:
        except (HTTPError, ConnectionError, ChunkedEncodingError, Timeout, OSError, ValueError) as e:
            if isinstance(e, OSError) and e.errno == 24:
                logger.error(f"Too many open files for {product.name} of the granule {granule_key}: {e}. Retrying...")
                logger.error(f"Too many open files for {product} of the granule {granule_key}: {e}. Retrying...")
                time.sleep(5)
            logger.error(f"Error encountered for {product.name} of the granule {granule_key}: {e}. Retrying...")
            raise # This is what triggers the retry

            logger.error(f"Error encountered for {product} of the granule {granule_key}: {e}. Retrying...")
            raise # This triggers the retry mechanism

        except Exception as e:
            logger.error(f"Download failed after all retries for {product.name} of the granule {granule_key}: {e}")
            logger.error(f"Download failed after all retries for {product} of the granule {granule_key}: {e}")
            if temp_path.exists():
                temp_path.unlink()
            return granule_key, (product.value, None)
            return granule_key, (product.value, None)

    def _is_hdf5_valid(self, file_path: pathlib.Path) -> bool:
        """
        Check if an HDF5 file is valid and can be opened.
        """
        try:
            with h5py.File(file_path, "r") as f:
                return True # File is valid
        except OSError:
            return False # File is corrupt or not a valid HDF5
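
For readers skimming the diff: the new validity check simply tries to open the file with h5py and treats any OSError as corruption. Below is a minimal standalone sketch of the same idea; the helper name, file path, and usage are illustrative assumptions, not part of gediDB's API.

```python
import pathlib

import h5py  # same dependency the PR adds to data_downloader.py


def is_hdf5_valid(file_path: pathlib.Path) -> bool:
    """Return True if the file opens as HDF5, False if it is truncated or not HDF5."""
    try:
        with h5py.File(file_path, "r"):
            return True
    except OSError:
        # h5py raises OSError for truncated downloads and non-HDF5 content
        return False


# Hypothetical usage: remove a corrupt granule so a later download attempt starts clean
granule = pathlib.Path("granule_dir/L2A.h5")  # placeholder path
if granule.exists() and not is_hdf5_valid(granule):
    granule.unlink()
```

This mirrors the `_is_hdf5_valid` method above: in the PR, a failed open leads to deleting the file and raising `ValueError`, which the `@retry` decorator turns into a fresh download attempt.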