Skip to content

Commit

Permalink
Merge pull request #11 from simonbesnard1/10-improve-reading-speed
Browse files Browse the repository at this point in the history
10 improve reading speed
  • Loading branch information
simonbesnard1 authored Feb 25, 2025
2 parents f365a66 + 251cb2a commit 3dd5992
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 20 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<p align="center">
<a href="https://github.com/simonbesnard1/gedidb.git">
<img src="https://github.com/simonbesnard1/gedidb/blob/main/doc/_static/logos/gediDB_logo.svg"
<img src="https://raw.githubusercontent.com/simonbesnard1/gedidb/main/doc/_static/logos/gediDB_logo.svg"
alt="gediDB Logo" height="200px" hspace="0px" vspace="30px" align="left">
</a>
</p>
Expand Down
2 changes: 0 additions & 2 deletions doc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@ gediDB Documentation
Discussions <https://github.com/simonbesnard1/gedidb/discussions>
Development <user/contributing>

**Version**: |version|

**gediDB** is an open-source Python package designed to simplify working with GEDI L2A-B and L4A-C datasets, streamlining data analysis for efficient data exploration. It offers intuitive modules for processing, querying, and analyzing GEDI data stored in **tileDB databases**.

.. grid:: 1 1 2 2
Expand Down
6 changes: 3 additions & 3 deletions gedidb/core/gedidatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,19 +406,19 @@ def _create_domain(self) -> tiledb.Domain:
tiledb.Dim(
"latitude",
domain=(lat_min, lat_max),
tile=self.config.get("tiledb", {}).get("latitude_tile", 1),
tile=self.config.get("tiledb", {}).get("latitude_tile", 0.5),
dtype="float64",
),
tiledb.Dim(
"longitude",
domain=(lon_min, lon_max),
tile=self.config.get("tiledb", {}).get("longitude_tile", 1),
tile=self.config.get("tiledb", {}).get("longitude_tile", 0.5),
dtype="float64",
),
tiledb.Dim(
"time",
domain=(time_min, time_max),
tile=self.config.get("tiledb", {}).get("time_tile", 1825),
tile=self.config.get("tiledb", {}).get("time_tile", 365),
dtype="int64",
),
]
Expand Down
19 changes: 9 additions & 10 deletions gedidb/core/gediprovider.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,12 @@ def get_data(
f"Invalid query_type '{query_type}'. Must be 'bounding_box' or 'nearest'."
)

# Ensure return_type is valid
if return_type not in {"xarray", "dataframe"}:
raise ValueError(
f"Invalid return_type '{return_type}'. Must be either 'xarray' or 'dataframe'."
)

# Validation for bounding_box queries
if query_type == "bounding_box":
if geometry is None or not isinstance(geometry, gpd.GeoDataFrame):
Expand Down Expand Up @@ -361,25 +367,16 @@ def get_data(
scalar_data, profile_vars = self.query_data(
variables, geometry, start_time, end_time, **quality_filters
)
else:
raise ValueError(
"Invalid query_type. Must be either 'nearest' or 'bounding_box'."
)

if not scalar_data:
logger.info("No data found for specified criteria.")
return None

metadata = self.get_available_variables()

if return_type == "xarray":
metadata = self.get_available_variables()
return self.to_xarray(scalar_data, metadata, profile_vars)
elif return_type == "dataframe":
return self.to_dataframe(scalar_data)
else:
raise ValueError(
"Invalid return_type. Must be either 'xarray' or 'dataframe'."
)

def to_dataframe(self, scalar_data: Dict[str, np.ndarray]) -> pd.DataFrame:
"""
Expand Down Expand Up @@ -417,6 +414,7 @@ def to_dataframe(self, scalar_data: Dict[str, np.ndarray]) -> pd.DataFrame:
# Convert scalar data to DataFrame
scalar_data["time"] = _timestamp_to_datetime(scalar_data["time"])
scalar_df = pd.DataFrame.from_dict(scalar_data)
scalar_df = scalar_df.sort_values(by="time")

# Return the time-sorted scalar DataFrame
return scalar_df
Expand Down Expand Up @@ -516,6 +514,7 @@ def to_xarray(
"time": ("shot_number", times),
}
)
dataset = dataset.sortby("time")

self._attach_metadata(dataset, metadata)

Expand Down
15 changes: 11 additions & 4 deletions gedidb/providers/tiledb_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@ def __init__(
url: Optional[str] = None,
region: str = "eu-central-1",
credentials: Optional[dict] = None,
n_workers: int = 5,
):
self.n_workers = n_workers

# Validate storage_type
if not storage_type or not isinstance(storage_type, str):
Expand Down Expand Up @@ -73,8 +71,14 @@ def _initialize_s3_context(
config = {
"vfs.s3.endpoint_override": url,
"vfs.s3.region": region,
"py.init_buffer_bytes": "512000000", # Increase buffer size
"py.init_buffer_bytes": "17179869184", # 16 GiB buffer (2**34 bytes)
"sm.tile_cache_size": "17179869184", # 16 GiB cache (2**34 bytes)
"sm.num_reader_threads": "128", # More parallel reads
"sm.num_tiledb_threads": "128",
"vfs.s3.max_parallel_ops": "64", # Maximize parallel S3 ops
"vfs.s3.use_virtual_addressing": "true",
}
return tiledb.Ctx(config)

# Add credentials if provided
if credentials:
Expand All @@ -92,7 +96,10 @@ def _initialize_s3_context(
def _initialize_local_context(self) -> tiledb.Ctx:
return tiledb.Ctx(
{
"py.init_buffer_bytes": "512000000", # Increase buffer size
"py.init_buffer_bytes": "2048000000", # 2GB buffer
"sm.tile_cache_size": "2048000000", # 2GB cache
"sm.num_reader_threads": "32", # More parallel reads
"sm.num_tiledb_threads": "32",
}
)

Expand Down

0 comments on commit 3dd5992

Please sign in to comment.