zarr-python v3 compatibility #516

Draft
wants to merge 16 commits into base: main
8 changes: 4 additions & 4 deletions kerchunk/combine.py
@@ -203,7 +203,7 @@ def append(
ds = xr.open_dataset(
fs.get_mapper(), engine="zarr", backend_kwargs={"consolidated": False}
)
z = zarr.open(fs.get_mapper())
z = zarr.open(fs.get_mapper(), zarr_format=2)
mzz = MultiZarrToZarr(
path,
out=fs.references, # dict or parquet/lazy
@@ -360,7 +360,7 @@ def first_pass(self):
fs._dircache_from_items()

logger.debug("First pass: %s", i)
z = zarr.open_group(fs.get_mapper(""))
z = zarr.open_group(fs.get_mapper(""), zarr_format=2)
for var in self.concat_dims:
value = self._get_value(i, z, var, fn=self._paths[i])
if isinstance(value, np.ndarray):
@@ -387,7 +387,7 @@ def store_coords(self):
"""
kv = {}
store = zarr.storage.KVStore(kv)
group = zarr.open(store)
group = zarr.open(store, zarr_format=2)
m = self.fss[0].get_mapper("")
z = zarr.open(m)
for k, v in self.coos.items():
@@ -461,7 +461,7 @@ def second_pass(self):
for i, fs in enumerate(self.fss):
to_download = {}
m = fs.get_mapper("")
z = zarr.open(m)
z = zarr.open(m, zarr_format=2)

if no_deps is None:
# done first time only
7 changes: 4 additions & 3 deletions kerchunk/fits.py
@@ -8,7 +8,7 @@
from fsspec.implementations.reference import LazyReferenceMapper


from kerchunk.utils import class_factory
from kerchunk.utils import class_factory, dict_to_store
from kerchunk.codecs import AsciiTableCodec, VarArrCodec

try:
@@ -72,7 +72,7 @@ def process_file(

storage_options = storage_options or {}
out = out or {}
g = zarr.open(out)
store = dict_to_store(out)
g = zarr.open_group(store=store, zarr_format=2)

with fsspec.open(url, mode="rb", **storage_options) as f:
infile = fits.open(f, do_not_scale_image_data=True)
@@ -164,7 +165,7 @@ def process_file(
# TODO: we could sub-chunk on biggest dimension
name = hdu.name or str(ext)
arr = g.empty(
name, dtype=dtype, shape=shape, chunks=shape, compression=None, **kwargs
name=name, dtype=dtype, shape=shape, chunks=shape, compressor=None, zarr_format=2, **kwargs
)
arr.attrs.update(
{
4 changes: 2 additions & 2 deletions kerchunk/grib2.py
@@ -191,7 +191,7 @@ def scan_grib(
if good is False:
continue

z = zarr.open_group(store)
z = zarr.open_group(store, zarr_format=2)
global_attrs = {
f"GRIB_{k}": m[k]
for k in cfgrib.dataset.GLOBAL_ATTRIBUTES_KEYS
@@ -398,7 +398,7 @@ def grib_tree(

# TODO allow passing a LazyReferenceMapper as output?
zarr_store = {}
zroot = zarr.open_group(store=zarr_store)
zroot = zarr.open_group(store=zarr_store, zarr_format=2)

aggregations: Dict[str, List] = defaultdict(list)
aggregation_dims: Dict[str, Set] = defaultdict(set)
44 changes: 23 additions & 21 deletions kerchunk/hdf.py
@@ -10,7 +10,7 @@
import numcodecs

from .codecs import FillStringsCodec
from .utils import _encode_for_JSON
from .utils import _encode_for_JSON, encode_fill_value, dict_to_store, translate_refs_serializable

try:
import h5py
@@ -21,12 +21,6 @@
"for more details."
)

try:
from zarr.meta import encode_fill_value
except ModuleNotFoundError:
# https://github.com/zarr-developers/zarr-python/issues/2021
from zarr.v2.meta import encode_fill_value

lggr = logging.getLogger("h5-to-zarr")
_HIDDEN_ATTRS = { # from h5netcdf.attrs
"REFERENCE_LIST",
@@ -111,9 +105,9 @@ def __init__(
if vlen_encode not in ["embed", "null", "leave", "encode"]:
raise NotImplementedError
self.vlen = vlen_encode
self.store = out or {}
self._zroot = zarr.group(store=self.store, overwrite=True)

self.store_dict = out or {}
self.store = dict_to_store(self.store_dict)
self._zroot = zarr.group(store=self.store, zarr_format=2, overwrite=True)
self._uri = url
self.error = error
lggr.debug(f"HDF5 file URI: {self._uri}")
@@ -140,7 +134,6 @@ def translate(self, preserve_linked_dsets=False):
"""
lggr.debug("Translation begins")
self._transfer_attrs(self._h5f, self._zroot)

self._h5f.visititems(self._translator)

if preserve_linked_dsets:
@@ -157,7 +150,8 @@ def translate(self, preserve_linked_dsets=False):
self.store.flush()
return self.store
else:
store = _encode_for_JSON(self.store)
translate_refs_serializable(self.store_dict)
store = _encode_for_JSON(self.store_dict)
return {"version": 1, "refs": store}

def _unref(self, ref):
@@ -465,26 +459,30 @@ def _translator(
if h5py.h5ds.is_scale(h5obj.id) and not cinfo:
return
if h5obj.attrs.get("_FillValue") is not None:
fill = h5obj.attrs.get("_FillValue")
fill = encode_fill_value(
h5obj.attrs.get("_FillValue"), dt or h5obj.dtype
)

# Create a Zarr array equivalent to this HDF5 dataset...
za = self._zroot.require_dataset(
h5obj.name,
adims = self._get_array_dims(h5obj)

# Create a Zarr array equivalent to this HDF5 dataset..
za = self._zroot.require_array(
name=h5obj.name,
shape=h5obj.shape,
dtype=dt or h5obj.dtype,
chunks=h5obj.chunks or False,
fill_value=fill,
compression=None,
compressor=None,
Member commented:

So here, you could reintroduce the compressor

compressor = filters[-1]
filters = filters[:-1]

but obviously it depends on whether there are indeed any filters at all.

It would still need back compat, since filters-only datasets definitely exist.
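
A minimal sketch of that suggestion with the back-compat guard (hypothetical: it assumes the trailing codec in filters really is the compression codec):

if filters:
    # peel the last codec off the HDF5 filter pipeline and pass it to
    # zarr as the compressor; everything before it stays a filter
    compressor = filters[-1]
    filters = filters[:-1]
else:
    # unfiltered datasets: nothing to promote
    compressor = None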

Contributor Author replied:

Yeah, the big issue is that v3 cares about what type of operation it is, and v2 doesn't, so moving them around doesn't necessarily fix that bug.

Member replied:

So there needs to be a change upstream?

filters=filters,
overwrite=True,
attributes={
"_ARRAY_DIMENSIONS": adims,
},
**kwargs,
)
lggr.debug(f"Created Zarr array: {za}")
self._transfer_attrs(h5obj, za)
adims = self._get_array_dims(h5obj)
za.attrs["_ARRAY_DIMENSIONS"] = adims

lggr.debug(f"_ARRAY_DIMENSIONS = {adims}")

if "data" in kwargs:
@@ -496,6 +494,8 @@
if h5obj.fletcher32:
logging.info("Discarding fletcher32 checksum")
v["size"] -= 4
key = str.removeprefix(h5obj.name, "/") + "/" + ".".join(map(str, k))
Member commented:

This is the same as what _chunk_key did? Maybe make it a function with a comment saying it's a copy/reimplementation.

By the way, isn't h5obj.name actually a string, so you could have done h5obj.name.removeprefix()?
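
A minimal sketch of the suggested helper (the name _chunk_key_v2 and its placement are hypothetical; it just factors out the expression above, assuming the default "." dimension separator):

def _chunk_key_v2(dataset_name, chunk_index):
    # copy/reimplementation of what zarr v2's Array._chunk_key did:
    # strip the leading "/" from the dataset path and join the chunk
    # indices with "."
    return dataset_name.removeprefix("/") + "/" + ".".join(map(str, chunk_index))

# usage: key = _chunk_key_v2(h5obj.name, k)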


if (
self.inline
and isinstance(v, dict)
@@ -508,9 +508,10 @@ def _translator(
data.decode("ascii")
except UnicodeDecodeError:
data = b"base64:" + base64.b64encode(data)
self.store[za._chunk_key(k)] = data

self.store_dict[key] = data
else:
self.store[za._chunk_key(k)] = [
self.store_dict[key] = [
self._uri,
v["offset"],
v["size"],
@@ -681,3 +682,4 @@ def _is_netcdf_variable(dataset: h5py.Dataset):

def has_visititems_links():
return hasattr(h5py.Group, "visititems_links")

2 changes: 1 addition & 1 deletion kerchunk/hdf4.py
@@ -144,7 +144,7 @@ def translate(self, filename=None, storage_options=None):
remote_protocol=prot,
remote_options=self.st,
)
g = zarr.open_group("reference://", storage_options=dict(fs=fs))
g = zarr.open_group("reference://", storage_options=dict(fs=fs), zarr_format=2)
refs = {}
for k, v in output.items():
if isinstance(v, dict):
16 changes: 10 additions & 6 deletions kerchunk/netCDF3.py
@@ -1,11 +1,12 @@
from functools import reduce
from packaging.version import Version
from operator import mul

import numpy as np
from fsspec.implementations.reference import LazyReferenceMapper
import fsspec

from kerchunk.utils import _encode_for_JSON, inline_array
from kerchunk.utils import _encode_for_JSON, dict_to_store, inline_array, translate_refs_serializable

try:
from scipy.io._netcdf import ZERO, NC_VARIABLE, netcdf_file, netcdf_variable
@@ -167,7 +168,9 @@ def translate(self):
import zarr

out = self.out
z = zarr.open(out, mode="w")
store = dict_to_store(out)
z = zarr.open(store, mode="w", zarr_format=2, overwrite=True)

for dim, var in self.variables.items():
if dim in self.chunks:
shape = self.chunks[dim][-1]
@@ -191,13 +194,13 @@ def translate(self):
fill = float(fill)
if fill is not None and var.data.dtype.kind == "i":
fill = int(fill)
arr = z.create_dataset(
arr = z.create_array(
name=dim,
shape=shape,
dtype=var.data.dtype,
fill_value=fill,
chunks=shape,
compression=None,
compressor=None,
)
part = ".".join(["0"] * len(shape)) or "0"
k = f"{dim}/{part}"
@@ -245,13 +248,13 @@ def translate(self):
fill = float(fill)
if fill is not None and base.kind == "i":
fill = int(fill)
arr = z.create_dataset(
arr = z.create_array(
name=name,
shape=shape,
dtype=base,
fill_value=fill,
chunks=(1,) + dtype.shape,
compression=None,
compressor=None,
)
arr.attrs.update(
{
@@ -295,6 +298,7 @@ def translate(self):
out.flush()
return out
else:
translate_refs_serializable(out)
out = _encode_for_JSON(out)
return {"version": 1, "refs": out}

6 changes: 3 additions & 3 deletions kerchunk/tests/test_combine.py
@@ -133,14 +133,14 @@

# simple time arrays - xarray can't make these!
m = fs.get_mapper("time1.zarr")
z = zarr.open(m, mode="w")
z = zarr.open(m, mode="w", zarr_format=2)
ar = z.create_dataset("time", data=np.array([1], dtype="M8[s]"))
ar.attrs.update({"_ARRAY_DIMENSIONS": ["time"]})
ar = z.create_dataset("data", data=arr)
ar.attrs.update({"_ARRAY_DIMENSIONS": ["time", "x", "y"]})

m = fs.get_mapper("time2.zarr")
z = zarr.open(m, mode="w")
z = zarr.open(m, mode="w", zarr_format=2)
ar = z.create_dataset("time", data=np.array([2], dtype="M8[s]"))
ar.attrs.update({"_ARRAY_DIMENSIONS": ["time"]})
ar = z.create_dataset("data", data=arr)
@@ -272,7 +272,7 @@ def test_get_coos(refs, selector, expected):
mzz.first_pass()
assert mzz.coos["time"].tolist() == expected
mzz.store_coords()
g = zarr.open(mzz.out)
g = zarr.open(mzz.out, zarr_format=2)
assert g["time"][:].tolist() == expected
assert dict(g.attrs)

20 changes: 10 additions & 10 deletions kerchunk/tests/test_combine_concat.py
@@ -51,7 +51,7 @@ def test_success(tmpdir, arrays, chunks, axis, m):
refs = []
for i, x in enumerate(arrays):
fn = f"{tmpdir}/out{i}.zarr"
g = zarr.open(fn)
g = zarr.open(fn, zarr_format=2)
g.create_dataset("x", data=x, chunks=chunks)
fns.append(fn)
ref = kerchunk.zarr.single_zarr(fn, inline=0)
@@ -62,7 +62,7 @@ def test_success(tmpdir, arrays, chunks, axis, m):
)

mapper = fsspec.get_mapper("reference://", fo=out)
g = zarr.open(mapper)
g = zarr.open(mapper, zarr_format=2)
assert (g.x[:] == np.concatenate(arrays, axis=axis)).all()

try:
@@ -76,7 +76,7 @@ def test_success(tmpdir, arrays, chunks, axis, m):
remote_protocol="file",
skip_instance_cache=True,
)
g = zarr.open(mapper)
g = zarr.open(mapper, zarr_format=2)
assert (g.x[:] == np.concatenate(arrays, axis=axis)).all()

kerchunk.df.refs_to_dataframe(out, "memory://out.parq", record_size=1)
@@ -86,7 +86,7 @@ def test_success(tmpdir, arrays, chunks, axis, m):
remote_protocol="file",
skip_instance_cache=True,
)
g = zarr.open(mapper)
g = zarr.open(mapper, zarr_format=2)
assert (g.x[:] == np.concatenate(arrays, axis=axis)).all()


@@ -95,9 +95,9 @@ def test_fail_chunks(tmpdir):
fn2 = f"{tmpdir}/out2.zarr"
x1 = np.arange(10)
x2 = np.arange(10, 20)
g = zarr.open(fn1)
g = zarr.open(fn1, zarr_format=2)
g.create_dataset("x", data=x1, chunks=(2,))
g = zarr.open(fn2)
g = zarr.open(fn2, zarr_format=2)
g.create_dataset("x", data=x2, chunks=(3,))

ref1 = kerchunk.zarr.single_zarr(fn1, inline=0)
@@ -112,9 +112,9 @@ def test_fail_shape(tmpdir):
fn2 = f"{tmpdir}/out2.zarr"
x1 = np.arange(12).reshape(6, 2)
x2 = np.arange(12, 24)
g = zarr.open(fn1)
g = zarr.open(fn1, zarr_format=2)
g.create_dataset("x", data=x1, chunks=(2,))
g = zarr.open(fn2)
g = zarr.open(fn2, zarr_format=2)
g.create_dataset("x", data=x2, chunks=(2,))

ref1 = kerchunk.zarr.single_zarr(fn1, inline=0)
@@ -129,9 +129,9 @@ def test_fail_irregular_chunk_boundaries(tmpdir):
fn2 = f"{tmpdir}/out2.zarr"
x1 = np.arange(10)
x2 = np.arange(10, 24)
g = zarr.open(fn1)
g = zarr.open(fn1, zarr_format=2)
g.create_dataset("x", data=x1, chunks=(4,))
g = zarr.open(fn2)
g = zarr.open(fn2, zarr_format=2)
g.create_dataset("x", data=x2, chunks=(4,))

ref1 = kerchunk.zarr.single_zarr(fn1, inline=0)