Skip to content

Commit

Permalink
Merge pull request #655 from jamesmudd/implicit-index
Browse files Browse the repository at this point in the history
Add Implicit Index Support
  • Loading branch information
jamesmudd authored Jan 11, 2025
2 parents 070ebed + 0dab1b9 commit 1b40888
Show file tree
Hide file tree
Showing 6 changed files with 174 additions and 2 deletions.
11 changes: 11 additions & 0 deletions jhdf/src/main/java/io/jhdf/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -456,4 +456,15 @@ private static void flattenInternal(Object data, List<Object> flat) {
flat.add(data);
}
}

/**
 * Calculates the total number of chunks needed to cover a dataset.
 * Any dimension that does not divide evenly by its chunk dimension
 * contributes one extra (partial) chunk.
 *
 * @param datasetDimensions the dimensions of the dataset
 * @param chunkDimensions the dimensions of a single chunk
 * @return the total number of chunks covering the dataset
 */
public static int totalChunks(int[] datasetDimensions, int[] chunkDimensions) {
	int total = 1;
	for (int dim = 0; dim < datasetDimensions.length; dim++) {
		int fullChunks = datasetDimensions[dim] / chunkDimensions[dim];
		// A non-zero remainder means a partial chunk is needed at the edge of this dim
		int partialChunk = (datasetDimensions[dim] % chunkDimensions[dim] == 0) ? 0 : 1;
		total *= fullChunks + partialChunk;
	}
	return total;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
import io.jhdf.dataset.chunked.indexing.EmptyChunkIndex;
import io.jhdf.dataset.chunked.indexing.ExtensibleArrayIndex;
import io.jhdf.dataset.chunked.indexing.FixedArrayIndex;
import io.jhdf.dataset.chunked.indexing.ImplicitChunkIndex;
import io.jhdf.dataset.chunked.indexing.SingleChunkIndex;
import io.jhdf.exceptions.HdfException;
import io.jhdf.exceptions.UnsupportedHdfException;
import io.jhdf.object.message.DataLayoutMessage.ChunkedDataLayoutMessageV4;
import io.jhdf.storage.HdfBackingStorage;
import org.apache.commons.lang3.ArrayUtils;
Expand Down Expand Up @@ -85,7 +85,9 @@ protected Map<ChunkOffset, Chunk> initialize() {
chunkIndex = new SingleChunkIndex(layoutMessage, datasetInfo);
break;
case 2: // Implicit
throw new UnsupportedHdfException("Implicit indexing is currently not supported");
logger.debug("Reading implicit indexed dataset");
chunkIndex = new ImplicitChunkIndex(layoutMessage.getAddress(), datasetInfo);
break;
case 3: // Fixed array
logger.debug("Reading fixed array indexed dataset");
chunkIndex = new FixedArrayIndex(hdfBackingStorage, layoutMessage.getAddress(), datasetInfo);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
* This file is part of jHDF. A pure Java library for accessing HDF5 files.
*
* https://jhdf.io
*
* Copyright (c) 2024 James Mudd
*
* MIT License see 'LICENSE' file
*/
package io.jhdf.dataset.chunked.indexing;

import io.jhdf.Utils;
import io.jhdf.dataset.chunked.Chunk;
import io.jhdf.dataset.chunked.DatasetInfo;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

/**
 * Chunk index for implicitly indexed chunked datasets (v4 layout, index type 2).
 * With implicit indexing chunks are allocated early, unfiltered, and stored
 * contiguously, so each chunk's file address can be computed directly as
 * {@code baseAddress + chunkNumber * chunkSize} — no on-disk index structure
 * needs to be read.
 */
public class ImplicitChunkIndex implements ChunkIndex {

	private final int chunkSize;
	private final int[] datasetDimensions;
	private final int[] chunkDimensions;
	private final long baseAddress;

	/**
	 * Creates an implicit chunk index.
	 *
	 * @param baseAddress file address of the first chunk
	 * @param datasetInfo dataset dimensions and chunk layout
	 */
	public ImplicitChunkIndex(long baseAddress, DatasetInfo datasetInfo) {
		this.baseAddress = baseAddress;
		this.chunkSize = datasetInfo.getChunkSizeInBytes();
		this.datasetDimensions = datasetInfo.getDatasetDimensions();
		this.chunkDimensions = datasetInfo.getChunkDimensions();
	}

	@Override
	public Collection<Chunk> getAllChunks() {
		int totalChunks = Utils.totalChunks(datasetDimensions, chunkDimensions);
		List<Chunk> chunks = new ArrayList<>(totalChunks);
		for (int i = 0; i < totalChunks; i++) {
			// Widen to long before multiplying: i * chunkSize is an int * int product
			// and would overflow for datasets whose chunk data exceeds 2 GiB.
			chunks.add(new ChunkImpl(baseAddress + (long) i * chunkSize,
				chunkSize,
				Utils.chunkIndexToChunkOffset(i, chunkDimensions, datasetDimensions)));
		}
		return chunks;
	}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
* This file is part of jHDF. A pure Java library for accessing HDF5 files.
*
* https://jhdf.io
*
* Copyright (c) 2024 James Mudd
*
* MIT License see 'LICENSE' file
*/
package io.jhdf.dataset.chunked.indexing;

import io.jhdf.HdfFile;
import io.jhdf.api.Dataset;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;

import static io.jhdf.TestUtils.loadTestHdfFile;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;

class ImplicitIndexTest {

	private static final String HDF5_TEST_FILE_NAME = "implicit_index_datasets.hdf5";

	private static HdfFile hdfFile;

	@BeforeAll
	static void setup() throws Exception {
		hdfFile = loadTestHdfFile(HDF5_TEST_FILE_NAME);
	}

	@AfterAll
	static void tearDown() {
		hdfFile.close();
	}

	@Test
	void testDataReadCorrectly() {
		// Dataset whose dimensions divide evenly by its chunk dimensions
		Dataset exactDataset = hdfFile.getDatasetByPath("implicit_index_exact");
		int[] exactData = (int[]) exactDataset.getDataFlat();
		assertThat(exactData).isEqualTo(expectedData(Math.toIntExact(exactDataset.getSize())));

		// Dataset with partial (edge) chunks due to a mismatched chunk shape
		Dataset mismatchDataset = hdfFile.getDatasetByPath("implicit_index_mismatch");
		int[] mismatchData = (int[]) mismatchDataset.getDataFlat();
		assertThat(mismatchData).isEqualTo(expectedData(Math.toIntExact(mismatchDataset.getSize())));

	}

	// Builds [0, 1, ..., length-1], matching the numpy.arange data written by the fixture script
	private int[] expectedData(int length) {
		int[] expected = new int[length];
		int value = 0;
		while (value < length) {
			expected[value] = value;
			value++;
		}
		return expected;
	}
}
Binary file not shown.
56 changes: 56 additions & 0 deletions jhdf/src/test/resources/scripts/implicit_index_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# -------------------------------------------------------------------------------
# This file is part of jHDF. A pure Java library for accessing HDF5 files.
#
# https://jhdf.io
#
# Copyright (c) 2024 James Mudd
#
# MIT License see 'LICENSE' file
# -------------------------------------------------------------------------------
import h5py
import numpy


def write_implicit_dataset(file_id, name, data, chunk_dims):
    """Create and write one chunked NATIVE_INT32 dataset via the low-level API.

    Early allocation, no filters, and libver='latest' make the HDF5 library
    choose the version 4 data layout with an implicit chunk index, which is
    what these fixtures exist to exercise.
    """
    dataspace = h5py.h5s.create_simple(data.shape)  # Create simple dataspace
    # Dataset creation property list
    dcpl = h5py.h5p.create(h5py.h5p.DATASET_CREATE)
    # Early allocation is required for the implicit index to be used
    dcpl.set_alloc_time(h5py.h5d.ALLOC_TIME_EARLY)
    dcpl.set_chunk(chunk_dims)
    # Dataset name must be bytes for the low-level API
    dataset = h5py.h5d.create(file_id, name.encode('utf-8'), h5py.h5t.NATIVE_INT32, dataspace, dcpl)
    dataset.write(h5py.h5s.ALL, h5py.h5s.ALL, data)
    dataset.close()


f = h5py.File("implicit_index_datasets.hdf5", "w", libver='latest')

# First dataset: chunk size (5,) divides the 20-element shape exactly
write_implicit_dataset(f.id, "implicit_index_exact", numpy.arange(20), (5,))

# Second dataset: chunk size (3, 2) mismatched to the (10, 5) data shape,
# producing partial edge chunks
write_implicit_dataset(f.id, "implicit_index_mismatch", numpy.arange(50).reshape(10, 5), (3, 2))

f.close()

0 comments on commit 1b40888

Please sign in to comment.