From 1e02fc83b36985007d1d367900f832f4f06c08c9 Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Fri, 14 Jun 2024 18:55:31 +0200
Subject: [PATCH 01/37] Add h5bench_dlio extension

---
 CMakeLists.txt         |  22 ++
 dlio/README.md         |  43 ++++
 dlio/h5bench_dlio.c    | 512 +++++++++++++++++++++++++++++++++++++++++
 dlio/h5bench_dlio.h    |  28 +++
 dlio/stats.c           | 426 ++++++++++++++++++++++++++++++++++
 dlio/stats.h           |  74 ++++++
 dlio/utils.c           | 229 ++++++++++++++++++
 dlio/utils.h           |  81 +++++++
 samples/sync-dlio.json |  68 ++++++
 src/h5bench.py         |  79 +++++++
 10 files changed, 1562 insertions(+)
 create mode 100644 dlio/README.md
 create mode 100644 dlio/h5bench_dlio.c
 create mode 100644 dlio/h5bench_dlio.h
 create mode 100644 dlio/stats.c
 create mode 100644 dlio/stats.h
 create mode 100644 dlio/utils.c
 create mode 100644 dlio/utils.h
 create mode 100644 samples/sync-dlio.json
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 71959922..bed09d14 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,6 +29,7 @@ option(H5BENCH_AMREX            "Enable AMReX benchmark"            OFF)
 option(H5BENCH_OPENPMD          "Enable OpenPMD benchmark"          OFF)
 option(H5BENCH_E3SM             "Enable E3SM benchmark"             OFF)
 option(H5BENCH_MACSIO           "Enable MACSio benchmark"           OFF)
+option(H5BENCH_DLIO             "Enable DLIO benchmark"             OFF)
 
 message(STATUS "h5bench baseline: ON")
 
@@ -41,6 +42,7 @@ if(H5BENCH_ALL)
     set(H5BENCH_OPENPMD ON)
     set(H5BENCH_E3SM ON)
     set(H5BENCH_MACSIO ON)
+    set(H5BENCH_DLIO ON)
 endif()
 
 message(STATUS "h5bench METADATA: ${H5BENCH_METADATA}")
@@ -49,6 +51,7 @@ message(STATUS "h5bench AMREX: ${H5BENCH_AMREX}")
 message(STATUS "h5bench OPENPMD: ${H5BENCH_OPENPMD}")
 message(STATUS "h5bench E3SM: ${H5BENCH_E3SM}")
 message(STATUS "h5bench MACSIO: ${H5BENCH_MACSIO}")
+message(STATUS "h5bench DLIO: ${H5BENCH_DLIO}")
 
 # HDF5 Dependency #############################################################
 #
@@ -328,6 +331,17 @@ configure_file(${CMAKE_SOURCE_DIR}/src/h5bench.py ${CMAKE_BINARY_DIR}/h5bench CO
 configure_file(${CMAKE_SOURCE_DIR}/src/h5bench_version.py ${CMAKE_BINARY_DIR}/h5bench_version.py COPYONLY)
 configure_file(${CMAKE_SOURCE_DIR}/src/h5bench_configuration.py.in ${CMAKE_BINARY_DIR}/h5bench_configuration.py)
 
+# DLIO ######################################################################
+#
+# https://github.com/arcturus5340/h5bench
+
+if(H5BENCH_DLIO)  # built only when requested directly or through H5BENCH_ALL
+    set(h5bench_dlio_src dlio/h5bench_dlio.c dlio/utils.c dlio/stats.c)
+
+    add_executable(h5bench_dlio ${h5bench_dlio_src})
+    target_link_libraries(h5bench_dlio PRIVATE h5bench_util hdf5 m MPI::MPI_C)  # PRIVATE: an executable has no consumers
+endif()
+
 # Install binaries ############################################################
 
 install(
@@ -419,6 +433,14 @@ if(H5BENCH_MACSIO)
     )
 endif()
 
+if(H5BENCH_DLIO)  # the h5bench_dlio target only exists when this option is ON
+    install(
+        TARGETS
+        h5bench_dlio
+        DESTINATION bin
+    )
+endif()
+
 # Testing #####################################################################
 
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tests)
diff --git a/dlio/README.md b/dlio/README.md
new file mode 100644
index 00000000..e4437276
--- /dev/null
+++ b/dlio/README.md
@@ -0,0 +1,43 @@
+# DLIO benchmark
+
+The benchmark is designed to measure the performance of training and evaluation of deep learning models on data stored as HDF5 files.
+
+## Overview
+
+### Command-line Arguments (Options)
+
+- ``--generate-data``: Enable generation of benchmarking data. [default: *false*]
+- ``--train``: Enable model training simulation [default: *false*]
+- ``--evaluation``: Enable model evaluation simulation [default: *false*]
+- ``--record-length <x>``: Record size of a single sample in bytes [default: *67108864*]
+- ``--num-files-train <x>``: The number of files used to train the model [default: *64*]
+- ``--num-files-eval <x>``: The number of files used to evaluate the model [default: *8*]
+- ``--num-samples-per-file <x>``: The number of samples in each file [default: *4*]
+- ``--data-folder <x>``: Name of the directory storing the benchmark data [default: *./data*]
+- ``--file-prefix <x>``: Prefix in the name of files containing training and evaluation data [default: *img*]
+- ``--chunking``: Enable chunking [default: *false*]
+- ``--chunk-size <x>``: Chunk size [default: *1024*]
+- ``--keep-files``: Do not delete the data after the benchmark has finished [default: *false*]
+- ``--compression``: Enable compression [default: *false*]
+- ``--compression-level <x>``: Compression level from 1 to 9 [default: *4*]
+- ``--batch-size <x>``: Training batch size [default: *4*]
+- ``--batch-size-eval <x>``: Evaluation batch size [default: *2*]
+- ``--shuffle``: Enable samples shuffle [default: *false*]
+- ``--preprocess-time <x>``: Preprocessing time after reading each sample in seconds [default: *0.0*]
+- ``--preprocess-time-stdev <x>``: Standard deviation in preprocessing time in seconds [default: *0.0*]
+- ``--epochs <x>``: The number of epochs [default: *5*]
+- ``--computation-time <x>``: Computation time after reading each batch in seconds [default: *0.323*]
+- ``--computation-time-stdev <x>``:  Standard deviation in computation time in seconds [default: *0.0*]
+- ``--random-seed <x>``: Random seed to be used [default: *42*]
+- ``--eval-time <x>``: Evaluation time after reading each batch in seconds [default: *0.323*]
+- ``--eval-time-stdev <x>``: Standard deviation in evaluation time in seconds [default: *0.0*]
+- ``--epochs-between-evals <x>``: The number of epochs between evaluations [default: *1*]
+- ``--train-data-folder <x>``: Name of the directory containing the training data [default: *train*]
+- ``--valid-data-folder <x>``: Name of the directory containing the validation data [default: *valid*]
+- ``--records-dataset-name <x>``: Name of the dataset with records [default: *records*]
+- ``--labels-dataset-name <x>``: Name of the dataset with labels [default: *labels*]
+- ``--seed-change-epoch``: Enable seed changes every epoch [default: *false*]
+
+### Exerciser Basics
+
+## Building Exerciser
diff --git a/dlio/h5bench_dlio.c b/dlio/h5bench_dlio.c
new file mode 100644
index 00000000..86ccd1e7
--- /dev/null
+++ b/dlio/h5bench_dlio.c
@@ -0,0 +1,512 @@
+// TODO:
+// - Add logging
+// - Add vol-async support
+// - Add subfiling support
+// - Add more DLIO features
+// - Add more data loaders: Tensorflow & dali
+// - Add prefetcher configuration?
+// - Add read_threads/computation_threads
+// - Add file shuffle configuration
+// - Add more compression filters
+
+#include <assert.h>
+#include <hdf5.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "h5bench_dlio.h"
+#include "stats.h"
+#include "utils.h"
+
+//#ifdef HAVE_SUBFILING
+//#include "H5FDsubfiling.h"
+//#include "H5FDioc.h"
+//#endif
+
+#define GENERATION_BUFFER_SIZE (2 * 1073741824lu) // 2 GiB; parenthesized so it expands safely inside larger expressions
+
+// Global variables
+int NUM_RANKS, MY_RANK;
+uint32_t GENERATION_SIZE;
+uint32_t DIM;
+hid_t DCPL, FAPL, DAPL, DXPL;
+
+void generate_labels_dataset(hid_t file_id, hid_t filespace, hid_t memspace) { // Create the labels dataset in file_id and fill it with zeros.
+    hid_t dataset_id = H5Dcreate(file_id, config.LABELS_DATASET_NAME, H5T_STD_I64LE, filespace, H5P_DEFAULT, H5P_DEFAULT, DAPL);
+    assert(dataset_id >= 0);
+
+    uint64_t *data = (uint64_t*)malloc(config.NUM_SAMPLES_PER_FILE * sizeof(uint64_t)); // one label per sample
+    if (data == NULL) {
+        exit(1);
+    }
+    for (uint32_t i = 0; i < config.NUM_SAMPLES_PER_FILE; i++) {
+        data[i] = 0; // labels are placeholders: every sample gets 0
+    }
+
+    hsize_t offset[1] = {0};
+    hsize_t dims[1] = {config.NUM_SAMPLES_PER_FILE};
+    H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, dims, NULL); // select NUM_SAMPLES_PER_FILE elements starting at 0; return value is not checked
+    herr_t status = H5Dwrite(dataset_id, H5T_STD_I64LE, memspace, filespace, DXPL, data); // NOTE(review): buffer is uint64_t but is described as I64LE - harmless for zeros, confirm intent
+    assert(status >= 0);
+
+    free(data);
+    H5Dclose(dataset_id);
+}
+
+void generate_records_dataset(hid_t file_id, hid_t filespace, hid_t memspace, hid_t extra_memspace) { // Create the records dataset in file_id and fill it with pseudo-random bytes.
+    hid_t dataset_id = H5Dcreate(file_id, config.RECORDS_DATASET_NAME, H5T_STD_U8LE, filespace, H5P_DEFAULT, DCPL, DAPL);
+    assert(dataset_id >= 0);
+
+    uint8_t *data = (uint8_t*)malloc(GENERATION_SIZE * sizeof(uint8_t)); // staging buffer, reused for every write below
+    if (data == NULL) {
+        exit(1);
+    }
+    for (size_t i = 0; i < GENERATION_SIZE; i++) {
+        data[i] = rand() % 256; // cover the full 0..255 byte range (% 255 never produced 255)
+    }
+
+    uint32_t num_iterations = (config.RECORD_LENGTH * config.NUM_SAMPLES_PER_FILE) / GENERATION_SIZE; // number of full-buffer writes
+    uint32_t extra_elements = (config.RECORD_LENGTH * config.NUM_SAMPLES_PER_FILE) % GENERATION_SIZE; // bytes left after the full-buffer writes
+
+    hsize_t offset[3] = {0, 0, 0};
+    hsize_t dims[3] = {config.NUM_SAMPLES_PER_FILE, DIM, DIM};
+
+    for (uint32_t i = 0; i < num_iterations; i++) {
+        offset[0] = i * config.RECORD_LENGTH * config.NUM_SAMPLES_PER_FILE; // NOTE(review): a byte count used as an index along the sample dimension; only i == 0 looks in-bounds - confirm
+        H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, dims, NULL);
+        herr_t status = H5Dwrite(dataset_id, H5T_STD_U8LE, memspace, filespace, DXPL, data);
+        assert(status >= 0);
+    }
+
+    if (extra_elements > 0) {
+        hsize_t extra_count[3] = {extra_elements, DIM, DIM}; // NOTE(review): extra_elements is in bytes but is used as a sample count - confirm
+        offset[0] = num_iterations * config.RECORD_LENGTH * config.NUM_SAMPLES_PER_FILE;
+        H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, extra_count, NULL);
+        herr_t status = H5Dwrite(dataset_id, H5T_STD_U8LE, extra_memspace, filespace, DXPL, data);
+        assert(status >= 0);
+    }
+
+    free(data);
+    H5Dclose(dataset_id);
+}
+
+void generate_file(const char *file_name, hid_t labels_filespace, hid_t labels_memspace, // Create (or truncate) file_name and write both datasets into it.
+                   hid_t records_filespace, hid_t records_memspace, hid_t extra_records_memspace) {
+    hid_t file_id = H5Fcreate(file_name, H5F_ACC_TRUNC, H5P_DEFAULT, FAPL); // H5F_ACC_TRUNC: an existing file is overwritten
+    assert(file_id >= 0);
+
+    generate_records_dataset(file_id, records_filespace, records_memspace, extra_records_memspace);
+    generate_labels_dataset(file_id, labels_filespace, labels_memspace);
+
+    H5Fclose(file_id);
+}
+
+void generate_data() { // Generate this rank's share of the training and validation files.
+    hsize_t labels_dims[1] = {config.NUM_SAMPLES_PER_FILE};
+    hid_t labels_filespace = H5Screate_simple(1, labels_dims, NULL);
+    assert(labels_filespace >= 0);
+    hid_t labels_memspace = H5Screate_simple(1, labels_dims, NULL);
+    assert(labels_memspace >= 0);
+
+    hsize_t records_dims[3] = {config.NUM_SAMPLES_PER_FILE, DIM, DIM}; // one DIM x DIM record per sample
+    hid_t records_filespace = H5Screate_simple(3, records_dims, NULL);
+    assert(records_filespace >= 0);
+    hid_t records_memspace = H5Screate_simple(3, records_dims, NULL);
+    assert(records_memspace >= 0);
+
+    hsize_t extra_records_count[3] = {(config.RECORD_LENGTH * config.NUM_SAMPLES_PER_FILE) % GENERATION_SIZE, DIM, DIM}; // first extent is 0 when the data fits the buffer exactly
+    hid_t extra_records_memspace = H5Screate_simple(3, extra_records_count, NULL);
+    assert(extra_records_memspace >= 0);
+
+    for (uint32_t i = MY_RANK; i < config.NUM_FILES_TRAIN; i += NUM_RANKS) { // files are dealt round-robin across ranks
+        srand(config.RANDOM_SEED + i); // per-file seed, independent of which rank writes the file
+
+        printf("Generate train file %u / %u\n", i + 1, config.NUM_FILES_TRAIN);
+        char file_name[256]; // snprintf truncates longer paths silently
+        snprintf(file_name, sizeof(file_name), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, i + 1, config.NUM_FILES_TRAIN);
+        generate_file(file_name, labels_filespace, labels_memspace, records_filespace, records_memspace, extra_records_memspace);
+    }
+
+    for (uint32_t i = MY_RANK; i < config.NUM_FILES_EVAL; i += NUM_RANKS) {
+        srand(config.RANDOM_SEED + config.NUM_FILES_TRAIN + i); // seeds continue after the ones used for training files
+
+        printf("Generate valid file %u / %u\n", i + 1, config.NUM_FILES_EVAL);
+        char file_name[256];
+        snprintf(file_name, sizeof(file_name), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.VALID_DATA_FOLDER, config.FILE_PREFIX, i + 1, config.NUM_FILES_EVAL);
+        generate_file(file_name, labels_filespace, labels_memspace, records_filespace, records_memspace, extra_records_memspace);
+    }
+
+    H5Sclose(labels_memspace);
+    H5Sclose(labels_filespace);
+    H5Sclose(records_memspace);
+    H5Sclose(extra_records_memspace);
+    H5Sclose(records_filespace);
+}
+
+void read_sample(const char *file_path, uint32_t sample, uint64_t *metadata_time_out, uint64_t *read_time_out) { // Read one record and report open/close time and read time in microseconds.
+    hsize_t offset[3] = {sample, 0, 0};
+    hsize_t count[3] = {1, DIM, DIM}; // exactly one DIM x DIM record
+
+    uint64_t t1 = get_time_usec();
+    hid_t file_id = H5Fopen(file_path, H5F_ACC_RDONLY, FAPL);
+    hid_t dataset_id = H5Dopen(file_id, config.RECORDS_DATASET_NAME, DAPL); // third argument is a dataset ACCESS list, not a transfer list
+    hid_t filespace = H5Dget_space(dataset_id);
+    hid_t memspace = H5Screate_simple(3, count, NULL);
+    H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, count, NULL);
+    uint64_t t2 = get_time_usec();
+    assert(file_id >= 0); // checked after t2 so the asserts stay outside the timed region
+    assert(dataset_id >= 0);
+    assert(filespace >= 0);
+    assert(memspace >= 0);
+
+    uint8_t *data = (uint8_t *)malloc(DIM * DIM * sizeof(uint8_t));
+    if (data == NULL) {
+        exit(1);
+    }
+
+    uint64_t t3 = get_time_usec();
+    herr_t status = H5Dread(dataset_id, H5T_STD_U8LE, memspace, filespace, DXPL, data);
+    uint64_t t4 = get_time_usec();
+    assert(status >= 0);
+
+    free(data); // the payload is discarded; only the timings are of interest
+
+    uint64_t t5 = get_time_usec();
+    H5Sclose(memspace);
+    H5Sclose(filespace);
+    H5Dclose(dataset_id);
+    H5Fclose(file_id);
+    uint64_t t6 = get_time_usec();
+
+    *metadata_time_out = (t2 - t1) + (t6 - t5); // open/select time plus close time
+    *read_time_out = t4 - t3; // time spent in H5Dread only
+}
+
+uint64_t compute(float time, float time_stdev) { // Sleep to emulate work; returns the microseconds actually slept (0 when both inputs are 0).
+    if (time != 0.0 || time_stdev != 0.0) {
+        double usec = generate_normal_random(time, time_stdev) * 1000000.0; // presumably a draw from N(time, time_stdev) in seconds - confirm in utils.c
+        usleep(usec > 0.0 ? (useconds_t)usec : 0); // a negative draw means "no delay"
+        return usec > 0.0 ? (uint64_t)usec : 0; // clamp before the unsigned conversion so a negative draw cannot wrap
+    }
+    return 0;
+}
+
+void eval(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out) { // Read this rank's share of the validation samples; the two timing outputs are accumulated, not reset.
+    uint32_t total_samples = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE;
+
+    if (config.DO_SHUFFLE) {
+        shuffle(indices, total_samples);
+    }
+
+    uint32_t samples_per_rank = total_samples / NUM_RANKS; // even share; the remainder is handled by the second loop
+    uint32_t read_from = MY_RANK * samples_per_rank;
+    uint32_t read_to = (MY_RANK + 1) * samples_per_rank;
+    uint32_t read_counter = 0; // samples read since the last completed batch
+
+    uint64_t t0 = get_time_usec();
+    for (uint32_t i = read_from; i < read_to; i++) {
+        uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1; // file names are 1-based
+        uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
+        char file_path[256];
+        snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.VALID_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_EVAL);
+
+        uint64_t metadata_time = 0, read_time = 0;
+        read_sample(file_path, sample_num, &metadata_time, &read_time);
+        read_counter++;
+        compute(config.PREPROCESS_TIME, config.PREPROCESS_TIME_STDEV); // emulated per-sample preprocessing
+
+        *local_metadata_time_out += metadata_time;
+        *local_read_time_out += read_time;
+
+        if (read_counter % config.BATCH_SIZE_EVAL == 0 && read_counter != 0) { // a full batch has been read
+            batch_loaded_eval(epoch, t0);
+
+            uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV); // emulated per-batch evaluation
+            batch_processed_eval(epoch, t, t0);
+            read_counter = 0;
+
+            t0 = get_time_usec(); // restart the clock for the next batch
+        }
+    }
+
+    for (uint32_t iteration = MY_RANK; iteration < total_samples - NUM_RANKS * samples_per_rank; iteration += NUM_RANKS) { // leftover samples, dealt round-robin
+        uint32_t i = NUM_RANKS * samples_per_rank + iteration;
+        uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1;
+        uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
+        char file_path[256];
+        snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.VALID_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_EVAL);
+
+        uint64_t metadata_time = 0, read_time = 0;
+        read_sample(file_path, sample_num, &metadata_time, &read_time);
+        read_counter++;
+        compute(config.PREPROCESS_TIME, config.PREPROCESS_TIME_STDEV);
+
+        *local_metadata_time_out += metadata_time;
+        *local_read_time_out += read_time;
+
+        if (read_counter % config.BATCH_SIZE_EVAL == 0){
+            batch_loaded_eval(epoch, t0);
+
+            uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV);
+            batch_processed_eval(epoch, t, t0);
+            read_counter = 0;
+
+            t0 = get_time_usec();
+        }
+    }
+
+    if (read_counter != 0) { // final, partially filled batch
+        batch_loaded_eval(epoch, t0);
+
+        uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV);
+        batch_processed_eval(epoch, t, t0);
+    }
+}
+
+void train(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out) { // Read this rank's share of the training samples; the two timing outputs are accumulated, not reset.
+    if (indices == NULL) return; // note: returns before the closing MPI_Barrier
+    uint32_t total_samples = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE;
+
+    if (config.DO_SHUFFLE) {
+        shuffle(indices, total_samples);
+    }
+
+    uint32_t samples_per_rank = total_samples / NUM_RANKS; // even share; the remainder is handled by the second loop
+    uint32_t read_from = MY_RANK * samples_per_rank;
+    uint32_t read_to = (MY_RANK + 1) * samples_per_rank;
+    uint32_t read_counter = 0; // samples read since the last completed batch
+
+    uint64_t t0 = get_time_usec();
+    for (uint32_t i = read_from; i < read_to; i++) {
+        uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1; // file names are 1-based
+        uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
+        char file_path[256];
+        snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_TRAIN);
+
+        uint64_t metadata_time = 0, read_time = 0;
+        read_sample(file_path, sample_num, &metadata_time, &read_time);
+        read_counter++;
+        compute(config.PREPROCESS_TIME, config.PREPROCESS_TIME_STDEV); // emulated per-sample preprocessing
+
+        *local_metadata_time_out += metadata_time;
+        *local_read_time_out += read_time;
+
+        if (read_counter % config.BATCH_SIZE == 0 && read_counter != 0) { // a full batch has been read
+            batch_loaded_train(epoch, t0);
+
+            uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV); // emulated per-batch training step
+            batch_processed_train(epoch, t, t0);
+            MPI_Barrier(MPI_COMM_WORLD); // every rank runs the same number of iterations in this loop, so the barrier counts match
+
+            read_counter = 0;
+            t0 = get_time_usec(); // restart the clock for the next batch
+        }
+    }
+
+    for (uint32_t iteration = MY_RANK; iteration < total_samples - NUM_RANKS * samples_per_rank; iteration += NUM_RANKS) { // leftover samples, dealt round-robin
+        uint32_t i = NUM_RANKS * samples_per_rank + iteration;
+        uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1;
+        uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
+        char file_path[256];
+        snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_TRAIN);
+
+        uint64_t metadata_time = 0, read_time = 0;
+        read_sample(file_path, sample_num, &metadata_time, &read_time);
+        read_counter++;
+        compute(config.PREPROCESS_TIME, config.PREPROCESS_TIME_STDEV);
+
+        *local_metadata_time_out += metadata_time;
+        *local_read_time_out += read_time;
+
+        if (read_counter % config.BATCH_SIZE == 0){ // no barrier here: ranks may run different iteration counts in this loop
+            batch_loaded_train(epoch, t0);
+
+            uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
+            batch_processed_train(epoch, t, t0);
+
+            read_counter = 0;
+            t0 = get_time_usec();
+        }
+    }
+
+    if (read_counter != 0) { // final, partially filled batch
+        batch_loaded_train(epoch, t0);
+
+        uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
+        batch_processed_train(epoch, t, t0);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD); // all ranks leave the epoch together
+}
+
+void run(uint64_t *train_metadata_time, uint64_t *train_read_time, // Run all epochs; on rank 0 the four outputs receive the per-rank times summed over all ranks.
+         uint64_t *eval_metadata_time, uint64_t *eval_read_time) {
+    uint32_t total_train_samples = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE;
+    uint32_t *indices_train = (uint32_t *)malloc(total_train_samples * sizeof(uint32_t));
+    if (indices_train == NULL) {
+        exit(1);
+    }
+    for (uint32_t i = 0; i < total_train_samples; i++) {
+        indices_train[i] = i; // identity order; train() shuffles it when DO_SHUFFLE is set
+    }
+
+    uint32_t total_eval_samples = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE;
+    uint32_t *indices_eval = (uint32_t *)malloc(total_eval_samples * sizeof(uint32_t));
+    if (indices_eval == NULL) {
+        exit(1);
+    }
+    for (uint32_t i = 0; i < total_eval_samples; i++) {
+        indices_eval[i] = i;
+    }
+
+    uint64_t local_train_metadata_time = 0, local_train_read_time = 0,
+             local_eval_metadata_time = 0, local_eval_read_time = 0;
+    uint32_t next_eval_epoch = config.EPOCHS_BETWEEN_EVALS; // 1-based epoch number of the next evaluation
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    for (uint32_t epoch = 0; epoch < config.EPOCHS; epoch++) {
+        // Reseed the C PRNG with an epoch-dependent value when requested.
+        if (config.SEED_CHANGE_EPOCH) srand(config.RANDOM_SEED + epoch);
+
+        start_train(epoch);
+        train(epoch, indices_train, &local_train_metadata_time, &local_train_read_time);
+        end_train(epoch);
+
+        if (config.DO_EVALUATION && (epoch + 1 >= next_eval_epoch)) {
+            next_eval_epoch += config.EPOCHS_BETWEEN_EVALS;
+            start_eval(epoch);
+            eval(epoch, indices_eval, &local_eval_metadata_time, &local_eval_read_time);
+            end_eval(epoch);
+        }
+
+        MPI_Barrier(MPI_COMM_WORLD);
+    }
+
+    MPI_Reduce(&local_train_metadata_time, train_metadata_time, 1, MPI_UINT64_T, MPI_SUM, 0, MPI_COMM_WORLD); // MPI_UINT64_T matches uint64_t on every ABI
+    MPI_Reduce(&local_train_read_time, train_read_time, 1, MPI_UINT64_T, MPI_SUM, 0, MPI_COMM_WORLD);
+    MPI_Reduce(&local_eval_metadata_time, eval_metadata_time, 1, MPI_UINT64_T, MPI_SUM, 0, MPI_COMM_WORLD);
+    MPI_Reduce(&local_eval_read_time, eval_read_time, 1, MPI_UINT64_T, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    free(indices_train);
+    free(indices_eval);
+}
+
+void init(int num_ranks) { // Normalize the configuration and set up the global HDF5 property lists.
+    config.NUM_RANKS = num_ranks;
+
+    DIM = (uint32_t)sqrt(config.RECORD_LENGTH); // records are stored as DIM x DIM squares
+    config.RECORD_LENGTH = DIM * DIM; // round the record length down to a perfect square
+
+    uint32_t chunk_dimension = (uint32_t)sqrt(config.CHUNK_SIZE);
+    chunk_dimension = chunk_dimension > DIM? DIM: chunk_dimension; // a chunk cannot exceed one record
+    config.CHUNK_SIZE = chunk_dimension * chunk_dimension;
+
+    uint32_t data_length = config.RECORD_LENGTH * config.NUM_SAMPLES_PER_FILE; // bytes of record data per file
+    GENERATION_SIZE = data_length > GENERATION_BUFFER_SIZE? GENERATION_BUFFER_SIZE: data_length; // cap the staging buffer
+
+    srand(config.RANDOM_SEED);
+
+    DCPL = H5Pcreate(H5P_DATASET_CREATE);
+    if (config.DO_CHUNKING) {
+        hsize_t chunk_dims[3] = {1, chunk_dimension, chunk_dimension}; // chunks never span more than one sample
+        H5Pset_chunk(DCPL, 3, chunk_dims);
+        if (config.DO_COMPRESSION) { // compression is applied only when chunking is enabled
+            H5Pset_deflate(DCPL, config.COMPRESSION_LEVEL);
+        }
+    }
+
+    FAPL = H5Pcreate(H5P_FILE_ACCESS);
+//    H5Pset_fapl_mpio(fapl, MPI_COMM_WORLD, MPI_INFO_NULL);
+#if H5_VERSION_GE(1, 10, 0)
+    H5Pset_all_coll_metadata_ops(FAPL, true);
+    H5Pset_coll_metadata_write(FAPL, true);
+#endif
+
+    DAPL = H5Pcreate(H5P_DATASET_ACCESS); // assign the global: a local `hid_t DAPL` would shadow it and leak the list
+#if H5_VERSION_GE(1, 10, 0)
+    H5Pset_all_coll_metadata_ops(DAPL, true);
+#endif
+
+    DXPL = H5P_DEFAULT; // default transfer properties; nothing is allocated, so nothing leaks (a local `hid_t DXPL` used to shadow the global)
+}
+
+int main(int argc, char *argv[]) { // Entry point: parse options, optionally generate data, run the benchmark, report, clean up.
+    int mpi_thread_lvl_provided = -1;
+    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_thread_lvl_provided);
+    assert(MPI_THREAD_MULTIPLE == mpi_thread_lvl_provided); // compiled out under NDEBUG
+    MPI_Comm_rank(MPI_COMM_WORLD, &MY_RANK);
+    MPI_Comm_size(MPI_COMM_WORLD, &NUM_RANKS);
+
+    parse_args(argc, argv);
+
+    if (MY_RANK == 0) { // only one rank touches the directory tree
+        printf("Create directory \"%s\"... ", config.DATA_FOLDER);
+        create_directory(config.DATA_FOLDER);
+        printf("OK\n");
+
+        printf("Create directory \"%s/%s\"... ", config.DATA_FOLDER, config.TRAIN_DATA_FOLDER);
+        char dir_name[256];
+        snprintf(dir_name, sizeof(dir_name), "%s/%s", config.DATA_FOLDER, config.TRAIN_DATA_FOLDER);
+        create_directory(dir_name);
+        printf("OK\n");
+
+        printf("Create directory \"%s/%s\"... ", config.DATA_FOLDER, config.VALID_DATA_FOLDER);
+        snprintf(dir_name, sizeof(dir_name), "%s/%s", config.DATA_FOLDER, config.VALID_DATA_FOLDER);
+        create_directory(dir_name);
+        printf("OK\n");
+    }
+
+    init(NUM_RANKS);
+
+    MPI_Barrier(MPI_COMM_WORLD); // directories must exist before any rank writes files
+
+    if (config.DO_DATA_GENERATION) {
+        generate_data();
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD); // all files must be written before any rank reads them
+
+    if (config.DO_TRAIN) {
+        // TODO: check files dimension if generate=no
+        stats_initialize();
+
+        uint64_t train_metadata_time = 0, train_read_time = 0, eval_metadata_time = 0, eval_read_time = 0;
+        run(&train_metadata_time, &train_read_time, &eval_metadata_time, &eval_read_time);
+
+        prepare_data();
+
+        MPI_Reduce(MY_RANK == 0? MPI_IN_PLACE: &train_metadata_time, &train_metadata_time, 1, MPI_UINT64_T, MPI_SUM, 0, MPI_COMM_WORLD); // run() already summed onto rank 0 and other ranks still hold 0, so the totals do not change
+        MPI_Reduce(MY_RANK == 0? MPI_IN_PLACE: &train_read_time, &train_read_time, 1, MPI_UINT64_T, MPI_SUM, 0, MPI_COMM_WORLD); // MPI_UINT64_T matches the uint64_t buffers
+        MPI_Reduce(MY_RANK == 0? MPI_IN_PLACE: &eval_metadata_time, &eval_metadata_time, 1, MPI_UINT64_T, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(MY_RANK == 0? MPI_IN_PLACE: &eval_read_time, &eval_read_time, 1, MPI_UINT64_T, MPI_SUM, 0, MPI_COMM_WORLD);
+
+        train_metadata_time /= NUM_RANKS; // totals become per-rank averages
+        train_read_time /= NUM_RANKS;
+        eval_metadata_time /= NUM_RANKS;
+        eval_read_time /= NUM_RANKS;
+
+        MPI_Barrier(MPI_COMM_WORLD);
+
+        if (MY_RANK == 0) {
+            print_data(&train_metadata_time, &train_read_time, &eval_metadata_time, &eval_read_time);
+        }
+
+        stats_finalize();
+    }
+
+    if (!config.KEEP_FILES && MY_RANK == 0) {
+        delete_directory(config.DATA_FOLDER); // removes the data folder even when this run did not generate it
+    }
+
+    H5Pclose(DCPL);
+    H5Pclose(DXPL);
+    H5Pclose(DAPL);
+    H5Pclose(FAPL);
+    MPI_Finalize();
+    return 0;
+}
diff --git a/dlio/h5bench_dlio.h b/dlio/h5bench_dlio.h
new file mode 100644
index 00000000..959906ba
--- /dev/null
+++ b/dlio/h5bench_dlio.h
@@ -0,0 +1,28 @@
+#ifndef H5BENCH_DLIO_H
+#define H5BENCH_DLIO_H
+
+#include <hdf5.h>
+
+void generate_labels_dataset(hid_t file_id, hid_t filespace, hid_t memspace); // writes the labels dataset (all zeros)
+
+void generate_records_dataset(hid_t file_id, hid_t filespace, hid_t memspace, hid_t extra_memspace); // writes the records dataset (random bytes)
+
+void generate_file(const char *file_name, hid_t labels_filespace, hid_t labels_memspace, // creates one data file holding both datasets
+                   hid_t records_filespace, hid_t records_memspace, hid_t extra_records_memspace);
+
+void generate_data(void); // generates this rank's share of the train and validation files
+
+void read_sample(const char *file_path, uint32_t sample, uint64_t *metadata_time_out, uint64_t *read_time_out); // times are in microseconds
+
+uint64_t compute(float time, float time_stdev); // sleeps to emulate work; inputs are in seconds
+
+void eval(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out); // outputs are accumulated
+
+void train(uint32_t epoch,uint32_t *indices, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out); // outputs are accumulated
+
+void init(int num_ranks); // normalizes the config and creates the global property lists
+
+void run(uint64_t *train_metadata_time, uint64_t *train_read_time, // runs all epochs; outputs are meaningful on rank 0
+         uint64_t *eval_metadata_time, uint64_t *eval_read_time);
+
+#endif //H5BENCH_DLIO_H
diff --git a/dlio/stats.c b/dlio/stats.c
new file mode 100644
index 00000000..c987d0b0
--- /dev/null
+++ b/dlio/stats.c
@@ -0,0 +1,426 @@
+#include <mpi.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "stats.h"
+#include "utils.h"
+
+uint32_t TRAIN_MAX_STEPS;
+uint32_t EVAL_MAX_STEPS;
+epoch_data_t *stats;
+epoch_data_t *global_stats;
+
+uint32_t *last_load_train;
+uint32_t *last_load_eval;
+uint32_t *last_proc_train;
+uint32_t *last_proc_eval;
+uint32_t *last_compute_train;
+uint32_t *last_compute_eval;
+
+double AU;
+
+void stats_initialize() {
+    // Prepares this rank's bookkeeping before any epoch runs: derives the number of steps
+    // per epoch, allocates the per-epoch timing arrays and the per-epoch write cursors
+    // (all zero-filled). Any allocation failure ends the process with exit(1).
+    // NOTE(review): a zero BATCH_SIZE, BATCH_SIZE_EVAL or NUM_RANKS divides by zero below;
+    // presumably the caller guarantees they are positive - confirm.
+    // Threshold used by print_data(): success when the mean AU percentage is >= 100 * AU.
+    AU = 0.90;
+
+    // Samples available per epoch, and samples consumed by one step across all ranks.
+    const uint32_t train_samples = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE;
+    const uint32_t eval_samples = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE;
+    const uint32_t train_step_width = config.BATCH_SIZE * config.NUM_RANKS;
+    const uint32_t eval_step_width = config.BATCH_SIZE_EVAL * config.NUM_RANKS;
+
+    // Ceiling division: a trailing partial step still counts as a step.
+    TRAIN_MAX_STEPS = train_samples / train_step_width;
+    if (train_samples % train_step_width != 0) {
+        TRAIN_MAX_STEPS += 1;
+    }
+    EVAL_MAX_STEPS = eval_samples / eval_step_width;
+    if (eval_samples % eval_step_width != 0) {
+        EVAL_MAX_STEPS += 1;
+    }
+
+    stats = (struct epoch_data *)malloc(config.EPOCHS * sizeof(struct epoch_data));
+    if (!stats) {
+        exit(1);
+    }
+
+    for (uint32_t e = 0; e < config.EPOCHS; e++) {
+        epoch_data_t *ep = &stats[e];
+
+        // Zero-filled per-step timing arrays, one entry per step of the phase.
+        ep->load.train = (uint64_t *)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        ep->proc.train = (uint64_t *)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        ep->compute.train = (uint64_t *)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        ep->load.eval = (uint64_t *)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        ep->proc.eval = (uint64_t *)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        ep->compute.eval = (uint64_t *)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        if (!ep->load.train || !ep->proc.train || !ep->compute.train) {
+            exit(1);
+        }
+        if (!ep->load.eval || !ep->proc.eval || !ep->compute.eval) {
+            exit(1);
+        }
+
+        // Scalar results start from zero; start_time is written later by start_train()/start_eval().
+        ep->throughput.train = 0.0;
+        ep->throughput.eval = 0.0;
+        ep->au.train = 0.0;
+        ep->au.eval = 0.0;
+        ep->observed_time.train = 0;
+        ep->observed_time.eval = 0;
+    }
+
+    // One cursor per epoch and metric: the index of the next free slot in the matching array.
+    last_load_train = calloc(config.EPOCHS, sizeof(uint32_t));
+    last_load_eval = calloc(config.EPOCHS, sizeof(uint32_t));
+    last_proc_train = calloc(config.EPOCHS, sizeof(uint32_t));
+    last_proc_eval = calloc(config.EPOCHS, sizeof(uint32_t));
+    last_compute_train = calloc(config.EPOCHS, sizeof(uint32_t));
+    last_compute_eval = calloc(config.EPOCHS, sizeof(uint32_t));
+
+    if (!last_load_train || !last_load_eval) {
+        exit(1);
+    }
+    if (!last_proc_train || !last_proc_eval) {
+        exit(1);
+    }
+    if (!last_compute_train || !last_compute_eval) {
+        exit(1);
+    }
+}
+
+void stats_finalize() {
+    // Releases everything stats_initialize() and prepare_data() allocated. global_stats
+    // only exists once prepare_data() has run, so a table that is still NULL is skipped
+    // instead of being dereferenced.
+    free(last_load_train);
+    free(last_load_eval);
+    free(last_proc_train);
+    free(last_proc_eval);
+    free(last_compute_train);
+    free(last_compute_eval);
+
+    epoch_data_t *tables[] = {stats, global_stats};
+    for (size_t t = 0; t < sizeof(tables) / sizeof(tables[0]); t++) {
+        if (tables[t] == NULL) {
+            continue;
+        }
+        for (uint32_t i = 0; i < config.EPOCHS; i++) {
+            free(tables[t][i].load.train);
+            free(tables[t][i].load.eval);
+            free(tables[t][i].proc.train);
+            free(tables[t][i].proc.eval);
+            free(tables[t][i].compute.train);
+            free(tables[t][i].compute.eval);
+        }
+        free(tables[t]);
+    }
+}
+
+void prepare_data() { // builds global_stats: per-epoch statistics summed over all ranks with MPI_Reduce, then divided by NUM_RANKS
+    global_stats = (struct epoch_data *)malloc(config.EPOCHS * sizeof(struct epoch_data));
+    if (global_stats == NULL) {
+        exit(1);
+    }
+
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        global_stats[i].load.train = (uint64_t*)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        if (global_stats[i].load.train == NULL) {
+            exit(1);
+        }
+        global_stats[i].load.eval = (uint64_t*)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        if (global_stats[i].load.eval == NULL) {
+            exit(1);
+        }
+        global_stats[i].proc.train = (uint64_t*)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        if (global_stats[i].proc.train == NULL) {
+            exit(1);
+        }
+        global_stats[i].proc.eval = (uint64_t*)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        if (global_stats[i].proc.eval == NULL) {
+            exit(1);
+        }
+        global_stats[i].compute.train = (uint64_t*)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        if (global_stats[i].compute.train == NULL) {
+            exit(1);
+        }
+        global_stats[i].compute.eval = (uint64_t*)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        if (global_stats[i].compute.eval == NULL) {
+            exit(1);
+        }
+
+        MPI_Reduce(stats[i].load.train, global_stats[i].load.train, TRAIN_MAX_STEPS, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD); // element-wise sum across ranks, delivered to root rank 0
+        MPI_Reduce(stats[i].load.eval, global_stats[i].load.eval, EVAL_MAX_STEPS, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(stats[i].proc.train, global_stats[i].proc.train, TRAIN_MAX_STEPS, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(stats[i].proc.eval, global_stats[i].proc.eval, EVAL_MAX_STEPS, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(&stats[i].au.train, &global_stats[i].au.train, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(&stats[i].au.eval, &global_stats[i].au.eval, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(&stats[i].throughput.train, &global_stats[i].throughput.train, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(&stats[i].throughput.eval, &global_stats[i].throughput.eval, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(stats[i].compute.train, global_stats[i].compute.train, TRAIN_MAX_STEPS, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(stats[i].compute.eval, global_stats[i].compute.eval, EVAL_MAX_STEPS, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(&stats[i].observed_time.train, &global_stats[i].observed_time.train, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(&stats[i].observed_time.eval, &global_stats[i].observed_time.eval, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        // Turn the sums into per-rank means; the uint64_t counters use truncating integer division.
+        for (int j = 0; j < TRAIN_MAX_STEPS; j++) {
+            global_stats[i].load.train[j] /= config.NUM_RANKS;
+            global_stats[i].proc.train[j] /= config.NUM_RANKS;
+            global_stats[i].compute.train[j] /= config.NUM_RANKS;
+        }
+
+        for (int j = 0; j < EVAL_MAX_STEPS; j++) {
+            global_stats[i].load.eval[j] /= config.NUM_RANKS;
+            global_stats[i].proc.eval[j] /= config.NUM_RANKS;
+            global_stats[i].compute.eval[j] /= config.NUM_RANKS;
+        }
+
+        global_stats[i].au.train /= config.NUM_RANKS; // NOTE(review): MPI_Reduce fills the receive side only on the root, so these scalars look meaningful on rank 0 only - confirm no other rank reads them
+        global_stats[i].au.eval /= config.NUM_RANKS;
+        global_stats[i].throughput.train /= config.NUM_RANKS;
+        global_stats[i].throughput.eval /= config.NUM_RANKS;
+        global_stats[i].observed_time.train /= config.NUM_RANKS;
+        global_stats[i].observed_time.eval /= config.NUM_RANKS;
+    }
+}
+
+void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time,
+                uint64_t *eval_metadata_time, uint64_t *eval_read_time) {
+    // Prints the run summary from global_stats as "metric, value" lines; the four arguments are totals scaled by 1e-6 for printing (presumably usec).
+    printf("metric, value\n");
+    printf("operation, dlio\n");
+    printf("ranks, %d\n", config.NUM_RANKS);
+//    printf("collective meta");
+//    printf("collective data");
+    // Train
+    printf("train compute time, \"");
+    uint64_t train_total_compute_time = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        uint64_t compute_time = 0; // same width as the summed counters, so it cannot truncate where long is 32-bit
+        for (uint32_t j = 0; j < TRAIN_MAX_STEPS; j++) {
+            compute_time += global_stats[i].compute.train[j];
+        }
+        train_total_compute_time += compute_time;
+        printf("%lf", compute_time / 1000000.0);
+        if (i != config.EPOCHS - 1) printf(", ");
+    }
+    printf("\"\ntrain total compute time, %lf\n", train_total_compute_time / 1000000.0);
+
+    uint64_t train_total_size_bytes = (uint64_t)config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE * config.RECORD_LENGTH;
+    printf("train total size, %llu\n", (unsigned long long)train_total_size_bytes); // %llu plus cast: %lu does not match uint64_t on every platform
+
+    printf("train total metadata time, %lf\n", *train_metadata_time / 1000000.0);
+    printf("train total raw read time, %lf\n", *train_read_time / 1000000.0);
+    printf("train total raw read rate, %lf\n", (double)train_total_size_bytes / *train_read_time * 1000000.0);
+
+    printf("train observed time, \"");
+    double train_total_observed_time = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        double observed_time = global_stats[i].observed_time.train / 1000000.0;
+        train_total_observed_time += observed_time;
+        printf("%lf", observed_time);
+        if (i != config.EPOCHS - 1) printf(", ");
+    }
+    printf("\"\ntrain total observed time, %lf\n", train_total_observed_time);
+
+    printf("train observed rate, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        uint64_t compute_time = 0;
+        for (uint32_t j = 0; j < TRAIN_MAX_STEPS; j++) {
+            compute_time += global_stats[i].compute.train[j];
+        }
+        printf("%lf", (double)train_total_size_bytes / (global_stats[i].observed_time.train - compute_time) * 1000000.0); // bytes per second of observed time not spent in compute
+        if (i != config.EPOCHS - 1) printf(", ");
+    }
+    printf("\"\n");
+
+    printf("train au percentage, \"");
+    double train_au_mean_percentage = 0.0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        train_au_mean_percentage += global_stats[i].au.train;
+        printf("%lf", global_stats[i].au.train);
+        if (i != config.EPOCHS - 1) printf(", ");
+    }
+    train_au_mean_percentage = train_au_mean_percentage / (double)config.EPOCHS;
+    printf("\"\ntrain au mean percentage, %lf\n", train_au_mean_percentage);
+    printf("train au meet expectation, %s\n", train_au_mean_percentage >= 100 * AU? "success": "fail");
+
+    double train_au_stdev_percentage = 0.0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        train_au_stdev_percentage += (global_stats[i].au.train - train_au_mean_percentage) * (global_stats[i].au.train - train_au_mean_percentage);
+    }
+    train_au_stdev_percentage = sqrt(train_au_stdev_percentage / (double)config.EPOCHS); // population standard deviation over epochs
+    printf("train au stdev percentage, %lf\n", train_au_stdev_percentage);
+
+    printf("train throughput samples per second, \"");
+    double train_throughput_mean_samples_per_second = 0.0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        train_throughput_mean_samples_per_second += global_stats[i].throughput.train;
+        printf("%lf", global_stats[i].throughput.train);
+        if (i != config.EPOCHS - 1) printf(", ");
+    }
+    train_throughput_mean_samples_per_second = train_throughput_mean_samples_per_second / (double)config.EPOCHS;
+    printf("\"\ntrain throughput mean samples per second, %lf\n", train_throughput_mean_samples_per_second);
+
+    double train_throughput_stdev_samples_per_second = 0.0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        train_throughput_stdev_samples_per_second += (global_stats[i].throughput.train - train_throughput_mean_samples_per_second) * (global_stats[i].throughput.train - train_throughput_mean_samples_per_second);
+    }
+    train_throughput_stdev_samples_per_second = sqrt(train_throughput_stdev_samples_per_second / (double)config.EPOCHS);
+    printf("train throughput stdev samples per second, %lf\n", train_throughput_stdev_samples_per_second);
+
+    double train_io_mean_MB_per_second = train_throughput_mean_samples_per_second * config.RECORD_LENGTH / 1024 / 1024; // samples/s times RECORD_LENGTH, divided by 1024 * 1024
+    printf("train io mean MB per second, %lf\n", train_io_mean_MB_per_second);
+
+    double train_io_stdev_MB_per_second = train_throughput_stdev_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
+    printf("train io stdev MB per second, %lf\n", train_io_stdev_MB_per_second);
+
+    // Evaluation
+    printf("eval compute time, \"");
+    uint64_t eval_total_compute_time = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        uint64_t compute_time = 0;
+        for (uint32_t j = 0; j < EVAL_MAX_STEPS; j++) {
+            compute_time += global_stats[i].compute.eval[j];
+        }
+        eval_total_compute_time += compute_time;
+        printf("%lf", compute_time / 1000000.0);
+        if (i != config.EPOCHS - 1) printf(", ");
+    }
+    printf("\"\neval total compute time, %lf\n", eval_total_compute_time / 1000000.0);
+
+    uint64_t eval_total_size_bytes = (uint64_t)config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE * config.RECORD_LENGTH;
+    printf("eval total size, %llu\n", (unsigned long long)eval_total_size_bytes);
+
+    printf("eval metadata time, %lf\n", *eval_metadata_time / 1000000.0);
+    printf("eval raw read time, %lf\n", *eval_read_time / 1000000.0);
+    printf("eval raw read rate, %lf\n", (double)eval_total_size_bytes / *eval_read_time * 1000000.0);
+
+    printf("eval observed time, \"");
+    double eval_total_observed_time = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        double observed_time = global_stats[i].observed_time.eval / 1000000.0;
+        eval_total_observed_time += observed_time;
+        printf("%lf", observed_time);
+        if (i != config.EPOCHS - 1) printf(", ");
+    }
+    printf("\"\neval total observed time, %lf\n", eval_total_observed_time);
+
+    printf("eval observed rate, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        uint64_t compute_time = 0;
+        for (uint32_t j = 0; j < EVAL_MAX_STEPS; j++) {
+            compute_time += global_stats[i].compute.eval[j];
+        }
+        printf("%lf", (double)eval_total_size_bytes / (global_stats[i].observed_time.eval - compute_time) * 1000000.0);
+        if (i != config.EPOCHS - 1) printf(", ");
+    }
+    printf("\"\n");
+
+    printf("eval au percentage, \"");
+    double eval_au_mean_percentage = 0.0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        eval_au_mean_percentage += global_stats[i].au.eval;
+        printf("%lf", global_stats[i].au.eval);
+        if (i != config.EPOCHS - 1) printf(", ");
+    }
+    eval_au_mean_percentage = eval_au_mean_percentage / (double)config.EPOCHS;
+    printf("\"\neval au mean percentage, %lf\n", eval_au_mean_percentage);
+    printf("eval au meet expectation, %s\n", eval_au_mean_percentage >= 100 * AU? "success": "fail");
+
+    double eval_au_stdev_percentage = 0.0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        eval_au_stdev_percentage += (global_stats[i].au.eval - eval_au_mean_percentage) * (global_stats[i].au.eval - eval_au_mean_percentage);
+    }
+    eval_au_stdev_percentage = sqrt(eval_au_stdev_percentage / (double)config.EPOCHS);
+    printf("eval au stdev percentage, %lf\n", eval_au_stdev_percentage);
+
+    printf("eval throughput samples per second, \"");
+    double eval_throughput_mean_samples_per_second = 0.0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        eval_throughput_mean_samples_per_second += global_stats[i].throughput.eval;
+        printf("%lf", global_stats[i].throughput.eval);
+        if (i != config.EPOCHS - 1) printf(", ");
+    }
+    eval_throughput_mean_samples_per_second = eval_throughput_mean_samples_per_second / (double)config.EPOCHS;
+    printf("\"\neval throughput mean samples per second, %lf\n", eval_throughput_mean_samples_per_second);
+
+    double eval_throughput_stdev_samples_per_second = 0.0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        eval_throughput_stdev_samples_per_second += (global_stats[i].throughput.eval - eval_throughput_mean_samples_per_second) * (global_stats[i].throughput.eval - eval_throughput_mean_samples_per_second);
+    }
+    eval_throughput_stdev_samples_per_second = sqrt(eval_throughput_stdev_samples_per_second / (double)config.EPOCHS);
+    printf("eval throughput stdev samples per second, %lf\n", eval_throughput_stdev_samples_per_second);
+
+    double eval_io_mean_MB_per_second = eval_throughput_mean_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
+    printf("eval io mean MB per second, %lf\n", eval_io_mean_MB_per_second);
+
+    double eval_io_stdev_MB_per_second = eval_throughput_stdev_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
+    printf("eval io stdev MB per second, %lf\n", eval_io_stdev_MB_per_second);
+}
+
+void batch_loaded_train(uint32_t epoch, uint64_t t0) { // stores the usec elapsed since t0 in this epoch's next free train-load slot
+    stats[epoch].load.train[last_load_train[epoch]++] = (get_time_usec() - t0); // post-increment advances the epoch's cursor; no bound check against TRAIN_MAX_STEPS here
+}
+
+void batch_processed_train(uint32_t epoch, uint64_t computation_time, uint64_t t0) { // records one finished train step for this epoch
+    stats[epoch].proc.train[last_proc_train[epoch]++] = (get_time_usec() - t0); // usec elapsed since t0
+    stats[epoch].compute.train[last_compute_train[epoch]++] = computation_time; // stored as given by the caller; summed later by end_train()
+}
+
+void batch_loaded_eval(uint32_t epoch, uint64_t t0) { // stores the usec elapsed since t0 in this epoch's next free eval-load slot
+    stats[epoch].load.eval[last_load_eval[epoch]++] = (get_time_usec() - t0); // post-increment advances the epoch's cursor; no bound check against EVAL_MAX_STEPS here
+}
+
+void batch_processed_eval(uint32_t epoch, uint64_t computation_time, uint64_t t0) { // records one finished eval step for this epoch
+    stats[epoch].proc.eval[last_proc_eval[epoch]++] = (get_time_usec() - t0); // usec elapsed since t0
+    stats[epoch].compute.eval[last_compute_eval[epoch]++] = computation_time; // stored as given by the caller; summed later by end_eval()
+}
+
+void start_train(uint32_t epoch) { // stamps the wall-clock start (usec) of this epoch's training phase; end_train() reads it back
+    stats[epoch].start_time.train = get_time_usec();
+}
+
+void end_train(uint32_t epoch) {
+    // Closes the training phase of `epoch` for this rank: stores the wall time elapsed since
+    // start_train(), the AU (logged compute time as a percentage of that elapsed time) and
+    // the throughput in samples per second (TRAIN_MAX_STEPS * BATCH_SIZE per elapsed time).
+    const uint64_t elapsed = get_time_usec() - stats[epoch].start_time.train;
+    uint64_t total_compute_time = 0;
+    for (uint32_t i = 0; i < TRAIN_MAX_STEPS; i++) {
+        total_compute_time += stats[epoch].compute.train[i];
+    }
+
+    // Record the elapsed time even when no compute time was logged, and never divide by a zero interval.
+    stats[epoch].observed_time.train = elapsed;
+    stats[epoch].au.train = elapsed > 0 ? (double)total_compute_time / (double)elapsed * 100 : 0.0;
+    stats[epoch].throughput.train =
+        elapsed > 0 ? (double)TRAIN_MAX_STEPS * config.BATCH_SIZE * 1000000.0 / (double)elapsed : 0.0;
+}
+
+void start_eval(uint32_t epoch) { // stamps the wall-clock start (usec) of this epoch's evaluation phase; end_eval() reads it back
+    stats[epoch].start_time.eval = get_time_usec();
+}
+
+void end_eval(uint32_t epoch) {
+    // Closes the evaluation phase of `epoch` for this rank: stores the wall time elapsed since
+    // start_eval(), the AU percentage and the throughput in samples per second.
+    const uint64_t elapsed = get_time_usec() - stats[epoch].start_time.eval;
+    uint64_t total_compute_time = 0;
+    for (uint32_t i = 0; i < EVAL_MAX_STEPS; i++) {
+        total_compute_time += stats[epoch].compute.eval[i];
+    }
+
+    // Record the elapsed time even when no compute time was logged, and never divide by a zero interval.
+    stats[epoch].observed_time.eval = elapsed;
+    stats[epoch].au.eval = elapsed > 0 ? (double)total_compute_time / (double)elapsed * 100 : 0.0;
+    stats[epoch].throughput.eval =
+        elapsed > 0 ? (double)EVAL_MAX_STEPS * config.BATCH_SIZE_EVAL * 1000000.0 / (double)elapsed : 0.0;
+}
+
diff --git a/dlio/stats.h b/dlio/stats.h
new file mode 100644
index 00000000..76cf89ce
--- /dev/null
+++ b/dlio/stats.h
@@ -0,0 +1,74 @@
+#ifndef SANDBOX_STATS_H
+#define SANDBOX_STATS_H
+
+struct load_data {
+    uint64_t *train;
+    uint64_t *eval;
+};
+
+struct proc_data {
+    uint64_t *train;
+    uint64_t *eval;
+};
+
+struct throughput_data {
+    double train;
+    double eval;
+};
+
+struct au_data {
+    double train;
+    double eval;
+};
+
+struct compute_data {
+    uint64_t *train;
+    uint64_t *eval;
+};
+
+struct start_time_data {
+    uint64_t train;
+    uint64_t eval;
+};
+
+struct observed_time_data {
+    uint64_t train;
+    uint64_t eval;
+};
+
+typedef struct epoch_data {
+    struct start_time_data start_time;
+    struct load_data load;
+    struct proc_data proc;
+    struct throughput_data throughput;
+    struct au_data au;
+    struct compute_data compute;
+    struct observed_time_data observed_time;
+} epoch_data_t;
+
+void stats_initialize();
+
+void stats_finalize();
+
+void prepare_data();
+
+void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time,
+                uint64_t *eval_metadata_time, uint64_t *eval_read_time);
+
+void batch_loaded_train(uint32_t epoch, uint64_t start_time);
+
+void batch_processed_train(uint32_t epoch, uint64_t computation_time, uint64_t start_time);
+
+void batch_loaded_eval(uint32_t epoch, uint64_t t0);
+
+void batch_processed_eval(uint32_t epoch, uint64_t computation_time, uint64_t t0);
+
+void start_train(uint32_t epoch);
+
+void end_train(uint32_t epoch);
+
+void start_eval(uint32_t epoch);
+
+void end_eval(uint32_t epoch);
+
+#endif //SANDBOX_STATS_H
diff --git a/dlio/utils.c b/dlio/utils.c
new file mode 100644
index 00000000..32211bc7
--- /dev/null
+++ b/dlio/utils.c
@@ -0,0 +1,229 @@
+#include <dirent.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "utils.h"
+
+uint64_t get_time_usec() { // current gettimeofday() reading, expressed in microseconds
+    struct timeval now;
+    gettimeofday(&now, NULL);
+    return (uint64_t)now.tv_sec * 1000000u + (uint64_t)now.tv_usec; // whole seconds scaled to usec, plus the sub-second part
+}
+
+config_datatype_t config = {
+    // Workflow
+    .DO_DATA_GENERATION = false,
+    .DO_TRAIN = false,
+    .DO_EVALUATION = false,
+
+    // Dataset
+    .RECORD_LENGTH = 67108864, // should be a square number
+//   .RECORD_LENGTH_STDEV = 0.0f,
+//   .RECORD_LENGTH_RESIZE = 0.0f,
+    .NUM_FILES_TRAIN = 64,
+    .NUM_FILES_EVAL = 8,
+    .NUM_SAMPLES_PER_FILE = 4,
+    .DATA_FOLDER = "./data",
+//    .NUM_SUBFOLDERS_TRAIN = 0,
+//    .NUM_SUBFOLDERS_EVAL = 0,
+    .FILE_PREFIX = "img",
+    .DO_COMPRESSION = false,
+    .COMPRESSION_LEVEL = 4,
+    .DO_CHUNKING = false,
+    .CHUNK_SIZE = 1024,  // should be greater than 120 on CLAIX23
+    .KEEP_FILES = false,
+
+    // Reader
+//    .DATA_LOADER = PYTORCH,
+    .BATCH_SIZE = 4,
+    .BATCH_SIZE_EVAL = 2,
+//    .READ_THREADS = 1,
+//    .COMPUTATION_THREADS = 1,
+//    .PREFETCH_SIZE = 0,
+    .DO_SHUFFLE = false,  // sample shuffle vs file_shuffle
+//    .TRANSFER_SIZE = 262144,
+    .PREPROCESS_TIME = 0.0f,
+    .PREPROCESS_TIME_STDEV = 0.000f,
+    // Train
+    .EPOCHS = 5,
+    .COMPUTATION_TIME = 0.323f,
+    .COMPUTATION_TIME_STDEV = 0.000f,
+//    .TOTAL_TRAINING_STEPS = -1
+    .SEED_CHANGE_EPOCH = false,
+    .RANDOM_SEED = 42,
+
+    // Evaluation
+    .EVAL_TIME = 0.323f,
+    .EVAL_TIME_STDEV = 0.000f,
+    .EPOCHS_BETWEEN_EVALS = 1,
+
+    // Output
+    .TRAIN_DATA_FOLDER = "train",
+    .VALID_DATA_FOLDER = "valid",
+    .RECORDS_DATASET_NAME = "records",
+    .LABELS_DATASET_NAME = "labels",
+};
+
+void create_directory(const char *folder) {
+    // Creates `folder` with mode 0700 unless stat() reports that the path already exists.
+    struct stat info = {0};
+    if (stat(folder, &info) != -1 || mkdir(folder, 0700) == 0) {
+        return; // already present, or freshly created
+    }
+    perror("Failed to create directory");
+    MPI_Abort(MPI_COMM_WORLD, 1); // mkdir failed: abort on MPI_COMM_WORLD with error code 1
+}
+
+void delete_directory(const char *dir_path) {
+    // Best-effort recursive removal of dir_path; failures are reported on stderr and the walk continues.
+    DIR *dir = opendir(dir_path);
+    if (dir == NULL) {
+        perror("Error opening directory");
+        return;
+    }
+    struct dirent *entry;
+    while ((entry = readdir(dir)) != NULL) {
+        if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
+            continue;
+        }
+        char path[1024];
+        int written = snprintf(path, sizeof(path), "%s/%s", dir_path, entry->d_name);
+        if (written < 0 || (size_t)written >= sizeof(path)) {
+            fprintf(stderr, "Path too long, skipping: %s/%s\n", dir_path, entry->d_name); // a truncated path could name another entry
+            continue;
+        }
+        // lstat() does not follow symlinks: a link to a directory is unlinked, its target is left alone.
+        struct stat statbuf;
+        if (lstat(path, &statbuf) != 0) {
+            perror("Error inspecting entry");
+        } else if (S_ISDIR(statbuf.st_mode)) {
+            delete_directory(path);
+        } else if (remove(path) != 0) {
+            perror("Error deleting file");
+        }
+    }
+
+    closedir(dir);
+
+    if (rmdir(dir_path) != 0) {
+        perror("Error deleting directory");
+    }
+}
+
+void shuffle(uint32_t *array, size_t n) {
+    // In-place Fisher-Yates shuffle driven by rand(); NULL or fewer than two elements is a no-op.
+    if (array == NULL || n <= 1) return;
+    for (size_t hi = n - 1; hi >= 1; hi--) {
+        size_t pick = rand() % (hi + 1); // swap partner drawn from [0, hi]
+        uint32_t held = array[hi];
+        array[hi] = array[pick];
+        array[pick] = held;
+    }
+}
+
+double generate_normal_random(float mean, float stdev) { // Box-Muller draw from a normal distribution with the given mean and stdev, using two rand() calls
+    double u1 = ((double) rand() + 1.0) / ((double) RAND_MAX + 1.0); // in (0, 1]: keeps log(u1) finite (u1 == 0 produced inf, or NaN when stdev is 0)
+    double u2 = (double) rand() / RAND_MAX;
+    double z0 = sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2); // standard normal variate
+    return z0 * stdev + mean;
+}
+
+// Returns the value that follows option argv[*i] and advances *i onto it. Exits with a
+// diagnostic when the option is the last argument, where the previous code would have
+// passed argv[argc] (a NULL pointer) to atoi()/atof() or stored it in the config.
+static char *option_value(int argc, char *argv[], int *i) {
+    if (*i + 1 >= argc) {
+        fprintf(stderr, "ERROR: missing value for option %s\n", argv[*i]);
+        exit(1);
+    }
+    return argv[++(*i)];
+}
+
+// Fills the global config from the command line. Bare flags switch a boolean on; every
+// other option consumes the next argument as its value (numbers are converted with
+// atoi()/atof(), strings are stored as pointers into argv, so argv must outlive config).
+// Options that are not given keep the defaults from the config initializer.
+// Unrecognized arguments are reported with a warning and otherwise ignored.
+void parse_args(int argc, char *argv[]) {
+    // argv[0] is the program name, so options start at index 1.
+    for (int i = 1; i < argc; i++) {
+        const char *opt = argv[i];
+
+        if (strcmp(opt, "--generate-data") == 0) {
+            config.DO_DATA_GENERATION = true;
+        } else if (strcmp(opt, "--train") == 0) {
+            config.DO_TRAIN = true;
+        } else if (strcmp(opt, "--evaluation") == 0) {
+            config.DO_EVALUATION = true;
+        } else if (strcmp(opt, "--record-length") == 0) {
+            config.RECORD_LENGTH = atoi(option_value(argc, argv, &i));
+        } else if (strcmp(opt, "--num-files-train") == 0) {
+            config.NUM_FILES_TRAIN = atoi(option_value(argc, argv, &i));
+        } else if (strcmp(opt, "--num-files-eval") == 0) {
+            config.NUM_FILES_EVAL = atoi(option_value(argc, argv, &i));
+        } else if (strcmp(opt, "--num-samples-per-file") == 0) {
+            config.NUM_SAMPLES_PER_FILE = atoi(option_value(argc, argv, &i));
+        } else if (strcmp(opt, "--data-folder") == 0) {
+            config.DATA_FOLDER = option_value(argc, argv, &i);
+        } else if (strcmp(opt, "--file-prefix") == 0) {
+            config.FILE_PREFIX = option_value(argc, argv, &i);
+        } else if (strcmp(opt, "--chunking") == 0) {
+            config.DO_CHUNKING = true;
+        } else if (strcmp(opt, "--chunk-size") == 0) {
+            config.CHUNK_SIZE = atoi(option_value(argc, argv, &i));
+        } else if (strcmp(opt, "--keep-files") == 0) {
+            config.KEEP_FILES = true;
+        } else if (strcmp(opt, "--compression") == 0) {
+            config.DO_COMPRESSION = true;
+        } else if (strcmp(opt, "--compression-level") == 0) {
+            config.COMPRESSION_LEVEL = atoi(option_value(argc, argv, &i));
+        } else if (strcmp(opt, "--batch-size") == 0) {
+            config.BATCH_SIZE = atoi(option_value(argc, argv, &i));
+        } else if (strcmp(opt, "--batch-size-eval") == 0) {
+            config.BATCH_SIZE_EVAL = atoi(option_value(argc, argv, &i));
+        } else if (strcmp(opt, "--shuffle") == 0) {
+            config.DO_SHUFFLE = true;
+        } else if (strcmp(opt, "--preprocess-time") == 0) {
+            config.PREPROCESS_TIME = atof(option_value(argc, argv, &i));
+        } else if (strcmp(opt, "--preprocess-time-stdev") == 0) {
+            config.PREPROCESS_TIME_STDEV = atof(option_value(argc, argv, &i));
+        } else if (strcmp(opt, "--epochs") == 0) {
+            // Used to be stored in PREPROCESS_TIME_STDEV, so --epochs never took effect.
+            config.EPOCHS = atoi(option_value(argc, argv, &i));
+        } else if (strcmp(opt, "--computation-time") == 0) {
+            config.COMPUTATION_TIME = atof(option_value(argc, argv, &i));
+        } else if (strcmp(opt, "--computation-time-stdev") == 0) {
+            config.COMPUTATION_TIME_STDEV = atof(option_value(argc, argv, &i));
+        } else if (strcmp(opt, "--random-seed") == 0) {
+            config.RANDOM_SEED = atoi(option_value(argc, argv, &i));
+        } else if (strcmp(opt, "--eval-time") == 0) {
+            // Used to overwrite RANDOM_SEED instead of setting the evaluation time.
+            config.EVAL_TIME = atof(option_value(argc, argv, &i));
+        } else if (strcmp(opt, "--eval-time-stdev") == 0) {
+            // Used to overwrite RANDOM_SEED instead of setting the evaluation time stdev.
+            config.EVAL_TIME_STDEV = atof(option_value(argc, argv, &i));
+        } else if (strcmp(opt, "--epochs-between-evals") == 0) {
+            config.EPOCHS_BETWEEN_EVALS = atoi(option_value(argc, argv, &i));
+        } else if (strcmp(opt, "--train-data-folder") == 0) {
+            config.TRAIN_DATA_FOLDER = option_value(argc, argv, &i);
+        } else if (strcmp(opt, "--valid-data-folder") == 0) {
+            config.VALID_DATA_FOLDER = option_value(argc, argv, &i);
+        } else if (strcmp(opt, "--records-dataset-name") == 0) {
+            config.RECORDS_DATASET_NAME = option_value(argc, argv, &i);
+        } else if (strcmp(opt, "--labels-dataset-name") == 0) {
+            config.LABELS_DATASET_NAME = option_value(argc, argv, &i);
+        } else if (strcmp(opt, "--seed-change-epoch") == 0) {
+            config.SEED_CHANGE_EPOCH = true;
+        } else {
+            // Unknown options are reported but do not stop the run.
+            printf("WARNING: %s not found\n", argv[i]);
+        }
+    }
+}
\ No newline at end of file
diff --git a/dlio/utils.h b/dlio/utils.h
new file mode 100644
index 00000000..9ff6937a
--- /dev/null
+++ b/dlio/utils.h
@@ -0,0 +1,81 @@
+#ifndef SANDBOX_UTILS_H
+#define SANDBOX_UTILS_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+// ------------------------------ H5bench utils ------------------------------
+
+uint64_t get_time_usec();
+
+// ---------------------------------------------------------------------------
+
+typedef struct config_datatype {
+    // Workflow
+    bool DO_DATA_GENERATION;
+    bool DO_TRAIN;
+    bool DO_EVALUATION;
+
+    // Dataset
+    uint32_t RECORD_LENGTH; // should be a square number
+//    float RECORD_LENGTH_STDEV;
+//    float RECORD_LENGTH_RESIZE;
+    uint32_t NUM_FILES_TRAIN;
+    uint32_t NUM_FILES_EVAL;
+    uint32_t NUM_SAMPLES_PER_FILE;
+    char *DATA_FOLDER;
+//    unsigned int NUM_SUBFOLDERS_TRAIN;
+//    unsigned int NUM_SUBFOLDERS_EVAL;
+    char *FILE_PREFIX;
+    bool DO_COMPRESSION;
+    uint32_t COMPRESSION_LEVEL;
+    bool DO_CHUNKING;
+    uint32_t CHUNK_SIZE; // should be a square number
+    bool KEEP_FILES;
+
+    // Reader
+//    DATA_LOADER;
+    uint32_t BATCH_SIZE;
+    uint32_t BATCH_SIZE_EVAL;
+//    int READ_THREADS;
+//    int COMPUTATION_THREADS;
+//    unsigned int PREFETCH_SIZE;
+    bool DO_SHUFFLE;  // sample shuffle vs file_shuffle
+//    unsigned int TRANSFER_SIZE;
+    float PREPROCESS_TIME;
+    float PREPROCESS_TIME_STDEV;
+    // Train
+    uint32_t EPOCHS;
+    float COMPUTATION_TIME;
+    float COMPUTATION_TIME_STDEV;
+//    long int TOTAL_TRAINING_STEPS = -1
+    bool SEED_CHANGE_EPOCH;
+    int RANDOM_SEED;
+
+    // Evaluation
+    float EVAL_TIME;
+    float EVAL_TIME_STDEV;
+    uint32_t EPOCHS_BETWEEN_EVALS;
+
+    // Output
+    char *TRAIN_DATA_FOLDER;
+    char *VALID_DATA_FOLDER;
+    char *RECORDS_DATASET_NAME;
+    char *LABELS_DATASET_NAME;
+
+    int NUM_RANKS;
+} config_datatype_t;
+
+extern config_datatype_t config;
+
+void shuffle(uint32_t *array, size_t n);
+
+double generate_normal_random(float mean, float stdev);
+
+void create_directory(const char *folder);
+
+void delete_directory(const char *dir_path);
+
+void parse_args(int argc, char *argv[]);
+
+#endif //SANDBOX_UTILS_H
diff --git a/samples/sync-dlio.json b/samples/sync-dlio.json
new file mode 100644
index 00000000..c7e22918
--- /dev/null
+++ b/samples/sync-dlio.json
@@ -0,0 +1,68 @@
+{
+  "mpi": {
+    "command": "srun",
+    "ranks": "8"
+  },
+  "vol": {
+
+  },
+  "file-system": {
+
+  },
+  "directory": "storage",
+  "benchmarks": [
+    {
+      "benchmark": "dlio",
+      "configuration": {
+        "generate-data": "true",
+        "chunking": "true",
+        "keep-files": "true",
+        "compression": "true",
+        "record-length": "67108864",
+        "num-files-train": "64",
+        "num-files-eval": "8",
+        "num-samples-per-file": "4",
+        "data-folder": "data",
+        "file-prefix": "img",
+        "chunk-size": "1024",
+        "compression-level": "4",
+        "random-seed": "42",
+        "train-data-folder": "train",
+        "valid-data-folder": "valid",
+        "records-dataset-name": "records",
+        "labels-dataset-name": "labels"
+      }
+    },
+    {
+      "benchmark": "dlio",
+      "configuration": {
+        "train": "true",
+        "evaluation": "true",
+        "keep-files": "true",
+        "shuffle": "true",
+        "seed-change-epoch": "true",
+        "record-length": "67108864",
+        "num-files-train": "64",
+        "num-files-eval": "8",
+        "num-samples-per-file": "4",
+        "data-folder": "./data",
+        "file-prefix": "img",
+        "batch-size": "4",
+        "batch-size-eval": "2",
+        "preprocess-time": "0.0",
+        "preprocess-time-stdev": "0.0",
+        "epochs": "5",
+        "computation-time": "0.323",
+        "computation-time-stdev": "0.0",
+        "random-seed": "42",
+        "eval-time": "0.323",
+        "eval-time-stdev": "0.323",
+        "epochs-between-evals": "1",
+        "train-data-folder": "train",
+        "valid-data-folder": "valid",
+        "records-dataset-name": "records",
+        "labels-dataset-name": "labels"
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/src/h5bench.py b/src/h5bench.py
index b50b7850..f44d19f2 100755
--- a/src/h5bench.py
+++ b/src/h5bench.py
@@ -34,6 +34,7 @@ class H5bench:
     H5BENCH_OPENPMD_READ = 'h5bench_openpmd_read'
     H5BENCH_E3SM = 'h5bench_e3sm'
     H5BENCH_MACSIO = 'h5bench_macsio'
+    H5BENCH_DLIO = 'h5bench_dlio'
 
     def __init__(self, setup, prefix=None, debug=None, abort=None, validate=None, filter=None):
         """Initialize the suite."""
@@ -245,6 +246,8 @@ def run(self):
                 self.run_e3sm(id, benchmark)
             elif name == 'macsio':
                 self.run_macsio(id, benchmark)
+            elif name == 'dlio':
+                self.run_dlio(id, benchmark)
             else:
                 self.logger.critical('{} - Unsupported benchmark/kernel')
 
@@ -952,6 +955,82 @@ def run_macsio(self, id, setup):
         except Exception as e:
             self.logger.error('Unable to run the benchmark: %s', e)
 
+    def run_dlio(self, id, setup):
+        """Run the DLIO benchmark."""
+        if not self.is_available(self.H5BENCH_DLIO):
+            self.logger.critical('{} is not available'.format(self.H5BENCH_DLIO))
+
+            sys.exit(os.EX_UNAVAILABLE)
+
+        try:
+            start = time.time()
+
+            configuration = setup['configuration']
+
+            parameters = []
+
+            parameters_binary = [
+                'generate-data',
+                'train',
+                'evaluation',
+                'chunking',
+                'keep-files',
+                'compression',
+                'shuffle',
+                'seed-change-epoch',
+            ]
+
+            # Create the configuration parameter list
+            for key in configuration:
+                if key in parameters_binary:
+                    if configuration[key].lower() == 'true':
+                        parameters.append('--{} '.format(key))
+                else:
+                    parameters.append('--{} {} '.format(key, configuration[key]))
+
+            if self.prefix:
+                benchmark_path = self.prefix + '/' + self.H5BENCH_DLIO
+            else:
+                if os.path.isfile(h5bench_configuration.__install__ + '/' + self.H5BENCH_DLIO):
+                    benchmark_path = h5bench_configuration.__install__ + '/' + self.H5BENCH_DLIO
+                else:
+                    benchmark_path = self.H5BENCH_DLIO
+
+            command = '{} {} {}'.format(
+                self.mpi,
+                benchmark_path,
+                ' '.join(parameters)
+            )
+
+            self.logger.info(command)
+
+            # Make sure the command line is in the correct format
+            arguments = shlex.split(command)
+
+            stdout_file_name = '{}/{}/stdout'.format(self.directory, id)
+            stderr_file_name = '{}/{}/stderr'.format(self.directory, id)
+
+            with open(stdout_file_name, mode='w') as stdout_file, open(stderr_file_name, mode='w') as stderr_file:
+                s = subprocess.Popen(arguments, stdout=stdout_file, stderr=stderr_file, env=self.vol_environment)
+                sOutput, sError = s.communicate()
+
+                if s.returncode == 0 and not self.check_for_hdf5_error(stderr_file_name):
+                    self.logger.info('SUCCESS (all output files are located at %s/%s)', self.directory, id)
+                else:
+                    self.logger.error('Return: %s (check %s for detailed log)', s.returncode, stderr_file_name)
+
+                    if self.abort:
+                        self.logger.critical('h5bench execution aborted upon first error')
+
+                        sys.exit(os.EX_SOFTWARE)
+
+            end = time.time()
+
+            self.logger.info('Runtime: {:.7f} seconds (elapsed time, includes allocation wait time)'.format(end - start))
+        except Exception as e:
+            self.logger.error('Unable to run the benchmark: %s', e)
+
+            sys.exit(os.EX_SOFTWARE)
 
 def main():
     PARSER = argparse.ArgumentParser(

From 89c62c26cd32ab26be2996b1a89f0bf25e4f99aa Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Wed, 19 Jun 2024 21:10:12 +0200
Subject: [PATCH 02/37] Add multithreading support

---
 CMakeLists.txt         |   6 +-
 dlio/README.md         |   3 +-
 dlio/h5bench_dlio.c    | 357 ++++++++++++++++++++++++++---------------
 dlio/h5bench_dlio.h    |  16 +-
 dlio/stats.c           |  54 ++++---
 dlio/utils.c           |  13 +-
 dlio/utils.h           |   4 +-
 dlio/workers.c         | 184 +++++++++++++++++++++
 dlio/workers.h         |  31 ++++
 samples/sync-dlio.json |   3 +-
 10 files changed, 506 insertions(+), 165 deletions(-)
 create mode 100644 dlio/workers.c
 create mode 100644 dlio/workers.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bed09d14..a1e3ec85 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -336,7 +336,11 @@ configure_file(${CMAKE_SOURCE_DIR}/src/h5bench_configuration.py.in ${CMAKE_BINAR
 # https://github.com/arcturus5340/h5bench
 
 if(H5BENCH_DLIO)
-    set(h5bench_dlio_src dlio/h5bench_dlio.c dlio/utils.c dlio/stats.c)
+    set(h5bench_dlio_src
+            dlio/h5bench_dlio.c
+            dlio/utils.c
+            dlio/stats.c
+            dlio/workers.c)
 
     add_executable(h5bench_dlio ${h5bench_dlio_src})
     target_link_libraries(h5bench_dlio h5bench_util hdf5 m MPI::MPI_C)
diff --git a/dlio/README.md b/dlio/README.md
index e4437276..460cce4e 100644
--- a/dlio/README.md
+++ b/dlio/README.md
@@ -20,7 +20,7 @@ The benchmark is designed to measure the performance of training and evaluation
 - ``--keep-files``: Does not delete data after the benchmark is finished [default: *1024*]
 - ``--compression``: Enable compression [default: *false*]
 - ``--compression-level <x>``: Compression level from 1 to 9 [default: *4*]
-- ``--batch-size <x>``: Training batch size [default: *4*]
+- ``--batch-size <x>``: Training batch size [default: *7*]
 - ``--batch-size-eval <x>``: Evaluation batch size [default: *2*]
 - ``--shuffle``: Enable samples shuffle [default: *false*]
 - ``--preprocess-time <x>``: Preprocessing time after reading each sample in seconds [default: *0.0*]
@@ -37,6 +37,7 @@ The benchmark is designed to measure the performance of training and evaluation
 - ``--records-dataset-name <x>``: Name of the dataset with records [default: *records*]
 - ``--labels-dataset-name <x>``: Name of the dataset with labels [default: *labels*]
 - ``--seed-change-epoch``: Enable seed changes every epoch [default: *false*]
+- ``--read-threads <x>``: The number of workers used to read the data [default: *4*]
 
 ### Exerciser Basics
 
diff --git a/dlio/h5bench_dlio.c b/dlio/h5bench_dlio.c
index 86ccd1e7..fd86c11f 100644
--- a/dlio/h5bench_dlio.c
+++ b/dlio/h5bench_dlio.c
@@ -5,9 +5,10 @@
 // - Add more DLIO features
 // - Add more data loaders: Tensorflow & dali
 // - Add prefetcher configuration?
-// - Add read_threads/computation_threads
+// - Add computation_threads only for Tensorflow
 // - Add file shuffle configuration
 // - Add more compression filters
+// - Add drop_last = False setting
 
 #include <assert.h>
 #include <hdf5.h>
@@ -16,11 +17,14 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/types.h>
+#include <sys/wait.h>
 #include <unistd.h>
 
 #include "h5bench_dlio.h"
 #include "stats.h"
 #include "utils.h"
+#include "workers.h"
 
 //#ifdef HAVE_SUBFILING
 //#include "H5FDsubfiling.h"
@@ -173,7 +177,7 @@ void read_sample(const char *file_path, uint32_t sample, uint64_t *metadata_time
     uint64_t t4 = get_time_usec();
     assert(status >= 0);
 
-    free(data);
+    free(data);  // TODO: free memory only after compute() call?
 
     uint64_t t5 = get_time_usec();
     H5Sclose(memspace);
@@ -184,6 +188,8 @@ void read_sample(const char *file_path, uint32_t sample, uint64_t *metadata_time
 
     *metadata_time_out = (t2 - t1) + (t6 - t5);
     *read_time_out = t4 - t3;
+
+    compute(config.PREPROCESS_TIME, config.PREPROCESS_TIME_STDEV);
 }
 
 uint64_t compute(float time, float time_stdev) {
@@ -195,156 +201,252 @@ uint64_t compute(float time, float time_stdev) {
     return 0;
 }
 
-void eval(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out) {
-    uint32_t total_samples = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE;
+void eval_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out) {
+    uint32_t batches_per_rank = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE_EVAL / NUM_RANKS;
+    uint32_t offset = MY_RANK * batches_per_rank; // index of this rank's first batch (batch units, not samples)
 
-    if (config.DO_SHUFFLE) {
-        shuffle(indices, total_samples);
-    }
+    uint64_t t0 = get_time_usec();
+    for (uint32_t i = 0; i < batches_per_rank; i++) {
+        for (uint32_t j = 0; j < config.BATCH_SIZE_EVAL; j++) {
+            uint32_t file_num = indices[(offset + i) * config.BATCH_SIZE_EVAL + j] / config.NUM_SAMPLES_PER_FILE + 1;
+            uint32_t sample_num = indices[(offset + i) * config.BATCH_SIZE_EVAL + j] % config.NUM_SAMPLES_PER_FILE;
+            char file_path[256];
+            snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.VALID_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_EVAL);
+
+            uint64_t metadata_time = 0, read_time = 0;
+            read_sample(file_path, sample_num, &metadata_time, &read_time);
+
+            *local_metadata_time_out += metadata_time;
+            *local_read_time_out += read_time;
+        }
 
-    uint32_t samples_per_rank = total_samples / NUM_RANKS;
-    uint32_t read_from = MY_RANK * samples_per_rank;
-    uint32_t read_to = (MY_RANK + 1) * samples_per_rank;
-    uint32_t read_counter = 0;
+        batch_loaded_eval(epoch, t0);
 
-    uint64_t t0 = get_time_usec();
-    for (uint32_t i = read_from; i < read_to; i++) {
-        uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1;
-        uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
-        char file_path[256];
-        snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.VALID_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_EVAL);
+        uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV);
+        batch_processed_eval(epoch, t, t0);
+        MPI_Barrier(MPI_COMM_WORLD);
 
-        uint64_t metadata_time = 0, read_time = 0;
-        read_sample(file_path, sample_num, &metadata_time, &read_time);
-        read_counter++;
-        compute(config.PREPROCESS_TIME, config.PREPROCESS_TIME_STDEV);
+        t0 = get_time_usec();
+    }
 
-        *local_metadata_time_out += metadata_time;
-        *local_read_time_out += read_time;
+//    TODO: drop_data = False
+//    for (uint32_t iteration = MY_RANK; iteration < total_samples - NUM_RANKS * samples_per_rank; iteration += NUM_RANKS) {
+//        uint32_t i = NUM_RANKS * samples_per_rank + iteration;
+//        uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1;
+//        uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
+//        char file_path[256];
+//        snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.VALID_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_EVAL);
+//
+//        uint64_t metadata_time = 0, read_time = 0;
+//        read_sample(file_path, sample_num, &metadata_time, &read_time);
+//        read_counter++;
+//
+//        *local_metadata_time_out += metadata_time;
+//        *local_read_time_out += read_time;
+//
+//        if (read_counter % config.BATCH_SIZE_EVAL == 0){
+//            batch_loaded_eval(epoch, t0);
+//
+//            uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV);
+//            batch_processed_eval(epoch, t, t0);
+//            read_counter = 0;
+//
+//            t0 = get_time_usec();
+//        }
+//    }
+//
+//    if (read_counter != 0) {
+//        batch_loaded_eval(epoch, t0);
+//
+//        uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV);
+//        batch_processed_eval(epoch, t, t0);
+//    }
+}
 
-        if (read_counter % config.BATCH_SIZE_EVAL == 0 && read_counter != 0) {
-            batch_loaded_eval(epoch, t0);
+void eval_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out) {
+    force_workers_to_shuffle(get_eval_read_fd(), get_eval_write_fd(), get_eval_system_fd());
 
-            uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV);
-            batch_processed_eval(epoch, t, t0);
-            read_counter = 0;
+    uint32_t batches_per_rank = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE_EVAL / NUM_RANKS;
+    uint32_t offset = MY_RANK * batches_per_rank;
 
-            t0 = get_time_usec();
-        }
+    for (uint32_t i = 0; i < (config.READ_THREADS > batches_per_rank? batches_per_rank: config.READ_THREADS); i++) {
+        int32_t batch = offset + i;
+        write(get_eval_write_fd(), &batch, sizeof(batch));
     }
 
-    for (uint32_t iteration = MY_RANK; iteration < total_samples - NUM_RANKS * samples_per_rank; iteration += NUM_RANKS) {
-        uint32_t i = NUM_RANKS * samples_per_rank + iteration;
-        uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1;
-        uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
-        char file_path[256];
-        snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.VALID_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_EVAL);
-
-        uint64_t metadata_time = 0, read_time = 0;
-        read_sample(file_path, sample_num, &metadata_time, &read_time);
-        read_counter++;
-        compute(config.PREPROCESS_TIME, config.PREPROCESS_TIME_STDEV);
+    for (uint32_t i = config.READ_THREADS; i < batches_per_rank; i++) {
+        execution_time_t data_from_child_process;
+        uint64_t t0 = get_time_usec();
+        read(get_eval_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
 
-        *local_metadata_time_out += metadata_time;
-        *local_read_time_out += read_time;
+        batch_loaded_eval(epoch, t0);
 
-        if (read_counter % config.BATCH_SIZE_EVAL == 0){
-            batch_loaded_eval(epoch, t0);
+        *local_metadata_time_out += data_from_child_process.metadata_time;
+        *local_read_time_out += data_from_child_process.read_time;
 
-            uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV);
-            batch_processed_eval(epoch, t, t0);
-            read_counter = 0;
+        int32_t batch = offset + i;
+        write(get_eval_write_fd(), &batch, sizeof(batch));
 
-            t0 = get_time_usec();
-        }
+        uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV); // (mean, stdev)
+        batch_processed_eval(epoch, t, t0);
+        MPI_Barrier(MPI_COMM_WORLD);
     }
 
-    if (read_counter != 0) {
+    for (uint32_t i = 0; i < (config.READ_THREADS > batches_per_rank? batches_per_rank: config.READ_THREADS); i++) {
+        execution_time_t data_from_child_process;
+        uint64_t t0 = get_time_usec();
+        read(get_eval_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
+
         batch_loaded_eval(epoch, t0);
 
-        uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV);
+        *local_metadata_time_out += data_from_child_process.metadata_time;
+        *local_read_time_out += data_from_child_process.read_time;
+
+        uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV); // (mean, stdev)
         batch_processed_eval(epoch, t, t0);
+        MPI_Barrier(MPI_COMM_WORLD);
     }
 }
 
-void train(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out) {
-    if (indices == NULL) return;
-    uint32_t total_samples = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE;
-
-    if (config.DO_SHUFFLE) {
-        shuffle(indices, total_samples);
+void eval(uint32_t epoch, uint32_t *indices, uint64_t *local_eval_metadata_time, uint64_t *local_eval_read_time, bool enable_multiprocessing) {
+    if (enable_multiprocessing) {
+        start_eval(epoch);
+        eval_using_workers(epoch, local_eval_metadata_time, local_eval_read_time);
+        end_eval(epoch);
+        return;
     }
 
-    uint32_t samples_per_rank = total_samples / NUM_RANKS;
-    uint32_t read_from = MY_RANK * samples_per_rank;
-    uint32_t read_to = (MY_RANK + 1) * samples_per_rank;
-    uint32_t read_counter = 0;
+    if (config.SEED_CHANGE_EPOCH) srand(config.RANDOM_SEED * 2 + epoch);
+    if (config.DO_SHUFFLE) shuffle(indices, config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE);
 
-    uint64_t t0 = get_time_usec();
-    for (uint32_t i = read_from; i < read_to; i++) {
-        uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1;
-        uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
-        char file_path[256];
-        snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_TRAIN);
+    start_eval(epoch);
+    eval_without_workers(epoch, indices, local_eval_metadata_time, local_eval_read_time);
+    end_eval(epoch);
+}
 
-        uint64_t metadata_time = 0, read_time = 0;
-        read_sample(file_path, sample_num, &metadata_time, &read_time);
-        read_counter++;
-        compute(config.PREPROCESS_TIME, config.PREPROCESS_TIME_STDEV);
+void train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out) {
+    uint32_t batches_per_rank = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE / NUM_RANKS;
+    uint32_t offset = MY_RANK * batches_per_rank; // index of this rank's first batch (batch units, not samples)
 
-        *local_metadata_time_out += metadata_time;
-        *local_read_time_out += read_time;
+    uint64_t t0 = get_time_usec();
+    for (uint32_t i = 0; i < batches_per_rank; i++) {
+        for (uint32_t j = 0; j < config.BATCH_SIZE; j++) {
+            uint32_t file_num = indices[(offset + i) * config.BATCH_SIZE + j] / config.NUM_SAMPLES_PER_FILE + 1;
+            uint32_t sample_num = indices[(offset + i) * config.BATCH_SIZE + j] % config.NUM_SAMPLES_PER_FILE;
+            char file_path[256];
+            snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_TRAIN);
+
+            uint64_t metadata_time = 0, read_time = 0;
+            read_sample(file_path, sample_num, &metadata_time, &read_time);
+
+            *local_metadata_time_out += metadata_time;
+            *local_read_time_out += read_time;
+        }
 
-        if (read_counter % config.BATCH_SIZE == 0 && read_counter != 0) {
-            batch_loaded_train(epoch, t0);
+        batch_loaded_train(epoch, t0);
 
-            uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
-            batch_processed_train(epoch, t, t0);
-            MPI_Barrier(MPI_COMM_WORLD);
+        uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
+        batch_processed_train(epoch, t, t0);
+        MPI_Barrier(MPI_COMM_WORLD);
 
-            read_counter = 0;
-            t0 = get_time_usec();
-        }
+        t0 = get_time_usec();
     }
 
-    for (uint32_t iteration = MY_RANK; iteration < total_samples - NUM_RANKS * samples_per_rank; iteration += NUM_RANKS) {
-        uint32_t i = NUM_RANKS * samples_per_rank + iteration;
-        uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1;
-        uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
-        char file_path[256];
-        snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_TRAIN);
+//    TODO: drop_data = True
+//    for (uint32_t iteration = MY_RANK; iteration < total_samples - NUM_RANKS * samples_per_rank; iteration += NUM_RANKS) {
+//        uint32_t i = NUM_RANKS * samples_per_rank + iteration;
+//        uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1;
+//        uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
+//        char file_path[256];
+//        snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_TRAIN);
+//
+//        uint64_t metadata_time = 0, read_time = 0;
+//        read_sample(file_path, sample_num, &metadata_time, &read_time);
+//        read_counter++;
+//
+//        *local_metadata_time_out += metadata_time;
+//        *local_read_time_out += read_time;
+//
+//        if (read_counter % config.BATCH_SIZE == 0){
+//            batch_loaded_train(epoch, t0);
+//
+//            uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
+//            batch_processed_train(epoch, t, t0);
+//
+//            read_counter = 0;
+//            t0 = get_time_usec();
+//        }
+//    }
+//
+//    if (read_counter != 0) {
+//        batch_loaded_train(epoch, t0);
+//
+//        uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
+//        batch_processed_train(epoch, t, t0);
+//    }
+}
 
-        uint64_t metadata_time = 0, read_time = 0;
-        read_sample(file_path, sample_num, &metadata_time, &read_time);
-        read_counter++;
-        compute(config.PREPROCESS_TIME, config.PREPROCESS_TIME_STDEV);
+void train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out) {
+    force_workers_to_shuffle(get_train_read_fd(), get_train_write_fd(), get_train_system_fd());
+    uint32_t batches_per_rank = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE / NUM_RANKS;
+    uint32_t offset = MY_RANK * batches_per_rank;
 
-        *local_metadata_time_out += metadata_time;
-        *local_read_time_out += read_time;
+    for (uint32_t i = 0; i < (config.READ_THREADS > batches_per_rank? batches_per_rank: config.READ_THREADS); i++) {
+        int32_t batch = offset + i;
+        write(get_train_write_fd(), &batch, sizeof(batch));
+    }
 
-        if (read_counter % config.BATCH_SIZE == 0){
-            batch_loaded_train(epoch, t0);
+    for (uint32_t i = config.READ_THREADS; i < batches_per_rank; i++) {
+        execution_time_t data_from_child_process;
+        uint64_t t0 = get_time_usec();
+        read(get_train_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
 
-            uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
-            batch_processed_train(epoch, t, t0);
+        batch_loaded_train(epoch, t0);
 
-            read_counter = 0;
-            t0 = get_time_usec();
-        }
+        *local_metadata_time_out += data_from_child_process.metadata_time;
+        *local_read_time_out += data_from_child_process.read_time;
+
+        int32_t batch = offset + i;
+        write(get_train_write_fd(), &batch, sizeof(batch));
+
+        uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
+        batch_processed_train(epoch, t, t0);
+        MPI_Barrier(MPI_COMM_WORLD);
     }
 
-    if (read_counter != 0) {
+    for (uint32_t i = 0; i < (config.READ_THREADS > batches_per_rank? batches_per_rank: config.READ_THREADS); i++) {
+        execution_time_t data_from_child_process;
+        uint64_t t0 = get_time_usec();
+        read(get_train_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
+
         batch_loaded_train(epoch, t0);
 
+        *local_metadata_time_out += data_from_child_process.metadata_time;
+        *local_read_time_out += data_from_child_process.read_time;
+
         uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
         batch_processed_train(epoch, t, t0);
+        MPI_Barrier(MPI_COMM_WORLD);
     }
+}
 
-    MPI_Barrier(MPI_COMM_WORLD);
+void train(uint32_t epoch, uint32_t *indices, uint64_t *local_train_metadata_time, uint64_t *local_train_read_time, bool enable_multiprocessing) {
+    if (enable_multiprocessing) {
+        start_train(epoch);
+        train_using_workers(epoch, local_train_metadata_time, local_train_read_time);
+        end_train(epoch);
+        return;
+    }
+
+    if (config.SEED_CHANGE_EPOCH) srand(config.RANDOM_SEED + epoch);
+    if (config.DO_SHUFFLE) shuffle(indices, config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE);
+
+    start_train(epoch);
+    train_without_workers(epoch, indices, local_train_metadata_time, local_train_read_time);
+    end_train(epoch);
 }
 
-void run(uint64_t *train_metadata_time, uint64_t *train_read_time,
-         uint64_t *eval_metadata_time, uint64_t *eval_read_time) {
+void run(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eval_metadata_time, uint64_t *eval_read_time) {
     uint32_t total_train_samples = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE;
     uint32_t *indices_train = (uint32_t *)malloc(total_train_samples * sizeof(uint32_t));
     if (indices_train == NULL) {
@@ -367,24 +469,27 @@ void run(uint64_t *train_metadata_time, uint64_t *train_read_time,
              local_eval_metadata_time = 0, local_eval_read_time = 0;
     uint32_t next_eval_epoch = config.EPOCHS_BETWEEN_EVALS;
 
+    bool enable_multiprocessing = config.READ_THREADS > 0;
+    if (enable_multiprocessing) {
+        init_workers(indices_train, indices_eval);
+    }
+
     MPI_Barrier(MPI_COMM_WORLD);
 
     for (uint32_t epoch = 0; epoch < config.EPOCHS; epoch++) {
 //        if (MY_RANK == 0) printf("New Epoch %u\n", epoch + 1);
-        if (config.SEED_CHANGE_EPOCH) srand(config.RANDOM_SEED + epoch);
 
-        start_train(epoch);
-        train(epoch, indices_train, &local_train_metadata_time, &local_train_read_time);
-        end_train(epoch);
+        train(epoch, indices_train, &local_train_metadata_time, &local_train_read_time, enable_multiprocessing);
+        MPI_Barrier(MPI_COMM_WORLD);
 
         if (config.DO_EVALUATION && (epoch + 1 >= next_eval_epoch)) {
+            eval(epoch, indices_eval, &local_eval_metadata_time, &local_eval_read_time, enable_multiprocessing);
             next_eval_epoch += config.EPOCHS_BETWEEN_EVALS;
-            start_eval(epoch);
-            eval(epoch, indices_eval, &local_eval_metadata_time, &local_eval_read_time);
-            end_eval(epoch);
+            MPI_Barrier(MPI_COMM_WORLD);
         }
-
-        MPI_Barrier(MPI_COMM_WORLD);
+    }
+    if (enable_multiprocessing) {
+        fin_workers();
     }
 
     MPI_Reduce(&local_train_metadata_time, train_metadata_time, 1, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
@@ -394,11 +499,11 @@ void run(uint64_t *train_metadata_time, uint64_t *train_read_time,
 
     free(indices_train);
     free(indices_eval);
-}
 
-void init(int num_ranks) {
-    config.NUM_RANKS = num_ranks;
+    MPI_Barrier(MPI_COMM_WORLD);
+}
 
+void init_global_variables() {
     DIM = (uint32_t)sqrt(config.RECORD_LENGTH);
     config.RECORD_LENGTH = DIM * DIM;
 
@@ -411,6 +516,8 @@ void init(int num_ranks) {
 
     srand(config.RANDOM_SEED);
 
+    // check if read_threads < batch size and print warning
+
     DCPL = H5Pcreate(H5P_DATASET_CREATE);
     if (config.DO_CHUNKING) {
         hsize_t chunk_dims[3] = {1, chunk_dimension, chunk_dimension};
@@ -460,24 +567,18 @@ int main(int argc, char *argv[]) {
         create_directory(dir_name);
         printf("OK\n");
     }
-
-    init(NUM_RANKS);
-
-    MPI_Barrier(MPI_COMM_WORLD);
+    init_global_variables();
 
     if (config.DO_DATA_GENERATION) {
         generate_data();
     }
 
-    MPI_Barrier(MPI_COMM_WORLD);
-
     if (config.DO_TRAIN) {
         // TODO: check files dimension if generate=no
         stats_initialize();
 
         uint64_t train_metadata_time = 0, train_read_time = 0, eval_metadata_time = 0, eval_read_time = 0;
         run(&train_metadata_time, &train_read_time, &eval_metadata_time, &eval_read_time);
-
         prepare_data();
 
         MPI_Reduce(MY_RANK == 0? MPI_IN_PLACE: &train_metadata_time, &train_metadata_time, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
@@ -485,10 +586,10 @@ int main(int argc, char *argv[]) {
         MPI_Reduce(MY_RANK == 0? MPI_IN_PLACE: &eval_metadata_time, &eval_metadata_time, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
         MPI_Reduce(MY_RANK == 0? MPI_IN_PLACE: &eval_read_time, &eval_read_time, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
 
-        train_metadata_time /= NUM_RANKS;
-        train_read_time /= NUM_RANKS;
-        eval_metadata_time /= NUM_RANKS;
-        eval_read_time /= NUM_RANKS;
+        train_metadata_time /= (uint64_t)NUM_RANKS * (config.READ_THREADS > 0? config.READ_THREADS: 1); // ranks * readers, never zero
+        train_read_time /= (uint64_t)NUM_RANKS * (config.READ_THREADS > 0? config.READ_THREADS: 1);
+        eval_metadata_time /= (uint64_t)NUM_RANKS * (config.READ_THREADS > 0? config.READ_THREADS: 1);
+        eval_read_time /= (uint64_t)NUM_RANKS * (config.READ_THREADS > 0? config.READ_THREADS: 1);
 
         MPI_Barrier(MPI_COMM_WORLD);
 
diff --git a/dlio/h5bench_dlio.h b/dlio/h5bench_dlio.h
index 959906ba..0ee9a57e 100644
--- a/dlio/h5bench_dlio.h
+++ b/dlio/h5bench_dlio.h
@@ -3,6 +3,8 @@
 
 #include <hdf5.h>
 
+extern int NUM_RANKS, MY_RANK;
+
 void generate_labels_dataset(hid_t file_id, hid_t filespace, hid_t memspace);
 
 void generate_records_dataset(hid_t file_id, hid_t filespace, hid_t memspace, hid_t extra_memspace);
@@ -16,13 +18,21 @@ void read_sample(const char *file_path, uint32_t sample, uint64_t *metadata_time
 
 uint64_t compute(float time, float time_stdev);
 
-void eval(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out);
+void eval_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out);
+
+void eval_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out);
+
+void eval(uint32_t epoch, uint32_t *indices, uint64_t *local_eval_metadata_time, uint64_t *local_eval_read_time, bool enable_multiprocessing);
 
-void train(uint32_t epoch,uint32_t *indices, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out);
+void train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out);
 
-void init(int num_ranks);
+void train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out);
+
+void train(uint32_t epoch, uint32_t *indices, uint64_t *local_train_metadata_time, uint64_t *local_train_read_time, bool enable_multiprocessing);
 
 void run(uint64_t *train_metadata_time, uint64_t *train_read_time,
          uint64_t *eval_metadata_time, uint64_t *eval_read_time);
 
+void init_global_variables();
+
 #endif //SANDBOX_H5BENCH_ML_READ_H
diff --git a/dlio/stats.c b/dlio/stats.c
index c987d0b0..3940d5d0 100644
--- a/dlio/stats.c
+++ b/dlio/stats.c
@@ -4,6 +4,7 @@
 #include <stdlib.h>
 #include <math.h>
 
+#include "h5bench_dlio.h"
 #include "stats.h"
 #include "utils.h"
 
@@ -24,13 +25,17 @@ double AU;
 void stats_initialize() {
     AU = 0.90;
 
-    uint32_t train_steps_count = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / (config.BATCH_SIZE * config.NUM_RANKS);
-    uint32_t train_steps_count_remainder = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE % (config.BATCH_SIZE * config.NUM_RANKS);
-    uint32_t eval_steps_count = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / (config.BATCH_SIZE_EVAL * config.NUM_RANKS);
-    uint32_t eval_steps_count_remainder = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE % (config.BATCH_SIZE_EVAL * config.NUM_RANKS);
+    uint32_t train_steps_count = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE / NUM_RANKS;
+    uint32_t train_steps_count_remainder = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE % (config.BATCH_SIZE * NUM_RANKS);
+    uint32_t eval_steps_count = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE_EVAL / NUM_RANKS;
+    uint32_t eval_steps_count_remainder = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE % (config.BATCH_SIZE_EVAL * NUM_RANKS);
 
-    TRAIN_MAX_STEPS = train_steps_count + (train_steps_count_remainder > 0);
-    EVAL_MAX_STEPS = eval_steps_count + (eval_steps_count_remainder > 0);
+    TRAIN_MAX_STEPS = train_steps_count;
+    EVAL_MAX_STEPS = eval_steps_count;
+
+//    TODO: drop_last = False
+//    TRAIN_MAX_STEPS = train_steps_count + (train_steps_count_remainder > 0);
+//    EVAL_MAX_STEPS = eval_steps_count + (eval_steps_count_remainder > 0);
 
     stats = (struct epoch_data *)malloc(config.EPOCHS * sizeof(struct epoch_data));
     if (stats == NULL) {
@@ -170,23 +175,23 @@ void prepare_data() {
         MPI_Reduce(&stats[i].observed_time.eval, &global_stats[i].observed_time.eval, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
 
         for (int j = 0; j < TRAIN_MAX_STEPS; j++) {
-            global_stats[i].load.train[j] /= config.NUM_RANKS;
-            global_stats[i].proc.train[j] /= config.NUM_RANKS;
-            global_stats[i].compute.train[j] /= config.NUM_RANKS;
+            global_stats[i].load.train[j] /= NUM_RANKS;
+            global_stats[i].proc.train[j] /= NUM_RANKS;
+            global_stats[i].compute.train[j] /= NUM_RANKS;
         }
 
         for (int j = 0; j < EVAL_MAX_STEPS; j++) {
-            global_stats[i].load.eval[j] /= config.NUM_RANKS;
-            global_stats[i].proc.eval[j] /= config.NUM_RANKS;
-            global_stats[i].compute.eval[j] /= config.NUM_RANKS;
+            global_stats[i].load.eval[j] /= NUM_RANKS;
+            global_stats[i].proc.eval[j] /= NUM_RANKS;
+            global_stats[i].compute.eval[j] /= NUM_RANKS;
         }
 
-        global_stats[i].au.train /= config.NUM_RANKS;
-        global_stats[i].au.eval /= config.NUM_RANKS;
-        global_stats[i].throughput.train /= config.NUM_RANKS;
-        global_stats[i].throughput.eval /= config.NUM_RANKS;
-        global_stats[i].observed_time.train /= config.NUM_RANKS;
-        global_stats[i].observed_time.eval /= config.NUM_RANKS;
+        global_stats[i].au.train /= NUM_RANKS;
+        global_stats[i].au.eval /= NUM_RANKS;
+        global_stats[i].throughput.train /= NUM_RANKS;
+        global_stats[i].throughput.eval /= NUM_RANKS;
+        global_stats[i].observed_time.train /= NUM_RANKS;
+        global_stats[i].observed_time.eval /= NUM_RANKS;
     }
 }
 
@@ -195,7 +200,7 @@ void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time,
 
     printf("metric, value\n");
     printf("operation, dlio\n");
-    printf("ranks, %d\n", config.NUM_RANKS);
+    printf("ranks, %d\n", NUM_RANKS);
 //    printf("collective meta");
 //    printf("collective data");
     // Train
@@ -212,7 +217,9 @@ void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time,
     }
     printf("\"\ntrain total compute time, %lf\n", train_total_compute_time / 1000000.0);
 
-    uint64_t train_total_size_bytes = (uint64_t)config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE * config.RECORD_LENGTH;
+    // TODO: drop_last = False
+    uint64_t train_total_batches = (uint64_t)config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE / NUM_RANKS * NUM_RANKS;
+    uint64_t train_total_size_bytes = train_total_batches * config.BATCH_SIZE * config.RECORD_LENGTH; // batches * samples per train batch * bytes per sample
     printf("train total size, %lu\n", train_total_size_bytes);
 
     printf("train total metadata time, %lf\n", *train_metadata_time / 1000000.0);
@@ -294,8 +301,9 @@ void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time,
         if (i != config.EPOCHS - 1) printf(", ");
     }
     printf("\"\neval total compute time, %lf\n", eval_total_compute_time / 1000000.0);
-
-    uint64_t eval_total_size_bytes = (uint64_t)config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE * config.RECORD_LENGTH;
+    // TODO: drop_last = False
+    uint64_t eval_total_batches = (uint64_t)config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE_EVAL / NUM_RANKS * NUM_RANKS;
+    uint64_t eval_total_size_bytes = eval_total_batches * config.BATCH_SIZE_EVAL * config.RECORD_LENGTH; // batches * samples per eval batch * bytes per sample
     printf("eval total size, %lu\n", eval_total_size_bytes);
 
     printf("eval metadata time, %lf\n", *eval_metadata_time / 1000000.0);
@@ -418,7 +426,7 @@ void end_eval(uint32_t epoch) {
     }
     if (total_compute_time > 0) {
         stats[epoch].observed_time.eval = end_time - stats[epoch].start_time.eval;
-        au = (double)total_compute_time / (double)stats[epoch].observed_time.eval;
+        au = (double)total_compute_time / stats[epoch].observed_time.eval;
     }
     stats[epoch].au.eval = au * 100;
     stats[epoch].throughput.eval = (double)EVAL_MAX_STEPS * config.BATCH_SIZE_EVAL * 1000000.0 / (end_time - stats[epoch].start_time.eval);
diff --git a/dlio/utils.c b/dlio/utils.c
index 32211bc7..a0fb38d4 100644
--- a/dlio/utils.c
+++ b/dlio/utils.c
@@ -42,9 +42,9 @@ config_datatype_t config = {
 
     // Reader
 //    .DATA_LOADER = PYTORCH,
-    .BATCH_SIZE = 4,
+    .BATCH_SIZE = 7,
     .BATCH_SIZE_EVAL = 2,
-//    .READ_THREADS = 1,
+    .READ_THREADS = 4,
 //    .COMPUTATION_THREADS = 1,
 //    .PREFETCH_SIZE = 0,
     .DO_SHUFFLE = false,  // sample shuffle vs file_shuffle
@@ -189,7 +189,7 @@ void parse_args(int argc, char *argv[]) {
             config.PREPROCESS_TIME_STDEV = atof(argv[i]);
         } else if (strcmp(argv[i], "--epochs") == 0) {
             i++;
-            config.PREPROCESS_TIME_STDEV = atoi(argv[i]);
+            config.EPOCHS = atoi(argv[i]);
         } else if (strcmp(argv[i], "--computation-time") == 0) {
             i++;
             config.COMPUTATION_TIME = atof(argv[i]);
@@ -201,10 +201,10 @@ void parse_args(int argc, char *argv[]) {
             config.RANDOM_SEED = atoi(argv[i]);
         } else if (strcmp(argv[i], "--eval-time") == 0) {
             i++;
-            config.RANDOM_SEED = atof(argv[i]);
+            config.EVAL_TIME = atof(argv[i]);
         } else if (strcmp(argv[i], "--eval-time-stdev") == 0) {
             i++;
-            config.RANDOM_SEED = atof(argv[i]);
+            config.EVAL_TIME_STDEV = atof(argv[i]);
         } else if (strcmp(argv[i], "--epochs-between-evals") == 0) {
             i++;
             config.EPOCHS_BETWEEN_EVALS = atoi(argv[i]);
@@ -222,6 +222,9 @@ void parse_args(int argc, char *argv[]) {
             config.LABELS_DATASET_NAME = argv[i];
         } else if (strcmp(argv[i], "--seed-change-epoch") == 0) {
             config.SEED_CHANGE_EPOCH = true;
+        } else if (strcmp(argv[i], "--read-threads") == 0) {
+            i++;
+            config.READ_THREADS = atoi(argv[i]);
         } else {
             printf("WARNING: %s not found\n", argv[i]);
         }
diff --git a/dlio/utils.h b/dlio/utils.h
index 9ff6937a..cdbb4580 100644
--- a/dlio/utils.h
+++ b/dlio/utils.h
@@ -37,7 +37,7 @@ typedef struct config_datatype {
 //    DATA_LOADER;
     uint32_t BATCH_SIZE;
     uint32_t BATCH_SIZE_EVAL;
-//    int READ_THREADS;
+    uint32_t READ_THREADS;
 //    int COMPUTATION_THREADS;
 //    unsigned int PREFETCH_SIZE;
     bool DO_SHUFFLE;  // sample shuffle vs file_shuffle
@@ -62,8 +62,6 @@ typedef struct config_datatype {
     char *VALID_DATA_FOLDER;
     char *RECORDS_DATASET_NAME;
     char *LABELS_DATASET_NAME;
-
-    int NUM_RANKS;
 } config_datatype_t;
 
 extern config_datatype_t config;
diff --git a/dlio/workers.c b/dlio/workers.c
new file mode 100644
index 00000000..b183d784
--- /dev/null
+++ b/dlio/workers.c
@@ -0,0 +1,184 @@
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/wait.h>
+
+#include "h5bench_dlio.h"
+#include "workers.h"
+#include "utils.h"
+
+int pipe_train_task_fd[2], pipe_train_result_fd[2], pipe_eval_task_fd[2], pipe_eval_result_fd[2];
+int pipe_train_system_fd[2], pipe_eval_system_fd[2];
+
+void init_workers(uint32_t *indices_train, uint32_t *indices_eval) {
+    if ((pipe(pipe_train_system_fd) == -1) || (pipe(pipe_train_task_fd) == -1) || (pipe(pipe_train_result_fd) == -1)) {
+        perror("pipe");
+        exit(EXIT_FAILURE);
+    }
+
+    for (uint32_t i = 0; i < config.READ_THREADS; i++) {
+        pid_t pid = fork();
+        if (pid == -1) {
+            perror("fork");
+            exit(EXIT_FAILURE);
+        } else if (pid == 0) {
+            close(pipe_train_task_fd[1]);
+            close(pipe_train_result_fd[0]);
+            close(pipe_train_system_fd[1]);
+
+            run_worker(indices_train, pipe_train_task_fd, pipe_train_result_fd, pipe_train_system_fd, true);
+
+            close(pipe_train_task_fd[0]);
+            close(pipe_train_result_fd[1]);
+            close(pipe_train_system_fd[0]);
+            exit(EXIT_SUCCESS);
+        }
+    }
+
+    if (config.DO_EVALUATION) {
+        if ((pipe(pipe_eval_system_fd) == -1) || (pipe(pipe_eval_task_fd) == -1) || (pipe(pipe_eval_result_fd) == -1)) {
+            perror("pipe");
+            exit(EXIT_FAILURE);
+        }
+
+        for (uint32_t i = 0; i < config.READ_THREADS; i++) {
+            pid_t pid = fork();
+            if (pid == -1) {
+                perror("fork");
+                exit(EXIT_FAILURE);
+            } else if (pid == 0) {
+                close(pipe_eval_task_fd[1]);
+                close(pipe_eval_result_fd[0]);
+                close(pipe_eval_system_fd[1]);
+
+                run_worker(indices_eval, pipe_eval_task_fd, pipe_eval_result_fd, pipe_eval_system_fd, false);
+
+                close(pipe_eval_task_fd[0]);
+                close(pipe_eval_result_fd[1]);
+                close(pipe_eval_system_fd[0]);
+                exit(EXIT_SUCCESS);
+            }
+        }
+
+        close(pipe_eval_task_fd[0]);
+        close(pipe_eval_result_fd[1]);
+        close(pipe_eval_system_fd[0]);
+    }
+
+    close(pipe_train_task_fd[0]);
+    close(pipe_train_result_fd[1]);
+    close(pipe_train_system_fd[0]);
+}
+
+int get_train_read_fd() {
+    return pipe_train_result_fd[0];
+}
+
+int get_eval_read_fd() {
+    return pipe_eval_result_fd[0];
+}
+
+int get_train_write_fd() {
+    return pipe_train_task_fd[1];
+}
+
+int get_eval_write_fd() {
+    return pipe_eval_task_fd[1];
+}
+
+int get_train_system_fd() {
+    return pipe_train_system_fd[1];
+}
+
+int get_eval_system_fd() {
+    return pipe_eval_system_fd[1];
+}
+
+// Close the parent's pipe ends and reap every worker process forked by init_workers().
+void fin_workers() {
+    close(pipe_train_task_fd[1]);
+    close(pipe_train_result_fd[0]);
+    close(pipe_train_system_fd[1]);
+
+    if (config.DO_EVALUATION) { // eval pipes are only created when DO_EVALUATION is set
+        close(pipe_eval_task_fd[1]);
+        close(pipe_eval_result_fd[0]);
+        close(pipe_eval_system_fd[1]);
+    }
+
+    for (uint32_t i = 0; i < config.READ_THREADS; i++) {
+        wait(NULL);
+    }
+
+    if (config.DO_EVALUATION) {
+        for (uint32_t i = 0; i < config.READ_THREADS; i++) {
+            wait(NULL);
+        }
+    }
+}
+
+void force_workers_to_shuffle(int read_fd, int write_fd, int system_fd) {
+    int32_t shuffle_code = -1;
+    for (uint32_t i = 0; i < config.READ_THREADS; i++) {
+        write(write_fd, &shuffle_code, sizeof(shuffle_code));
+    }
+
+    for (uint32_t i = 0; i < config.READ_THREADS; i++) {
+        read(read_fd, &shuffle_code, sizeof(shuffle_code));
+    }
+
+    for (uint32_t i = 0; i < config.READ_THREADS; i++) {
+        write(system_fd, &shuffle_code, sizeof(shuffle_code));
+    }
+}
+
+void run_worker(uint32_t *indices, int pipe_task_fd[2], int pipe_result_fd[2], int pipe_system_fd[2], bool is_train_worker) {
+    int32_t batch = 0, current_epoch = 0;
+    while (read(pipe_task_fd[0], &batch, sizeof(batch)) > 0) {
+        // A new epoch has begun
+        if (batch == -1) {
+            if (config.SEED_CHANGE_EPOCH) {
+                srand(config.RANDOM_SEED * (is_train_worker? 1: 2) + current_epoch);
+            }
+            if (config.DO_SHUFFLE) {
+                shuffle(indices, config.NUM_SAMPLES_PER_FILE * (is_train_worker? config.NUM_FILES_TRAIN: config.NUM_FILES_EVAL));
+            }
+            current_epoch++;
+            write(pipe_result_fd[1], &batch, sizeof(batch));
+            read(pipe_system_fd[0], &batch, sizeof(batch));
+            continue;
+        }
+
+        uint32_t read_from = batch * (is_train_worker? config.BATCH_SIZE: config.BATCH_SIZE_EVAL);
+        uint32_t read_to = (batch + 1) * (is_train_worker? config.BATCH_SIZE: config.BATCH_SIZE_EVAL);
+        uint64_t process_metadata_time = 0, process_read_time = 0;
+
+        for (uint32_t i = read_from; i < read_to; i++) {
+            uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1;
+            uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
+            char file_path[256];
+            snprintf(
+                file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5",
+                 config.DATA_FOLDER,
+                 is_train_worker? config.TRAIN_DATA_FOLDER: config.VALID_DATA_FOLDER,
+                 config.FILE_PREFIX,
+                 file_num,
+                 is_train_worker? config.NUM_FILES_TRAIN: config.NUM_FILES_EVAL
+            );
+
+            uint64_t metadata_time = 0, read_time = 0;
+            read_sample(file_path, sample_num, &metadata_time, &read_time);
+
+            process_metadata_time += metadata_time;
+            process_read_time += read_time;
+        }
+
+        execution_time_t data = {
+            .metadata_time = process_metadata_time,
+            .read_time = process_read_time,
+        };
+
+        write(pipe_result_fd[1], &data, sizeof(data));
+    }
+}
diff --git a/dlio/workers.h b/dlio/workers.h
new file mode 100644
index 00000000..61ed4ac1
--- /dev/null
+++ b/dlio/workers.h
@@ -0,0 +1,31 @@
+#ifndef H5BENCH_WORKERS_H
+#define H5BENCH_WORKERS_H
+
+#include <stdint.h>
+
+typedef struct execution_time {
+    uint64_t metadata_time;
+    uint64_t read_time;
+} execution_time_t;
+
+void init_workers(uint32_t *indices_train, uint32_t *indices_eval);
+
+int get_train_read_fd();
+
+int get_eval_read_fd();
+
+int get_train_write_fd();
+
+int get_eval_write_fd();
+
+int get_train_system_fd();
+
+int get_eval_system_fd();
+
+void fin_workers();
+
+void force_workers_to_shuffle(int read_fd, int write_fd, int system_fd);
+
+void run_worker(uint32_t *indices, int pipe_task_fd[2], int pipe_result_fd[2], int pipe_system_fd[2], bool is_train_worker);
+
+#endif // H5BENCH_WORKERS_H
diff --git a/samples/sync-dlio.json b/samples/sync-dlio.json
index c7e22918..d454cd37 100644
--- a/samples/sync-dlio.json
+++ b/samples/sync-dlio.json
@@ -47,8 +47,9 @@
         "num-samples-per-file": "4",
         "data-folder": "./data",
         "file-prefix": "img",
-        "batch-size": "4",
+        "batch-size": "7",
         "batch-size-eval": "2",
+        "read-threads": "4",
         "preprocess-time": "0.0",
         "preprocess-time-stdev": "0.0",
         "epochs": "5",

From 8bfec7c1783c0ca0cc3f7624b730374cdfa884e1 Mon Sep 17 00:00:00 2001
From: github-actions <github-actions[bot]@users.noreply.github.com>
Date: Wed, 19 Jun 2024 19:22:55 +0000
Subject: [PATCH 03/37] Committing clang-format changes

---
 dlio/h5bench_dlio.c | 382 ++++++++++++++++++++++++++------------------
 dlio/h5bench_dlio.h |  21 ++-
 dlio/stats.c        | 252 ++++++++++++++++++-----------
 dlio/stats.h        |  18 +--
 dlio/utils.c        | 197 ++++++++++++++---------
 dlio/utils.h        |  42 ++---
 dlio/workers.c      |  81 ++++++----
 dlio/workers.h      |   3 +-
 8 files changed, 603 insertions(+), 393 deletions(-)

diff --git a/dlio/h5bench_dlio.c b/dlio/h5bench_dlio.c
index fd86c11f..5bf4eee1 100644
--- a/dlio/h5bench_dlio.c
+++ b/dlio/h5bench_dlio.c
@@ -34,16 +34,19 @@
 #define GENERATION_BUFFER_SIZE 2 * 1073741824lu
 
 // Global variables
-int NUM_RANKS, MY_RANK;
+int      NUM_RANKS, MY_RANK;
 uint32_t GENERATION_SIZE;
 uint32_t DIM;
-hid_t DCPL, FAPL, DAPL, DXPL;
+hid_t    DCPL, FAPL, DAPL, DXPL;
 
-void generate_labels_dataset(hid_t file_id, hid_t filespace, hid_t memspace) {
-    hid_t dataset_id = H5Dcreate(file_id, config.LABELS_DATASET_NAME, H5T_STD_I64LE, filespace, H5P_DEFAULT, H5P_DEFAULT, DAPL);
+void
+generate_labels_dataset(hid_t file_id, hid_t filespace, hid_t memspace)
+{
+    hid_t dataset_id = H5Dcreate(file_id, config.LABELS_DATASET_NAME, H5T_STD_I64LE, filespace, H5P_DEFAULT,
+                                 H5P_DEFAULT, DAPL);
     assert(dataset_id >= 0);
 
-    uint64_t *data = (uint64_t*)malloc(config.NUM_SAMPLES_PER_FILE * sizeof(uint64_t));
+    uint64_t *data = (uint64_t *)malloc(config.NUM_SAMPLES_PER_FILE * sizeof(uint64_t));
     if (data == NULL) {
         exit(1);
     }
@@ -52,7 +55,7 @@ void generate_labels_dataset(hid_t file_id, hid_t filespace, hid_t memspace) {
     }
 
     hsize_t offset[1] = {0};
-    hsize_t dims[1] = {config.NUM_SAMPLES_PER_FILE};
+    hsize_t dims[1]   = {config.NUM_SAMPLES_PER_FILE};
     H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, dims, NULL);
     herr_t status = H5Dwrite(dataset_id, H5T_STD_I64LE, memspace, filespace, DXPL, data);
     assert(status >= 0);
@@ -61,11 +64,14 @@ void generate_labels_dataset(hid_t file_id, hid_t filespace, hid_t memspace) {
     H5Dclose(dataset_id);
 }
 
-void generate_records_dataset(hid_t file_id, hid_t filespace, hid_t memspace, hid_t extra_memspace) {
-    hid_t dataset_id = H5Dcreate(file_id, config.RECORDS_DATASET_NAME, H5T_STD_U8LE, filespace, H5P_DEFAULT, DCPL, DAPL);
+void
+generate_records_dataset(hid_t file_id, hid_t filespace, hid_t memspace, hid_t extra_memspace)
+{
+    hid_t dataset_id =
+        H5Dcreate(file_id, config.RECORDS_DATASET_NAME, H5T_STD_U8LE, filespace, H5P_DEFAULT, DCPL, DAPL);
     assert(dataset_id >= 0);
 
-    uint8_t *data = (uint8_t*)malloc(GENERATION_SIZE * sizeof(uint8_t));
+    uint8_t *data = (uint8_t *)malloc(GENERATION_SIZE * sizeof(uint8_t));
     if (data == NULL) {
         exit(1);
     }
@@ -77,7 +83,7 @@ void generate_records_dataset(hid_t file_id, hid_t filespace, hid_t memspace, hi
     uint32_t extra_elements = (config.RECORD_LENGTH * config.NUM_SAMPLES_PER_FILE) % GENERATION_SIZE;
 
     hsize_t offset[3] = {0, 0, 0};
-    hsize_t dims[3] = {config.NUM_SAMPLES_PER_FILE, DIM, DIM};
+    hsize_t dims[3]   = {config.NUM_SAMPLES_PER_FILE, DIM, DIM};
 
     for (uint32_t i = 0; i < num_iterations; i++) {
         offset[0] = i * config.RECORD_LENGTH * config.NUM_SAMPLES_PER_FILE;
@@ -88,7 +94,7 @@ void generate_records_dataset(hid_t file_id, hid_t filespace, hid_t memspace, hi
 
     if (extra_elements > 0) {
         hsize_t extra_count[3] = {extra_elements, DIM, DIM};
-        offset[0] = num_iterations * config.RECORD_LENGTH * config.NUM_SAMPLES_PER_FILE;
+        offset[0]              = num_iterations * config.RECORD_LENGTH * config.NUM_SAMPLES_PER_FILE;
         H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, extra_count, NULL);
         herr_t status = H5Dwrite(dataset_id, H5T_STD_U8LE, extra_memspace, filespace, DXPL, data);
         assert(status >= 0);
@@ -98,8 +104,10 @@ void generate_records_dataset(hid_t file_id, hid_t filespace, hid_t memspace, hi
     H5Dclose(dataset_id);
 }
 
-void generate_file(const char *file_name, hid_t labels_filespace, hid_t labels_memspace,
-                   hid_t records_filespace, hid_t records_memspace, hid_t extra_records_memspace) {
+void
+generate_file(const char *file_name, hid_t labels_filespace, hid_t labels_memspace, hid_t records_filespace,
+              hid_t records_memspace, hid_t extra_records_memspace)
+{
     hid_t file_id = H5Fcreate(file_name, H5F_ACC_TRUNC, H5P_DEFAULT, FAPL);
     assert(file_id >= 0);
 
@@ -109,21 +117,24 @@ void generate_file(const char *file_name, hid_t labels_filespace, hid_t labels_m
     H5Fclose(file_id);
 }
 
-void generate_data() {
-    hsize_t labels_dims[1] = {config.NUM_SAMPLES_PER_FILE};
-    hid_t labels_filespace = H5Screate_simple(1, labels_dims, NULL);
+void
+generate_data()
+{
+    hsize_t labels_dims[1]   = {config.NUM_SAMPLES_PER_FILE};
+    hid_t   labels_filespace = H5Screate_simple(1, labels_dims, NULL);
     assert(labels_filespace >= 0);
     hid_t labels_memspace = H5Screate_simple(1, labels_dims, NULL);
     assert(labels_memspace >= 0);
 
-    hsize_t records_dims[3] = {config.NUM_SAMPLES_PER_FILE, DIM, DIM};
-    hid_t records_filespace = H5Screate_simple(3, records_dims, NULL);
+    hsize_t records_dims[3]   = {config.NUM_SAMPLES_PER_FILE, DIM, DIM};
+    hid_t   records_filespace = H5Screate_simple(3, records_dims, NULL);
     assert(records_filespace >= 0);
     hid_t records_memspace = H5Screate_simple(3, records_dims, NULL);
     assert(records_memspace >= 0);
 
-    hsize_t extra_records_count[3] = {(config.RECORD_LENGTH * config.NUM_SAMPLES_PER_FILE) % GENERATION_SIZE, DIM, DIM};
-    hid_t extra_records_memspace = H5Screate_simple(3, extra_records_count, NULL);
+    hsize_t extra_records_count[3] = {(config.RECORD_LENGTH * config.NUM_SAMPLES_PER_FILE) % GENERATION_SIZE,
+                                      DIM, DIM};
+    hid_t   extra_records_memspace = H5Screate_simple(3, extra_records_count, NULL);
     assert(extra_records_memspace >= 0);
 
     for (uint32_t i = MY_RANK; i < config.NUM_FILES_TRAIN; i += NUM_RANKS) {
@@ -131,8 +142,10 @@ void generate_data() {
 
         printf("Generate train file %u / %u\n", i + 1, config.NUM_FILES_TRAIN);
         char file_name[256];
-        snprintf(file_name, sizeof(file_name), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, i + 1, config.NUM_FILES_TRAIN);
-        generate_file(file_name, labels_filespace, labels_memspace, records_filespace, records_memspace, extra_records_memspace);
+        snprintf(file_name, sizeof(file_name), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER,
+                 config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, i + 1, config.NUM_FILES_TRAIN);
+        generate_file(file_name, labels_filespace, labels_memspace, records_filespace, records_memspace,
+                      extra_records_memspace);
     }
 
     for (uint32_t i = MY_RANK; i < config.NUM_FILES_EVAL; i += NUM_RANKS) {
@@ -140,8 +153,10 @@ void generate_data() {
 
         printf("Generate valid file %u / %u\n", i + 1, config.NUM_FILES_EVAL);
         char file_name[256];
-        snprintf(file_name, sizeof(file_name), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.VALID_DATA_FOLDER, config.FILE_PREFIX, i + 1, config.NUM_FILES_EVAL);
-        generate_file(file_name, labels_filespace, labels_memspace, records_filespace, records_memspace, extra_records_memspace);
+        snprintf(file_name, sizeof(file_name), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER,
+                 config.VALID_DATA_FOLDER, config.FILE_PREFIX, i + 1, config.NUM_FILES_EVAL);
+        generate_file(file_name, labels_filespace, labels_memspace, records_filespace, records_memspace,
+                      extra_records_memspace);
     }
 
     H5Sclose(labels_memspace);
@@ -151,15 +166,17 @@ void generate_data() {
     H5Sclose(records_filespace);
 }
 
-void read_sample(const char *file_path, uint32_t sample, uint64_t *metadata_time_out, uint64_t *read_time_out) {
+void
+read_sample(const char *file_path, uint32_t sample, uint64_t *metadata_time_out, uint64_t *read_time_out)
+{
     hsize_t offset[3] = {sample, 0, 0};
-    hsize_t count[3] = {1, DIM, DIM};
+    hsize_t count[3]  = {1, DIM, DIM};
 
-    uint64_t t1 = get_time_usec();
-    hid_t file_id = H5Fopen(file_path, H5F_ACC_RDONLY, FAPL);
-    hid_t dataset_id = H5Dopen(file_id, config.RECORDS_DATASET_NAME, DXPL);
-    hid_t filespace = H5Dget_space(dataset_id);
-    hid_t memspace = H5Screate_simple(3, count, NULL);
+    uint64_t t1         = get_time_usec();
+    hid_t    file_id    = H5Fopen(file_path, H5F_ACC_RDONLY, FAPL);
+    hid_t    dataset_id = H5Dopen(file_id, config.RECORDS_DATASET_NAME, DXPL);
+    hid_t    filespace  = H5Dget_space(dataset_id);
+    hid_t    memspace   = H5Screate_simple(3, count, NULL);
     H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, count, NULL);
     uint64_t t2 = get_time_usec();
     assert(file_id >= 0);
@@ -172,12 +189,12 @@ void read_sample(const char *file_path, uint32_t sample, uint64_t *metadata_time
         exit(1);
     }
 
-    uint64_t t3 = get_time_usec();
-    herr_t status = H5Dread(dataset_id, H5T_STD_U8LE, memspace, filespace, DXPL, data);
-    uint64_t t4 = get_time_usec();
+    uint64_t t3     = get_time_usec();
+    herr_t   status = H5Dread(dataset_id, H5T_STD_U8LE, memspace, filespace, DXPL, data);
+    uint64_t t4     = get_time_usec();
     assert(status >= 0);
 
-    free(data);  // TODO: free memory only after compute() call?
+    free(data); // TODO: free memory only after compute() call?
 
     uint64_t t5 = get_time_usec();
     H5Sclose(memspace);
@@ -187,12 +204,14 @@ void read_sample(const char *file_path, uint32_t sample, uint64_t *metadata_time
     uint64_t t6 = get_time_usec();
 
     *metadata_time_out = (t2 - t1) + (t6 - t5);
-    *read_time_out = t4 - t3;
+    *read_time_out     = t4 - t3;
 
     compute(config.PREPROCESS_TIME, config.PREPROCESS_TIME_STDEV);
 }
 
-uint64_t compute(float time, float time_stdev) {
+uint64_t
+compute(float time, float time_stdev)
+{
     if (time != 0.0 || time_stdev != 0.0) {
         int t = (uint64_t)(generate_normal_random(time, time_stdev) * 1000000.0);
         usleep(t > 0 ? t : 0);
@@ -201,17 +220,24 @@ uint64_t compute(float time, float time_stdev) {
     return 0;
 }
 
-void eval_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out) {
-    uint32_t batches_per_rank = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE_EVAL / NUM_RANKS;
+void
+eval_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out,
+                     uint64_t *local_read_time_out)
+{
+    uint32_t batches_per_rank =
+        config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE_EVAL / NUM_RANKS;
     uint32_t offset = MY_RANK * batches_per_rank;
 
     uint64_t t0 = get_time_usec();
     for (uint32_t i = 0; i < batches_per_rank; i++) {
         for (uint32_t j = 0; j < config.BATCH_SIZE_EVAL; j++) {
-            uint32_t file_num = indices[offset + i * config.BATCH_SIZE_EVAL + j] / config.NUM_SAMPLES_PER_FILE + 1;
-            uint32_t sample_num = indices[offset + i * config.BATCH_SIZE_EVAL + j] % config.NUM_SAMPLES_PER_FILE;
+            uint32_t file_num =
+                indices[offset + i * config.BATCH_SIZE_EVAL + j] / config.NUM_SAMPLES_PER_FILE + 1;
+            uint32_t sample_num =
+                indices[offset + i * config.BATCH_SIZE_EVAL + j] % config.NUM_SAMPLES_PER_FILE;
             char file_path[256];
-            snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.VALID_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_EVAL);
+            snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER,
+                     config.VALID_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_EVAL);
 
             uint64_t metadata_time = 0, read_time = 0;
             read_sample(file_path, sample_num, &metadata_time, &read_time);
@@ -229,54 +255,60 @@ void eval_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_met
         t0 = get_time_usec();
     }
 
-//    TODO: drop_data = False
-//    for (uint32_t iteration = MY_RANK; iteration < total_samples - NUM_RANKS * samples_per_rank; iteration += NUM_RANKS) {
-//        uint32_t i = NUM_RANKS * samples_per_rank + iteration;
-//        uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1;
-//        uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
-//        char file_path[256];
-//        snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.VALID_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_EVAL);
-//
-//        uint64_t metadata_time = 0, read_time = 0;
-//        read_sample(file_path, sample_num, &metadata_time, &read_time);
-//        read_counter++;
-//
-//        *local_metadata_time_out += metadata_time;
-//        *local_read_time_out += read_time;
-//
-//        if (read_counter % config.BATCH_SIZE_EVAL == 0){
-//            batch_loaded_eval(epoch, t0);
-//
-//            uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV);
-//            batch_processed_eval(epoch, t, t0);
-//            read_counter = 0;
-//
-//            t0 = get_time_usec();
-//        }
-//    }
-//
-//    if (read_counter != 0) {
-//        batch_loaded_eval(epoch, t0);
-//
-//        uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV);
-//        batch_processed_eval(epoch, t, t0);
-//    }
+    //    TODO: drop_data = False
+    //    for (uint32_t iteration = MY_RANK; iteration < total_samples - NUM_RANKS * samples_per_rank;
+    //    iteration += NUM_RANKS) {
+    //        uint32_t i = NUM_RANKS * samples_per_rank + iteration;
+    //        uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1;
+    //        uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
+    //        char file_path[256];
+    //        snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER,
+    //        config.VALID_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_EVAL);
+    //
+    //        uint64_t metadata_time = 0, read_time = 0;
+    //        read_sample(file_path, sample_num, &metadata_time, &read_time);
+    //        read_counter++;
+    //
+    //        *local_metadata_time_out += metadata_time;
+    //        *local_read_time_out += read_time;
+    //
+    //        if (read_counter % config.BATCH_SIZE_EVAL == 0){
+    //            batch_loaded_eval(epoch, t0);
+    //
+    //            uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV);
+    //            batch_processed_eval(epoch, t, t0);
+    //            read_counter = 0;
+    //
+    //            t0 = get_time_usec();
+    //        }
+    //    }
+    //
+    //    if (read_counter != 0) {
+    //        batch_loaded_eval(epoch, t0);
+    //
+    //        uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV);
+    //        batch_processed_eval(epoch, t, t0);
+    //    }
 }
 
-void eval_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out) {
+void
+eval_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out)
+{
     force_workers_to_shuffle(get_eval_read_fd(), get_eval_write_fd(), get_eval_system_fd());
 
-    uint32_t batches_per_rank = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE_EVAL / NUM_RANKS;
+    uint32_t batches_per_rank =
+        config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE_EVAL / NUM_RANKS;
     uint32_t offset = MY_RANK * batches_per_rank;
 
-    for (uint32_t i = 0; i < (config.READ_THREADS > batches_per_rank? batches_per_rank: config.READ_THREADS); i++) {
+    for (uint32_t i = 0;
+         i < (config.READ_THREADS > batches_per_rank ? batches_per_rank : config.READ_THREADS); i++) {
         int32_t batch = offset + i;
         write(get_eval_write_fd(), &batch, sizeof(batch));
     }
 
     for (uint32_t i = config.READ_THREADS; i < batches_per_rank; i++) {
         execution_time_t data_from_child_process;
-        uint64_t t0 = get_time_usec();
+        uint64_t         t0 = get_time_usec();
         read(get_eval_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
 
         batch_loaded_eval(epoch, t0);
@@ -292,9 +324,10 @@ void eval_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint6
         MPI_Barrier(MPI_COMM_WORLD);
     }
 
-    for (uint32_t i = 0; i < (config.READ_THREADS > batches_per_rank? batches_per_rank: config.READ_THREADS); i++) {
+    for (uint32_t i = 0;
+         i < (config.READ_THREADS > batches_per_rank ? batches_per_rank : config.READ_THREADS); i++) {
         execution_time_t data_from_child_process;
-        uint64_t t0 = get_time_usec();
+        uint64_t         t0 = get_time_usec();
         read(get_eval_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
 
         batch_loaded_eval(epoch, t0);
@@ -308,7 +341,10 @@ void eval_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint6
     }
 }
 
-void eval(uint32_t epoch, uint32_t *indices, uint64_t *local_eval_metadata_time, uint64_t *local_eval_read_time, bool enable_multiprocessing) {
+void
+eval(uint32_t epoch, uint32_t *indices, uint64_t *local_eval_metadata_time, uint64_t *local_eval_read_time,
+     bool enable_multiprocessing)
+{
     if (enable_multiprocessing) {
         start_eval(epoch);
         eval_using_workers(epoch, local_eval_metadata_time, local_eval_read_time);
@@ -316,16 +352,22 @@ void eval(uint32_t epoch, uint32_t *indices, uint64_t *local_eval_metadata_time,
         return;
     }
 
-    if (config.SEED_CHANGE_EPOCH) srand(config.RANDOM_SEED * 2 + epoch);
-    if (config.DO_SHUFFLE) shuffle(indices, config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE);
+    if (config.SEED_CHANGE_EPOCH)
+        srand(config.RANDOM_SEED * 2 + epoch);
+    if (config.DO_SHUFFLE)
+        shuffle(indices, config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE);
 
     start_eval(epoch);
     eval_without_workers(epoch, indices, local_eval_metadata_time, local_eval_read_time);
     end_eval(epoch);
 }
 
-void train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out) {
-    uint32_t batches_per_rank = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE / NUM_RANKS;
+void
+train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out,
+                      uint64_t *local_read_time_out)
+{
+    uint32_t batches_per_rank =
+        config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE / NUM_RANKS;
     uint32_t offset = MY_RANK * batches_per_rank;
 
     uint64_t t0 = get_time_usec();
@@ -333,8 +375,9 @@ void train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_me
         for (uint32_t j = 0; j < config.BATCH_SIZE; j++) {
             uint32_t file_num = indices[offset + i * config.BATCH_SIZE + j] / config.NUM_SAMPLES_PER_FILE + 1;
             uint32_t sample_num = indices[offset + i * config.BATCH_SIZE + j] % config.NUM_SAMPLES_PER_FILE;
-            char file_path[256];
-            snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_TRAIN);
+            char     file_path[256];
+            snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER,
+                     config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_TRAIN);
 
             uint64_t metadata_time = 0, read_time = 0;
             read_sample(file_path, sample_num, &metadata_time, &read_time);
@@ -352,53 +395,59 @@ void train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_me
         t0 = get_time_usec();
     }
 
-//    TODO: drop_data = True
-//    for (uint32_t iteration = MY_RANK; iteration < total_samples - NUM_RANKS * samples_per_rank; iteration += NUM_RANKS) {
-//        uint32_t i = NUM_RANKS * samples_per_rank + iteration;
-//        uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1;
-//        uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
-//        char file_path[256];
-//        snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER, config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_TRAIN);
-//
-//        uint64_t metadata_time = 0, read_time = 0;
-//        read_sample(file_path, sample_num, &metadata_time, &read_time);
-//        read_counter++;
-//
-//        *local_metadata_time_out += metadata_time;
-//        *local_read_time_out += read_time;
-//
-//        if (read_counter % config.BATCH_SIZE == 0){
-//            batch_loaded_train(epoch, t0);
-//
-//            uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
-//            batch_processed_train(epoch, t, t0);
-//
-//            read_counter = 0;
-//            t0 = get_time_usec();
-//        }
-//    }
-//
-//    if (read_counter != 0) {
-//        batch_loaded_train(epoch, t0);
-//
-//        uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
-//        batch_processed_train(epoch, t, t0);
-//    }
+    //    TODO: drop_data = True
+    //    for (uint32_t iteration = MY_RANK; iteration < total_samples - NUM_RANKS * samples_per_rank;
+    //    iteration += NUM_RANKS) {
+    //        uint32_t i = NUM_RANKS * samples_per_rank + iteration;
+    //        uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1;
+    //        uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
+    //        char file_path[256];
+    //        snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER,
+    //        config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_TRAIN);
+    //
+    //        uint64_t metadata_time = 0, read_time = 0;
+    //        read_sample(file_path, sample_num, &metadata_time, &read_time);
+    //        read_counter++;
+    //
+    //        *local_metadata_time_out += metadata_time;
+    //        *local_read_time_out += read_time;
+    //
+    //        if (read_counter % config.BATCH_SIZE == 0){
+    //            batch_loaded_train(epoch, t0);
+    //
+    //            uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
+    //            batch_processed_train(epoch, t, t0);
+    //
+    //            read_counter = 0;
+    //            t0 = get_time_usec();
+    //        }
+    //    }
+    //
+    //    if (read_counter != 0) {
+    //        batch_loaded_train(epoch, t0);
+    //
+    //        uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
+    //        batch_processed_train(epoch, t, t0);
+    //    }
 }
 
-void train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out) {
+void
+train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out)
+{
     force_workers_to_shuffle(get_train_read_fd(), get_train_write_fd(), get_train_system_fd());
-    uint32_t batches_per_rank = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE / NUM_RANKS;
+    uint32_t batches_per_rank =
+        config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE / NUM_RANKS;
     uint32_t offset = MY_RANK * batches_per_rank;
 
-    for (uint32_t i = 0; i < (config.READ_THREADS > batches_per_rank? batches_per_rank: config.READ_THREADS); i++) {
+    for (uint32_t i = 0;
+         i < (config.READ_THREADS > batches_per_rank ? batches_per_rank : config.READ_THREADS); i++) {
         int32_t batch = offset + i;
         write(get_train_write_fd(), &batch, sizeof(batch));
     }
 
     for (uint32_t i = config.READ_THREADS; i < batches_per_rank; i++) {
         execution_time_t data_from_child_process;
-        uint64_t t0 = get_time_usec();
+        uint64_t         t0 = get_time_usec();
         read(get_train_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
 
         batch_loaded_train(epoch, t0);
@@ -414,9 +463,10 @@ void train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint
         MPI_Barrier(MPI_COMM_WORLD);
     }
 
-    for (uint32_t i = 0; i < (config.READ_THREADS > batches_per_rank? batches_per_rank: config.READ_THREADS); i++) {
+    for (uint32_t i = 0;
+         i < (config.READ_THREADS > batches_per_rank ? batches_per_rank : config.READ_THREADS); i++) {
         execution_time_t data_from_child_process;
-        uint64_t t0 = get_time_usec();
+        uint64_t         t0 = get_time_usec();
         read(get_train_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
 
         batch_loaded_train(epoch, t0);
@@ -430,7 +480,10 @@ void train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint
     }
 }
 
-void train(uint32_t epoch, uint32_t *indices, uint64_t *local_train_metadata_time, uint64_t *local_train_read_time, bool enable_multiprocessing) {
+void
+train(uint32_t epoch, uint32_t *indices, uint64_t *local_train_metadata_time, uint64_t *local_train_read_time,
+      bool enable_multiprocessing)
+{
     if (enable_multiprocessing) {
         start_train(epoch);
         train_using_workers(epoch, local_train_metadata_time, local_train_read_time);
@@ -438,17 +491,22 @@ void train(uint32_t epoch, uint32_t *indices, uint64_t *local_train_metadata_tim
         return;
     }
 
-    if (config.SEED_CHANGE_EPOCH) srand(config.RANDOM_SEED + epoch);
-    if (config.DO_SHUFFLE) shuffle(indices, config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE);
+    if (config.SEED_CHANGE_EPOCH)
+        srand(config.RANDOM_SEED + epoch);
+    if (config.DO_SHUFFLE)
+        shuffle(indices, config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE);
 
     start_train(epoch);
     train_without_workers(epoch, indices, local_train_metadata_time, local_train_read_time);
     end_train(epoch);
 }
 
-void run(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eval_metadata_time, uint64_t *eval_read_time) {
-    uint32_t total_train_samples = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE;
-    uint32_t *indices_train = (uint32_t *)malloc(total_train_samples * sizeof(uint32_t));
+void
+run(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eval_metadata_time,
+    uint64_t *eval_read_time)
+{
+    uint32_t  total_train_samples = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE;
+    uint32_t *indices_train       = (uint32_t *)malloc(total_train_samples * sizeof(uint32_t));
     if (indices_train == NULL) {
         exit(1);
     }
@@ -456,8 +514,8 @@ void run(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eva
         indices_train[i] = i;
     }
 
-    uint32_t total_eval_samples = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE;
-    uint32_t *indices_eval = (uint32_t *)malloc(total_eval_samples * sizeof(uint32_t));
+    uint32_t  total_eval_samples = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE;
+    uint32_t *indices_eval       = (uint32_t *)malloc(total_eval_samples * sizeof(uint32_t));
     if (indices_eval == NULL) {
         exit(1);
     }
@@ -465,9 +523,9 @@ void run(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eva
         indices_eval[i] = i;
     }
 
-    uint64_t local_train_metadata_time = 0, local_train_read_time = 0,
-             local_eval_metadata_time = 0, local_eval_read_time = 0;
-    uint32_t next_eval_epoch = config.EPOCHS_BETWEEN_EVALS;
+    uint64_t local_train_metadata_time = 0, local_train_read_time = 0, local_eval_metadata_time = 0,
+             local_eval_read_time = 0;
+    uint32_t next_eval_epoch      = config.EPOCHS_BETWEEN_EVALS;
 
     bool enable_multiprocessing = config.READ_THREADS > 0;
     if (enable_multiprocessing) {
@@ -477,13 +535,15 @@ void run(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eva
     MPI_Barrier(MPI_COMM_WORLD);
 
     for (uint32_t epoch = 0; epoch < config.EPOCHS; epoch++) {
-//        if (MY_RANK == 0) printf("New Epoch %u\n", epoch + 1);
+        //        if (MY_RANK == 0) printf("New Epoch %u\n", epoch + 1);
 
-        train(epoch, indices_train, &local_train_metadata_time, &local_train_read_time, enable_multiprocessing);
+        train(epoch, indices_train, &local_train_metadata_time, &local_train_read_time,
+              enable_multiprocessing);
         MPI_Barrier(MPI_COMM_WORLD);
 
         if (config.DO_EVALUATION && (epoch + 1 >= next_eval_epoch)) {
-            eval(epoch, indices_eval, &local_eval_metadata_time, &local_eval_read_time, enable_multiprocessing);
+            eval(epoch, indices_eval, &local_eval_metadata_time, &local_eval_read_time,
+                 enable_multiprocessing);
             next_eval_epoch += config.EPOCHS_BETWEEN_EVALS;
             MPI_Barrier(MPI_COMM_WORLD);
         }
@@ -492,9 +552,11 @@ void run(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eva
         fin_workers();
     }
 
-    MPI_Reduce(&local_train_metadata_time, train_metadata_time, 1, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+    MPI_Reduce(&local_train_metadata_time, train_metadata_time, 1, MPI_UNSIGNED_LONG, MPI_SUM, 0,
+               MPI_COMM_WORLD);
     MPI_Reduce(&local_train_read_time, train_read_time, 1, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
-    MPI_Reduce(&local_eval_metadata_time, eval_metadata_time, 1, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+    MPI_Reduce(&local_eval_metadata_time, eval_metadata_time, 1, MPI_UNSIGNED_LONG, MPI_SUM, 0,
+               MPI_COMM_WORLD);
     MPI_Reduce(&local_eval_read_time, eval_read_time, 1, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
 
     free(indices_train);
@@ -503,16 +565,18 @@ void run(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eva
     MPI_Barrier(MPI_COMM_WORLD);
 }
 
-void init_global_variables() {
-    DIM = (uint32_t)sqrt(config.RECORD_LENGTH);
+void
+init_global_variables()
+{
+    DIM                  = (uint32_t)sqrt(config.RECORD_LENGTH);
     config.RECORD_LENGTH = DIM * DIM;
 
     uint32_t chunk_dimension = (uint32_t)sqrt(config.CHUNK_SIZE);
-    chunk_dimension = chunk_dimension > DIM? DIM: chunk_dimension;
-    config.CHUNK_SIZE = chunk_dimension * chunk_dimension;
+    chunk_dimension          = chunk_dimension > DIM ? DIM : chunk_dimension;
+    config.CHUNK_SIZE        = chunk_dimension * chunk_dimension;
 
     uint32_t data_length = config.RECORD_LENGTH * config.NUM_SAMPLES_PER_FILE;
-    GENERATION_SIZE = data_length > GENERATION_BUFFER_SIZE? GENERATION_BUFFER_SIZE: data_length;
+    GENERATION_SIZE      = data_length > GENERATION_BUFFER_SIZE ? GENERATION_BUFFER_SIZE : data_length;
 
     srand(config.RANDOM_SEED);
 
@@ -542,7 +606,9 @@ void init_global_variables() {
     hid_t DXPL = H5Pcreate(H5P_DATASET_XFER);
 }
 
-int main(int argc, char *argv[]) {
+int
+main(int argc, char *argv[])
+{
     int mpi_thread_lvl_provided = -1;
     MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_thread_lvl_provided);
     assert(MPI_THREAD_MULTIPLE == mpi_thread_lvl_provided);
@@ -581,15 +647,19 @@ int main(int argc, char *argv[]) {
         run(&train_metadata_time, &train_read_time, &eval_metadata_time, &eval_read_time);
         prepare_data();
 
-        MPI_Reduce(MY_RANK == 0? MPI_IN_PLACE: &train_metadata_time, &train_metadata_time, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(MY_RANK == 0? MPI_IN_PLACE: &train_read_time, &train_read_time, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(MY_RANK == 0? MPI_IN_PLACE: &eval_metadata_time, &eval_metadata_time, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(MY_RANK == 0? MPI_IN_PLACE: &eval_read_time, &eval_read_time, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
-
-        train_metadata_time /= NUM_RANKS / (config.READ_THREADS > 0? config.READ_THREADS: 1);
-        train_read_time /= NUM_RANKS / (config.READ_THREADS > 0? config.READ_THREADS: 1);
-        eval_metadata_time /= NUM_RANKS / (config.READ_THREADS > 0? config.READ_THREADS: 1);
-        eval_read_time /= NUM_RANKS / (config.READ_THREADS > 0? config.READ_THREADS: 1);
+        MPI_Reduce(MY_RANK == 0 ? MPI_IN_PLACE : &train_metadata_time, &train_metadata_time, 1,
+                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(MY_RANK == 0 ? MPI_IN_PLACE : &train_read_time, &train_read_time, 1,
+                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(MY_RANK == 0 ? MPI_IN_PLACE : &eval_metadata_time, &eval_metadata_time, 1,
+                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(MY_RANK == 0 ? MPI_IN_PLACE : &eval_read_time, &eval_read_time, 1, MPI_UNSIGNED_LONG_LONG,
+                   MPI_SUM, 0, MPI_COMM_WORLD);
+
+        train_metadata_time /= NUM_RANKS / (config.READ_THREADS > 0 ? config.READ_THREADS : 1);
+        train_read_time /= NUM_RANKS / (config.READ_THREADS > 0 ? config.READ_THREADS : 1);
+        eval_metadata_time /= NUM_RANKS / (config.READ_THREADS > 0 ? config.READ_THREADS : 1);
+        eval_read_time /= NUM_RANKS / (config.READ_THREADS > 0 ? config.READ_THREADS : 1);
 
         MPI_Barrier(MPI_COMM_WORLD);
 
diff --git a/dlio/h5bench_dlio.h b/dlio/h5bench_dlio.h
index 0ee9a57e..ca38e895 100644
--- a/dlio/h5bench_dlio.h
+++ b/dlio/h5bench_dlio.h
@@ -14,25 +14,30 @@ void generate_file(const char *file_name, hid_t labels_filespace, hid_t labels_m
 
 void generate_data();
 
-void read_sample(const char *file_path, uint32_t sample, uint64_t *metadata_time_out, uint64_t *read_time_out);
+void read_sample(const char *file_path, uint32_t sample, uint64_t *metadata_time_out,
+                 uint64_t *read_time_out);
 
 uint64_t compute(float time, float time_stdev);
 
-void eval_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out);
+void eval_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out,
+                          uint64_t *local_read_time_out);
 
 void eval_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out);
 
-void eval(uint32_t epoch, uint32_t *indices, uint64_t *local_eval_metadata_time, uint64_t *local_eval_read_time, bool enable_multiprocessing);
+void eval(uint32_t epoch, uint32_t *indices, uint64_t *local_eval_metadata_time,
+          uint64_t *local_eval_read_time, bool enable_multiprocessing);
 
-void train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out);
+void train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out,
+                           uint64_t *local_read_time_out);
 
 void train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out);
 
-void train(uint32_t epoch, uint32_t *indices, uint64_t *local_train_metadata_time, uint64_t *local_train_read_time, bool enable_multiprocessing);
+void train(uint32_t epoch, uint32_t *indices, uint64_t *local_train_metadata_time,
+           uint64_t *local_train_read_time, bool enable_multiprocessing);
 
-void run(uint64_t *train_metadata_time, uint64_t *train_read_time,
-         uint64_t *eval_metadata_time, uint64_t *eval_read_time);
+void run(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eval_metadata_time,
+         uint64_t *eval_read_time);
 
 void init_global_variables();
 
-#endif //SANDBOX_H5BENCH_ML_READ_H
+#endif // SANDBOX_H5BENCH_ML_READ_H
diff --git a/dlio/stats.c b/dlio/stats.c
index 3940d5d0..0d54811f 100644
--- a/dlio/stats.c
+++ b/dlio/stats.c
@@ -8,8 +8,8 @@
 #include "stats.h"
 #include "utils.h"
 
-uint32_t TRAIN_MAX_STEPS;
-uint32_t EVAL_MAX_STEPS;
+uint32_t      TRAIN_MAX_STEPS;
+uint32_t      EVAL_MAX_STEPS;
 epoch_data_t *stats;
 epoch_data_t *global_stats;
 
@@ -22,20 +22,26 @@ uint32_t *last_compute_eval;
 
 double AU;
 
-void stats_initialize() {
+void
+stats_initialize()
+{
     AU = 0.90;
 
-    uint32_t train_steps_count = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE / NUM_RANKS;
-    uint32_t train_steps_count_remainder = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE % (config.BATCH_SIZE * NUM_RANKS);
-    uint32_t eval_steps_count = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE_EVAL / NUM_RANKS;
-    uint32_t eval_steps_count_remainder = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE % (config.BATCH_SIZE_EVAL * NUM_RANKS);
+    uint32_t train_steps_count =
+        config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE / NUM_RANKS;
+    uint32_t train_steps_count_remainder =
+        config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE % (config.BATCH_SIZE * NUM_RANKS);
+    uint32_t eval_steps_count =
+        config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE_EVAL / NUM_RANKS;
+    uint32_t eval_steps_count_remainder =
+        config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE % (config.BATCH_SIZE_EVAL * NUM_RANKS);
 
     TRAIN_MAX_STEPS = train_steps_count;
-    EVAL_MAX_STEPS = eval_steps_count;
+    EVAL_MAX_STEPS  = eval_steps_count;
 
-//    TODO: drop_last = False
-//    TRAIN_MAX_STEPS = train_steps_count + (train_steps_count_remainder > 0);
-//    EVAL_MAX_STEPS = eval_steps_count + (eval_steps_count_remainder > 0);
+    //    TODO: drop_last = False
+    //    TRAIN_MAX_STEPS = train_steps_count + (train_steps_count_remainder > 0);
+    //    EVAL_MAX_STEPS = eval_steps_count + (eval_steps_count_remainder > 0);
 
     stats = (struct epoch_data *)malloc(config.EPOCHS * sizeof(struct epoch_data));
     if (stats == NULL) {
@@ -43,36 +49,36 @@ void stats_initialize() {
     }
 
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        stats[i].load.train = (uint64_t*)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        stats[i].load.train = (uint64_t *)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
         if (stats[i].load.train == NULL) {
             exit(1);
         }
-        stats[i].load.eval = (uint64_t*)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        stats[i].load.eval = (uint64_t *)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
         if (stats[i].load.eval == NULL) {
             exit(1);
         }
-        stats[i].proc.train = (uint64_t*)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        stats[i].proc.train = (uint64_t *)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
         if (stats[i].proc.train == NULL) {
             exit(1);
         }
-        stats[i].proc.eval = (uint64_t*)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        stats[i].proc.eval = (uint64_t *)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
         if (stats[i].proc.eval == NULL) {
             exit(1);
         }
         stats[i].throughput.train = 0.0;
-        stats[i].throughput.eval = 0.0;
-        stats[i].au.train = 0.0;
-        stats[i].au.eval = 0.0;
-        stats[i].compute.train = (uint64_t*)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        stats[i].throughput.eval  = 0.0;
+        stats[i].au.train         = 0.0;
+        stats[i].au.eval          = 0.0;
+        stats[i].compute.train    = (uint64_t *)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
         if (stats[i].compute.train == NULL) {
             exit(1);
         }
-        stats[i].compute.eval = (uint64_t*)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        stats[i].compute.eval = (uint64_t *)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
         if (stats[i].compute.eval == NULL) {
             exit(1);
         }
         stats[i].observed_time.train = 0;
-        stats[i].observed_time.eval = 0;
+        stats[i].observed_time.eval  = 0;
     }
 
     last_load_train = calloc(config.EPOCHS, sizeof(uint32_t));
@@ -101,7 +107,9 @@ void stats_initialize() {
     }
 }
 
-void stats_finalize() {
+void
+stats_finalize()
+{
     free(last_load_train);
     free(last_load_eval);
     free(last_proc_train);
@@ -129,50 +137,62 @@ void stats_finalize() {
     free(global_stats);
 }
 
-void prepare_data() {
+void
+prepare_data()
+{
     global_stats = (struct epoch_data *)malloc(config.EPOCHS * sizeof(struct epoch_data));
     if (global_stats == NULL) {
         exit(1);
     }
 
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        global_stats[i].load.train = (uint64_t*)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        global_stats[i].load.train = (uint64_t *)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
         if (global_stats[i].load.train == NULL) {
             exit(1);
         }
-        global_stats[i].load.eval = (uint64_t*)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        global_stats[i].load.eval = (uint64_t *)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
         if (global_stats[i].load.eval == NULL) {
             exit(1);
         }
-        global_stats[i].proc.train = (uint64_t*)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        global_stats[i].proc.train = (uint64_t *)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
         if (global_stats[i].proc.train == NULL) {
             exit(1);
         }
-        global_stats[i].proc.eval = (uint64_t*)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        global_stats[i].proc.eval = (uint64_t *)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
         if (global_stats[i].proc.eval == NULL) {
             exit(1);
         }
-        global_stats[i].compute.train = (uint64_t*)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        global_stats[i].compute.train = (uint64_t *)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
         if (global_stats[i].compute.train == NULL) {
             exit(1);
         }
-        global_stats[i].compute.eval = (uint64_t*)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        global_stats[i].compute.eval = (uint64_t *)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
         if (global_stats[i].compute.eval == NULL) {
             exit(1);
         }
 
-        MPI_Reduce(stats[i].load.train, global_stats[i].load.train, TRAIN_MAX_STEPS, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(stats[i].load.eval, global_stats[i].load.eval, EVAL_MAX_STEPS, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(stats[i].proc.train, global_stats[i].proc.train, TRAIN_MAX_STEPS, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(stats[i].proc.eval, global_stats[i].proc.eval, EVAL_MAX_STEPS, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(stats[i].load.train, global_stats[i].load.train, TRAIN_MAX_STEPS, MPI_UNSIGNED_LONG_LONG,
+                   MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(stats[i].load.eval, global_stats[i].load.eval, EVAL_MAX_STEPS, MPI_UNSIGNED_LONG_LONG,
+                   MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(stats[i].proc.train, global_stats[i].proc.train, TRAIN_MAX_STEPS, MPI_UNSIGNED_LONG_LONG,
+                   MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(stats[i].proc.eval, global_stats[i].proc.eval, EVAL_MAX_STEPS, MPI_UNSIGNED_LONG_LONG,
+                   MPI_SUM, 0, MPI_COMM_WORLD);
         MPI_Reduce(&stats[i].au.train, &global_stats[i].au.train, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
         MPI_Reduce(&stats[i].au.eval, &global_stats[i].au.eval, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(&stats[i].throughput.train, &global_stats[i].throughput.train, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(&stats[i].throughput.eval, &global_stats[i].throughput.eval, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(stats[i].compute.train, global_stats[i].compute.train, TRAIN_MAX_STEPS, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(stats[i].compute.eval, global_stats[i].compute.eval, EVAL_MAX_STEPS, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(&stats[i].observed_time.train, &global_stats[i].observed_time.train, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(&stats[i].observed_time.eval, &global_stats[i].observed_time.eval, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(&stats[i].throughput.train, &global_stats[i].throughput.train, 1, MPI_DOUBLE, MPI_SUM, 0,
+                   MPI_COMM_WORLD);
+        MPI_Reduce(&stats[i].throughput.eval, &global_stats[i].throughput.eval, 1, MPI_DOUBLE, MPI_SUM, 0,
+                   MPI_COMM_WORLD);
+        MPI_Reduce(stats[i].compute.train, global_stats[i].compute.train, TRAIN_MAX_STEPS,
+                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(stats[i].compute.eval, global_stats[i].compute.eval, EVAL_MAX_STEPS,
+                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(&stats[i].observed_time.train, &global_stats[i].observed_time.train, 1,
+                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(&stats[i].observed_time.eval, &global_stats[i].observed_time.eval, 1,
+                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
 
         for (int j = 0; j < TRAIN_MAX_STEPS; j++) {
             global_stats[i].load.train[j] /= NUM_RANKS;
@@ -195,14 +215,16 @@ void prepare_data() {
     }
 }
 
-void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time,
-                uint64_t *eval_metadata_time, uint64_t *eval_read_time) {
+void
+print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eval_metadata_time,
+           uint64_t *eval_read_time)
+{
 
     printf("metric, value\n");
     printf("operation, dlio\n");
     printf("ranks, %d\n", NUM_RANKS);
-//    printf("collective meta");
-//    printf("collective data");
+    //    printf("collective meta");
+    //    printf("collective data");
     // Train
     printf("train compute time, \"");
     uint64_t train_total_compute_time = 0;
@@ -213,13 +235,16 @@ void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time,
         }
         train_total_compute_time += compute_time;
         printf("%lf", compute_time / 1000000.0);
-        if (i != config.EPOCHS - 1) printf(", ");
+        if (i != config.EPOCHS - 1)
+            printf(", ");
     }
     printf("\"\ntrain total compute time, %lf\n", train_total_compute_time / 1000000.0);
 
     // TODO: drop_last = False
-    uint64_t train_total_batches = (uint64_t)config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE / NUM_RANKS * NUM_RANKS;
-    uint64_t train_total_size_bytes = train_total_batches * config.BATCH_SIZE_EVAL * config.NUM_SAMPLES_PER_FILE * config.RECORD_LENGTH;
+    uint64_t train_total_batches = (uint64_t)config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE /
+                                   config.BATCH_SIZE / NUM_RANKS * NUM_RANKS;
+    uint64_t train_total_size_bytes =
+        train_total_batches * config.BATCH_SIZE_EVAL * config.NUM_SAMPLES_PER_FILE * config.RECORD_LENGTH;
     printf("train total size, %lu\n", train_total_size_bytes);
 
     printf("train total metadata time, %lf\n", *train_metadata_time / 1000000.0);
@@ -232,7 +257,8 @@ void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time,
         double observed_time = global_stats[i].observed_time.train / 1000000.0;
         train_total_observed_time += observed_time;
         printf("%lf", observed_time);
-        if (i != config.EPOCHS - 1) printf(", ");
+        if (i != config.EPOCHS - 1)
+            printf(", ");
     }
     printf("\"\ntrain total observed time, %lf\n", train_total_observed_time);
 
@@ -242,8 +268,10 @@ void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time,
         for (uint32_t j = 0; j < TRAIN_MAX_STEPS; j++) {
             compute_time += global_stats[i].compute.train[j];
         }
-        printf("%lf", (double)train_total_size_bytes / (global_stats[i].observed_time.train - compute_time) * 1000000.0);
-        if (i != config.EPOCHS - 1) printf(", ");
+        printf("%lf", (double)train_total_size_bytes / (global_stats[i].observed_time.train - compute_time) *
+                          1000000.0);
+        if (i != config.EPOCHS - 1)
+            printf(", ");
     }
     printf("\"\n");
 
@@ -252,15 +280,17 @@ void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time,
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         train_au_mean_percentage += global_stats[i].au.train;
         printf("%lf", global_stats[i].au.train);
-        if (i != config.EPOCHS - 1) printf(", ");
+        if (i != config.EPOCHS - 1)
+            printf(", ");
     }
     train_au_mean_percentage = train_au_mean_percentage / (double)config.EPOCHS;
     printf("\"\ntrain au mean percentage, %lf\n", train_au_mean_percentage);
-    printf("train au meet expectation, %s\n", train_au_mean_percentage >= 100 * AU? "success": "fail");
+    printf("train au meet expectation, %s\n", train_au_mean_percentage >= 100 * AU ? "success" : "fail");
 
     double train_au_stdev_percentage = 0.0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        train_au_stdev_percentage += (global_stats[i].au.train - train_au_mean_percentage) * (global_stats[i].au.train - train_au_mean_percentage);
+        train_au_stdev_percentage += (global_stats[i].au.train - train_au_mean_percentage) *
+                                     (global_stats[i].au.train - train_au_mean_percentage);
     }
     train_au_stdev_percentage = sqrt(train_au_stdev_percentage / (double)config.EPOCHS);
     printf("train au stdev percentage, %lf\n", train_au_stdev_percentage);
@@ -270,22 +300,29 @@ void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time,
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         train_throughput_mean_samples_per_second += global_stats[i].throughput.train;
         printf("%lf", global_stats[i].throughput.train);
-        if (i != config.EPOCHS - 1) printf(", ");
+        if (i != config.EPOCHS - 1)
+            printf(", ");
     }
-    train_throughput_mean_samples_per_second = train_throughput_mean_samples_per_second / (double)config.EPOCHS;
+    train_throughput_mean_samples_per_second =
+        train_throughput_mean_samples_per_second / (double)config.EPOCHS;
     printf("\"\ntrain throughput mean samples per second, %lf\n", train_throughput_mean_samples_per_second);
 
     double train_throughput_stdev_samples_per_second = 0.0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        train_throughput_stdev_samples_per_second += (global_stats[i].throughput.train - train_throughput_mean_samples_per_second) * (global_stats[i].throughput.train - train_throughput_mean_samples_per_second);
+        train_throughput_stdev_samples_per_second +=
+            (global_stats[i].throughput.train - train_throughput_mean_samples_per_second) *
+            (global_stats[i].throughput.train - train_throughput_mean_samples_per_second);
     }
-    train_throughput_stdev_samples_per_second = sqrt(train_throughput_stdev_samples_per_second / (double)config.EPOCHS);
+    train_throughput_stdev_samples_per_second =
+        sqrt(train_throughput_stdev_samples_per_second / (double)config.EPOCHS);
     printf("train throughput stdev samples per second, %lf\n", train_throughput_stdev_samples_per_second);
 
-    double train_io_mean_MB_per_second = train_throughput_mean_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
+    double train_io_mean_MB_per_second =
+        train_throughput_mean_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
     printf("train io mean MB per second, %lf\n", train_io_mean_MB_per_second);
 
-    double train_io_stdev_MB_per_second = train_throughput_stdev_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
+    double train_io_stdev_MB_per_second =
+        train_throughput_stdev_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
     printf("train io stdev MB per second, %lf\n", train_io_stdev_MB_per_second);
 
     // Evaluation
@@ -298,12 +335,15 @@ void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time,
         }
         eval_total_compute_time += compute_time;
         printf("%lf", compute_time / 1000000.0);
-        if (i != config.EPOCHS - 1) printf(", ");
+        if (i != config.EPOCHS - 1)
+            printf(", ");
     }
     printf("\"\neval total compute time, %lf\n", eval_total_compute_time / 1000000.0);
     // TODO: drop_last = False
-    uint64_t eval_total_batches = (uint64_t)config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE_EVAL / NUM_RANKS * NUM_RANKS;
-    uint64_t eval_total_size_bytes = eval_total_batches * config.BATCH_SIZE_EVAL * config.NUM_SAMPLES_PER_FILE * config.RECORD_LENGTH;
+    uint64_t eval_total_batches = (uint64_t)config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE /
+                                  config.BATCH_SIZE_EVAL / NUM_RANKS * NUM_RANKS;
+    uint64_t eval_total_size_bytes =
+        eval_total_batches * config.BATCH_SIZE_EVAL * config.NUM_SAMPLES_PER_FILE * config.RECORD_LENGTH;
     printf("eval total size, %lu\n", eval_total_size_bytes);
 
     printf("eval metadata time, %lf\n", *eval_metadata_time / 1000000.0);
@@ -316,7 +356,8 @@ void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time,
         double observed_time = global_stats[i].observed_time.eval / 1000000.0;
         eval_total_observed_time += observed_time;
         printf("%lf", observed_time);
-        if (i != config.EPOCHS - 1) printf(", ");
+        if (i != config.EPOCHS - 1)
+            printf(", ");
     }
     printf("\"\neval total observed time, %lf\n", eval_total_observed_time);
 
@@ -326,8 +367,10 @@ void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time,
         for (uint32_t j = 0; j < EVAL_MAX_STEPS; j++) {
             compute_time += global_stats[i].compute.eval[j];
         }
-        printf("%lf", (double)eval_total_size_bytes / (global_stats[i].observed_time.eval - compute_time) * 1000000.0);
-        if (i != config.EPOCHS - 1) printf(", ");
+        printf("%lf", (double)eval_total_size_bytes / (global_stats[i].observed_time.eval - compute_time) *
+                          1000000.0);
+        if (i != config.EPOCHS - 1)
+            printf(", ");
     }
     printf("\"\n");
 
@@ -336,15 +379,17 @@ void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time,
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         eval_au_mean_percentage += global_stats[i].au.eval;
         printf("%lf", global_stats[i].au.eval);
-        if (i != config.EPOCHS - 1) printf(", ");
+        if (i != config.EPOCHS - 1)
+            printf(", ");
     }
     eval_au_mean_percentage = eval_au_mean_percentage / (double)config.EPOCHS;
     printf("\"\neval au mean percentage, %lf\n", eval_au_mean_percentage);
-    printf("eval au meet expectation, %s\n", eval_au_mean_percentage >= 100 * AU? "success": "fail");
+    printf("eval au meet expectation, %s\n", eval_au_mean_percentage >= 100 * AU ? "success" : "fail");
 
     double eval_au_stdev_percentage = 0.0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        eval_au_stdev_percentage += (global_stats[i].au.eval - eval_au_mean_percentage) * (global_stats[i].au.eval - eval_au_mean_percentage);
+        eval_au_stdev_percentage += (global_stats[i].au.eval - eval_au_mean_percentage) *
+                                    (global_stats[i].au.eval - eval_au_mean_percentage);
     }
     eval_au_stdev_percentage = sqrt(eval_au_stdev_percentage / (double)config.EPOCHS);
     printf("eval au stdev percentage, %lf\n", eval_au_stdev_percentage);
@@ -354,81 +399,104 @@ void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time,
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         eval_throughput_mean_samples_per_second += global_stats[i].throughput.eval;
         printf("%lf", global_stats[i].throughput.eval);
-        if (i != config.EPOCHS - 1) printf(", ");
+        if (i != config.EPOCHS - 1)
+            printf(", ");
     }
     eval_throughput_mean_samples_per_second = eval_throughput_mean_samples_per_second / (double)config.EPOCHS;
     printf("\"\neval throughput mean samples per second, %lf\n", eval_throughput_mean_samples_per_second);
 
     double eval_throughput_stdev_samples_per_second = 0.0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        eval_throughput_stdev_samples_per_second += (global_stats[i].throughput.eval - eval_throughput_mean_samples_per_second) * (global_stats[i].throughput.eval - eval_throughput_mean_samples_per_second);
+        eval_throughput_stdev_samples_per_second +=
+            (global_stats[i].throughput.eval - eval_throughput_mean_samples_per_second) *
+            (global_stats[i].throughput.eval - eval_throughput_mean_samples_per_second);
     }
-    eval_throughput_stdev_samples_per_second = sqrt(eval_throughput_stdev_samples_per_second / (double)config.EPOCHS);
+    eval_throughput_stdev_samples_per_second =
+        sqrt(eval_throughput_stdev_samples_per_second / (double)config.EPOCHS);
     printf("eval throughput stdev samples per second, %lf\n", eval_throughput_stdev_samples_per_second);
 
-    double eval_io_mean_MB_per_second = eval_throughput_mean_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
+    double eval_io_mean_MB_per_second =
+        eval_throughput_mean_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
     printf("eval io mean MB per second, %lf\n", eval_io_mean_MB_per_second);
 
-    double eval_io_stdev_MB_per_second = eval_throughput_stdev_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
+    double eval_io_stdev_MB_per_second =
+        eval_throughput_stdev_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
     printf("eval io stdev MB per second, %lf\n", eval_io_stdev_MB_per_second);
 }
 
-void batch_loaded_train(uint32_t epoch, uint64_t t0) {
+void
+batch_loaded_train(uint32_t epoch, uint64_t t0)
+{
     stats[epoch].load.train[last_load_train[epoch]++] = (get_time_usec() - t0);
 }
 
-void batch_processed_train(uint32_t epoch, uint64_t computation_time, uint64_t t0) {
-    stats[epoch].proc.train[last_proc_train[epoch]++] = (get_time_usec() - t0);
+void
+batch_processed_train(uint32_t epoch, uint64_t computation_time, uint64_t t0)
+{
+    stats[epoch].proc.train[last_proc_train[epoch]++]       = (get_time_usec() - t0);
     stats[epoch].compute.train[last_compute_train[epoch]++] = computation_time;
 }
 
-void batch_loaded_eval(uint32_t epoch, uint64_t t0) {
+void
+batch_loaded_eval(uint32_t epoch, uint64_t t0)
+{
     stats[epoch].load.eval[last_load_eval[epoch]++] = (get_time_usec() - t0);
 }
 
-void batch_processed_eval(uint32_t epoch, uint64_t computation_time, uint64_t t0) {
-    stats[epoch].proc.eval[last_proc_eval[epoch]++] = (get_time_usec() - t0);
+void
+batch_processed_eval(uint32_t epoch, uint64_t computation_time, uint64_t t0)
+{
+    stats[epoch].proc.eval[last_proc_eval[epoch]++]       = (get_time_usec() - t0);
     stats[epoch].compute.eval[last_compute_eval[epoch]++] = computation_time;
 }
 
-void start_train(uint32_t epoch) {
+void
+start_train(uint32_t epoch)
+{
     stats[epoch].start_time.train = get_time_usec();
 }
 
-void end_train(uint32_t epoch) {
-    uint64_t end_time = get_time_usec();
+void
+end_train(uint32_t epoch)
+{
+    uint64_t end_time           = get_time_usec();
     uint64_t total_compute_time = 0;
-    double au = 0.0;
+    double   au                 = 0.0;
 
     for (int i = 0; i < TRAIN_MAX_STEPS; i++) {
         total_compute_time += stats[epoch].compute.train[i];
     }
     if (total_compute_time > 0) {
         stats[epoch].observed_time.train = end_time - stats[epoch].start_time.train;
-        au = (double)total_compute_time / stats[epoch].observed_time.train;
+        au                               = (double)total_compute_time / stats[epoch].observed_time.train;
     }
 
     stats[epoch].au.train = au * 100;
-    stats[epoch].throughput.train = (double)TRAIN_MAX_STEPS * config.BATCH_SIZE * 1000000.0 / (end_time - stats[epoch].start_time.train);
+    stats[epoch].throughput.train =
+        (double)TRAIN_MAX_STEPS * config.BATCH_SIZE * 1000000.0 / (end_time - stats[epoch].start_time.train);
 }
 
-void start_eval(uint32_t epoch) {
+void
+start_eval(uint32_t epoch)
+{
     stats[epoch].start_time.eval = get_time_usec();
 }
 
-void end_eval(uint32_t epoch) {
-    uint64_t end_time = get_time_usec();
+void
+end_eval(uint32_t epoch)
+{
+    uint64_t end_time           = get_time_usec();
     uint64_t total_compute_time = 0;
-    double au = 0.0;
+    double   au                 = 0.0;
 
     for (int i = 0; i < EVAL_MAX_STEPS; i++) {
         total_compute_time += stats[epoch].compute.eval[i];
     }
     if (total_compute_time > 0) {
         stats[epoch].observed_time.eval = end_time - stats[epoch].start_time.eval;
-        au = (double)total_compute_time / stats[epoch].observed_time.eval;
+        au                              = (double)total_compute_time / stats[epoch].observed_time.eval;
     }
-    stats[epoch].au.eval = au * 100;
-    stats[epoch].throughput.eval = (double)EVAL_MAX_STEPS * config.BATCH_SIZE_EVAL * 1000000.0 / (end_time - stats[epoch].start_time.eval);
+    stats[epoch].au.eval         = au * 100;
+    stats[epoch].throughput.eval = (double)EVAL_MAX_STEPS * config.BATCH_SIZE_EVAL * 1000000.0 /
+                                   (end_time - stats[epoch].start_time.eval);
 }
-
diff --git a/dlio/stats.h b/dlio/stats.h
index 76cf89ce..20c817c3 100644
--- a/dlio/stats.h
+++ b/dlio/stats.h
@@ -37,12 +37,12 @@ struct observed_time_data {
 };
 
 typedef struct epoch_data {
-    struct start_time_data start_time;
-    struct load_data load;
-    struct proc_data proc;
-    struct throughput_data throughput;
-    struct au_data au;
-    struct compute_data compute;
+    struct start_time_data    start_time;
+    struct load_data          load;
+    struct proc_data          proc;
+    struct throughput_data    throughput;
+    struct au_data            au;
+    struct compute_data       compute;
     struct observed_time_data observed_time;
 } epoch_data_t;
 
@@ -52,8 +52,8 @@ void stats_finalize();
 
 void prepare_data();
 
-void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time,
-                uint64_t *eval_metadata_time, uint64_t *eval_read_time);
+void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eval_metadata_time,
+                uint64_t *eval_read_time);
 
 void batch_loaded_train(uint32_t epoch, uint64_t start_time);
 
@@ -71,4 +71,4 @@ void start_eval(uint32_t epoch);
 
 void end_eval(uint32_t epoch);
 
-#endif //SANDBOX_STATS_H
+#endif // SANDBOX_STATS_H
diff --git a/dlio/utils.c b/dlio/utils.c
index a0fb38d4..9f5a9e3c 100644
--- a/dlio/utils.c
+++ b/dlio/utils.c
@@ -11,7 +11,9 @@
 
 #include "utils.h"
 
-uint64_t get_time_usec() {
+uint64_t
+get_time_usec()
+{
     struct timeval tv;
     gettimeofday(&tv, NULL);
     return (uint64_t)1000000 * tv.tv_sec + tv.tv_usec;
@@ -20,58 +22,60 @@ uint64_t get_time_usec() {
 config_datatype_t config = {
     // Workflow
     .DO_DATA_GENERATION = false,
-    .DO_TRAIN = false,
-    .DO_EVALUATION = false,
+    .DO_TRAIN           = false,
+    .DO_EVALUATION      = false,
 
     // Dataset
     .RECORD_LENGTH = 67108864, // should be a square number
-//   .RECORD_LENGTH_STDEV = 0.0f,
-//   .RECORD_LENGTH_RESIZE = 0.0f,
-    .NUM_FILES_TRAIN = 64,
-    .NUM_FILES_EVAL = 8,
+                               //   .RECORD_LENGTH_STDEV = 0.0f,
+                               //   .RECORD_LENGTH_RESIZE = 0.0f,
+    .NUM_FILES_TRAIN      = 64,
+    .NUM_FILES_EVAL       = 8,
     .NUM_SAMPLES_PER_FILE = 4,
-    .DATA_FOLDER = "./data",
-//    .NUM_SUBFOLDERS_TRAIN = 0,
-//    .NUM_SUBFOLDERS_EVAL = 0,
-    .FILE_PREFIX = "img",
-    .DO_COMPRESSION = false,
+    .DATA_FOLDER          = "./data",
+    //    .NUM_SUBFOLDERS_TRAIN = 0,
+    //    .NUM_SUBFOLDERS_EVAL = 0,
+    .FILE_PREFIX       = "img",
+    .DO_COMPRESSION    = false,
     .COMPRESSION_LEVEL = 4,
-    .DO_CHUNKING = false,
-    .CHUNK_SIZE = 1024,  // should be greater than 120 on CLAIX23
-    .KEEP_FILES = false,
+    .DO_CHUNKING       = false,
+    .CHUNK_SIZE        = 1024, // should be greater than 120 on CLAIX23
+    .KEEP_FILES        = false,
 
     // Reader
-//    .DATA_LOADER = PYTORCH,
-    .BATCH_SIZE = 7,
+    //    .DATA_LOADER = PYTORCH,
+    .BATCH_SIZE      = 7,
     .BATCH_SIZE_EVAL = 2,
-    .READ_THREADS = 4,
-//    .COMPUTATION_THREADS = 1,
-//    .PREFETCH_SIZE = 0,
-    .DO_SHUFFLE = false,  // sample shuffle vs file_shuffle
-//    .TRANSFER_SIZE = 262144,
-    .PREPROCESS_TIME = 0.0f,
+    .READ_THREADS    = 4,
+    //    .COMPUTATION_THREADS = 1,
+    //    .PREFETCH_SIZE = 0,
+    .DO_SHUFFLE = false, // sample shuffle vs file_shuffle
+                         //    .TRANSFER_SIZE = 262144,
+    .PREPROCESS_TIME       = 0.0f,
     .PREPROCESS_TIME_STDEV = 0.000f,
     // Train
-    .EPOCHS = 5,
-    .COMPUTATION_TIME = 0.323f,
+    .EPOCHS                 = 5,
+    .COMPUTATION_TIME       = 0.323f,
     .COMPUTATION_TIME_STDEV = 0.000f,
-//    .TOTAL_TRAINING_STEPS = -1
+    //    .TOTAL_TRAINING_STEPS = -1
     .SEED_CHANGE_EPOCH = false,
-    .RANDOM_SEED = 42,
+    .RANDOM_SEED       = 42,
 
     // Evaluation
-    .EVAL_TIME = 0.323f,
-    .EVAL_TIME_STDEV = 0.000f,
+    .EVAL_TIME            = 0.323f,
+    .EVAL_TIME_STDEV      = 0.000f,
     .EPOCHS_BETWEEN_EVALS = 1,
 
     // Output
-    .TRAIN_DATA_FOLDER = "train",
-    .VALID_DATA_FOLDER = "valid",
+    .TRAIN_DATA_FOLDER    = "train",
+    .VALID_DATA_FOLDER    = "valid",
     .RECORDS_DATASET_NAME = "records",
-    .LABELS_DATASET_NAME = "labels",
+    .LABELS_DATASET_NAME  = "labels",
 };
 
-void create_directory(const char *folder) {
+void
+create_directory(const char *folder)
+{
     struct stat st = {0};
     if (stat(folder, &st) == -1) {
         if (mkdir(folder, 0700) != 0) {
@@ -81,9 +85,11 @@ void create_directory(const char *folder) {
     }
 }
 
-void delete_directory(const char *dir_path) {
+void
+delete_directory(const char *dir_path)
+{
     struct dirent *entry;
-    DIR *dir = opendir(dir_path);
+    DIR *          dir = opendir(dir_path);
 
     if (dir == NULL) {
         perror("Error opening directory");
@@ -102,7 +108,8 @@ void delete_directory(const char *dir_path) {
         if (stat(path, &statbuf) == 0) {
             if (S_ISDIR(statbuf.st_mode)) {
                 delete_directory(path);
-            } else {
+            }
+            else {
                 if (remove(path) != 0) {
                     perror("Error deleting file");
                 }
@@ -117,115 +124,153 @@ void delete_directory(const char *dir_path) {
     }
 }
 
-void shuffle(uint32_t *array, size_t n) {
+void
+shuffle(uint32_t *array, size_t n)
+{
     if (n > 1 && array != NULL) {
         for (size_t i = n - 1; i > 0; i--) {
-            size_t j = rand() % (i + 1);
+            size_t   j    = rand() % (i + 1);
             uint32_t temp = array[i];
-            array[i] = array[j];
-            array[j] = temp;
+            array[i]      = array[j];
+            array[j]      = temp;
         }
     }
 }
 
-double generate_normal_random(float mean, float stdev) {
-    double u1 = (double) rand() / RAND_MAX;
-    double u2 = (double) rand() / RAND_MAX;
+double
+generate_normal_random(float mean, float stdev)
+{
+    double u1 = (double)rand() / RAND_MAX;
+    double u2 = (double)rand() / RAND_MAX;
     double z0 = sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2);
     return z0 * stdev + mean;
 }
 
-void parse_args(int argc, char *argv[]) {
+void
+parse_args(int argc, char *argv[])
+{
     for (uint32_t i = 1; i < argc; i++) {
         if (strcmp(argv[i], "--generate-data") == 0) {
             config.DO_DATA_GENERATION = true;
-        } else if (strcmp(argv[i], "--train") == 0) {
+        }
+        else if (strcmp(argv[i], "--train") == 0) {
             config.DO_TRAIN = true;
-        } else if (strcmp(argv[i], "--evaluation") == 0) {
+        }
+        else if (strcmp(argv[i], "--evaluation") == 0) {
             config.DO_EVALUATION = true;
-        } else if (strcmp(argv[i], "--record-length") == 0) {
+        }
+        else if (strcmp(argv[i], "--record-length") == 0) {
             i++;
             config.RECORD_LENGTH = atoi(argv[i]);
-        } else if (strcmp(argv[i], "--num-files-train") == 0) {
+        }
+        else if (strcmp(argv[i], "--num-files-train") == 0) {
             i++;
             config.NUM_FILES_TRAIN = atoi(argv[i]);
-        } else if (strcmp(argv[i], "--num-files-eval") == 0) {
+        }
+        else if (strcmp(argv[i], "--num-files-eval") == 0) {
             i++;
             config.NUM_FILES_EVAL = atoi(argv[i]);
-        } else if (strcmp(argv[i], "--num-samples-per-file") == 0) {
+        }
+        else if (strcmp(argv[i], "--num-samples-per-file") == 0) {
             i++;
             config.NUM_SAMPLES_PER_FILE = atoi(argv[i]);
-        } else if (strcmp(argv[i], "--data-folder") == 0) {
+        }
+        else if (strcmp(argv[i], "--data-folder") == 0) {
             i++;
             config.DATA_FOLDER = argv[i];
-        } else if (strcmp(argv[i], "--file-prefix") == 0) {
+        }
+        else if (strcmp(argv[i], "--file-prefix") == 0) {
             i++;
             config.FILE_PREFIX = argv[i];
-        } else if (strcmp(argv[i], "--chunking") == 0) {
+        }
+        else if (strcmp(argv[i], "--chunking") == 0) {
             config.DO_CHUNKING = true;
-        } else if (strcmp(argv[i], "--chunk-size") == 0) {
+        }
+        else if (strcmp(argv[i], "--chunk-size") == 0) {
             i++;
             config.CHUNK_SIZE = atoi(argv[i]);
-        }else if (strcmp(argv[i], "--keep-files") == 0) {
+        }
+        else if (strcmp(argv[i], "--keep-files") == 0) {
             config.KEEP_FILES = true;
-        } else if (strcmp(argv[i], "--compression") == 0) {
+        }
+        else if (strcmp(argv[i], "--compression") == 0) {
             config.DO_COMPRESSION = true;
-        } else if (strcmp(argv[i], "--compression-level") == 0) {
+        }
+        else if (strcmp(argv[i], "--compression-level") == 0) {
             i++;
             config.COMPRESSION_LEVEL = atoi(argv[i]);
-        } else if (strcmp(argv[i], "--batch-size") == 0) {
+        }
+        else if (strcmp(argv[i], "--batch-size") == 0) {
             i++;
             config.BATCH_SIZE = atoi(argv[i]);
-        } else if (strcmp(argv[i], "--batch-size-eval") == 0) {
+        }
+        else if (strcmp(argv[i], "--batch-size-eval") == 0) {
             i++;
             config.BATCH_SIZE_EVAL = atoi(argv[i]);
-        } else if (strcmp(argv[i], "--shuffle") == 0) {
+        }
+        else if (strcmp(argv[i], "--shuffle") == 0) {
             config.DO_SHUFFLE = true;
-        } else if (strcmp(argv[i], "--preprocess-time") == 0) {
+        }
+        else if (strcmp(argv[i], "--preprocess-time") == 0) {
             i++;
             config.PREPROCESS_TIME = atof(argv[i]);
-        } else if (strcmp(argv[i], "--preprocess-time-stdev") == 0) {
+        }
+        else if (strcmp(argv[i], "--preprocess-time-stdev") == 0) {
             i++;
             config.PREPROCESS_TIME_STDEV = atof(argv[i]);
-        } else if (strcmp(argv[i], "--epochs") == 0) {
+        }
+        else if (strcmp(argv[i], "--epochs") == 0) {
             i++;
             config.EPOCHS = atoi(argv[i]);
-        } else if (strcmp(argv[i], "--computation-time") == 0) {
+        }
+        else if (strcmp(argv[i], "--computation-time") == 0) {
             i++;
             config.COMPUTATION_TIME = atof(argv[i]);
-        } else if (strcmp(argv[i], "--computation-time-stdev") == 0) {
+        }
+        else if (strcmp(argv[i], "--computation-time-stdev") == 0) {
             i++;
             config.COMPUTATION_TIME_STDEV = atof(argv[i]);
-        } else if (strcmp(argv[i], "--random-seed") == 0) {
+        }
+        else if (strcmp(argv[i], "--random-seed") == 0) {
             i++;
             config.RANDOM_SEED = atoi(argv[i]);
-        } else if (strcmp(argv[i], "--eval-time") == 0) {
+        }
+        else if (strcmp(argv[i], "--eval-time") == 0) {
             i++;
             config.EVAL_TIME = atof(argv[i]);
-        } else if (strcmp(argv[i], "--eval-time-stdev") == 0) {
+        }
+        else if (strcmp(argv[i], "--eval-time-stdev") == 0) {
             i++;
             config.EVAL_TIME_STDEV = atof(argv[i]);
-        } else if (strcmp(argv[i], "--epochs-between-evals") == 0) {
+        }
+        else if (strcmp(argv[i], "--epochs-between-evals") == 0) {
             i++;
             config.EPOCHS_BETWEEN_EVALS = atoi(argv[i]);
-        } else if (strcmp(argv[i], "--train-data-folder") == 0) {
+        }
+        else if (strcmp(argv[i], "--train-data-folder") == 0) {
             i++;
             config.TRAIN_DATA_FOLDER = argv[i];
-        } else if (strcmp(argv[i], "--valid-data-folder") == 0) {
+        }
+        else if (strcmp(argv[i], "--valid-data-folder") == 0) {
             i++;
             config.VALID_DATA_FOLDER = argv[i];
-        } else if (strcmp(argv[i], "--records-dataset-name") == 0) {
+        }
+        else if (strcmp(argv[i], "--records-dataset-name") == 0) {
             i++;
             config.RECORDS_DATASET_NAME = argv[i];
-        } else if (strcmp(argv[i], "--labels-dataset-name") == 0) {
+        }
+        else if (strcmp(argv[i], "--labels-dataset-name") == 0) {
             i++;
             config.LABELS_DATASET_NAME = argv[i];
-        } else if (strcmp(argv[i], "--seed-change-epoch") == 0) {
+        }
+        else if (strcmp(argv[i], "--seed-change-epoch") == 0) {
             config.SEED_CHANGE_EPOCH = true;
-        } else if (strcmp(argv[i], "--read-threads") == 0) {
+        }
+        else if (strcmp(argv[i], "--read-threads") == 0) {
             i++;
             config.READ_THREADS = atoi(argv[i]);
-        } else {
+        }
+        else {
             printf("WARNING: %s not found\n", argv[i]);
         }
     }
diff --git a/dlio/utils.h b/dlio/utils.h
index cdbb4580..cb861262 100644
--- a/dlio/utils.h
+++ b/dlio/utils.h
@@ -18,43 +18,43 @@ typedef struct config_datatype {
 
     // Dataset
     uint32_t RECORD_LENGTH; // should be a square number
-//    float RECORD_LENGTH_STDEV;
-//    float RECORD_LENGTH_RESIZE;
+                            //    float RECORD_LENGTH_STDEV;
+                            //    float RECORD_LENGTH_RESIZE;
     uint32_t NUM_FILES_TRAIN;
     uint32_t NUM_FILES_EVAL;
     uint32_t NUM_SAMPLES_PER_FILE;
-    char *DATA_FOLDER;
-//    unsigned int NUM_SUBFOLDERS_TRAIN;
-//    unsigned int NUM_SUBFOLDERS_EVAL;
-    char *FILE_PREFIX;
-    bool DO_COMPRESSION;
+    char *   DATA_FOLDER;
+    //    unsigned int NUM_SUBFOLDERS_TRAIN;
+    //    unsigned int NUM_SUBFOLDERS_EVAL;
+    char *   FILE_PREFIX;
+    bool     DO_COMPRESSION;
     uint32_t COMPRESSION_LEVEL;
-    bool DO_CHUNKING;
+    bool     DO_CHUNKING;
     uint32_t CHUNK_SIZE; // should be a square number
-    bool KEEP_FILES;
+    bool     KEEP_FILES;
 
     // Reader
-//    DATA_LOADER;
+    //    DATA_LOADER;
     uint32_t BATCH_SIZE;
     uint32_t BATCH_SIZE_EVAL;
     uint32_t READ_THREADS;
-//    int COMPUTATION_THREADS;
-//    unsigned int PREFETCH_SIZE;
-    bool DO_SHUFFLE;  // sample shuffle vs file_shuffle
-//    unsigned int TRANSFER_SIZE;
+    //    int COMPUTATION_THREADS;
+    //    unsigned int PREFETCH_SIZE;
+    bool DO_SHUFFLE; // sample shuffle vs file_shuffle
+                     //    unsigned int TRANSFER_SIZE;
     float PREPROCESS_TIME;
     float PREPROCESS_TIME_STDEV;
     // Train
     uint32_t EPOCHS;
-    float COMPUTATION_TIME;
-    float COMPUTATION_TIME_STDEV;
-//    long int TOTAL_TRAINING_STEPS = -1
+    float    COMPUTATION_TIME;
+    float    COMPUTATION_TIME_STDEV;
+    //    long int TOTAL_TRAINING_STEPS = -1
     bool SEED_CHANGE_EPOCH;
-    int RANDOM_SEED;
+    int  RANDOM_SEED;
 
     // Evaluation
-    float EVAL_TIME;
-    float EVAL_TIME_STDEV;
+    float    EVAL_TIME;
+    float    EVAL_TIME_STDEV;
     uint32_t EPOCHS_BETWEEN_EVALS;
 
     // Output
@@ -76,4 +76,4 @@ void delete_directory(const char *dir_path);
 
 void parse_args(int argc, char *argv[]);
 
-#endif //SANDBOX_UTILS_H
+#endif // SANDBOX_UTILS_H
diff --git a/dlio/workers.c b/dlio/workers.c
index b183d784..8f6e2008 100644
--- a/dlio/workers.c
+++ b/dlio/workers.c
@@ -11,8 +11,11 @@
 int pipe_train_task_fd[2], pipe_train_result_fd[2], pipe_eval_task_fd[2], pipe_eval_result_fd[2];
 int pipe_train_system_fd[2], pipe_eval_system_fd[2];
 
-void init_workers(uint32_t *indices_train, uint32_t *indices_eval) {
-    if ((pipe(pipe_train_system_fd) == -1) || (pipe(pipe_train_task_fd) == -1) || (pipe(pipe_train_result_fd) == -1)) {
+void
+init_workers(uint32_t *indices_train, uint32_t *indices_eval)
+{
+    if ((pipe(pipe_train_system_fd) == -1) || (pipe(pipe_train_task_fd) == -1) ||
+        (pipe(pipe_train_result_fd) == -1)) {
         perror("pipe");
         exit(EXIT_FAILURE);
     }
@@ -22,7 +25,8 @@ void init_workers(uint32_t *indices_train, uint32_t *indices_eval) {
         if (pid == -1) {
             perror("fork");
             exit(EXIT_FAILURE);
-        } else if (pid == 0) {
+        }
+        else if (pid == 0) {
             close(pipe_train_task_fd[1]);
             close(pipe_train_result_fd[0]);
             close(pipe_train_system_fd[1]);
@@ -37,7 +41,8 @@ void init_workers(uint32_t *indices_train, uint32_t *indices_eval) {
     }
 
     if (config.DO_EVALUATION) {
-        if ((pipe(pipe_eval_system_fd) == -1) || (pipe(pipe_eval_task_fd) == -1) || (pipe(pipe_eval_result_fd) == -1)) {
+        if ((pipe(pipe_eval_system_fd) == -1) || (pipe(pipe_eval_task_fd) == -1) ||
+            (pipe(pipe_eval_result_fd) == -1)) {
             perror("pipe");
             exit(EXIT_FAILURE);
         }
@@ -47,7 +52,8 @@ void init_workers(uint32_t *indices_train, uint32_t *indices_eval) {
             if (pid == -1) {
                 perror("fork");
                 exit(EXIT_FAILURE);
-            } else if (pid == 0) {
+            }
+            else if (pid == 0) {
                 close(pipe_eval_task_fd[1]);
                 close(pipe_eval_result_fd[0]);
                 close(pipe_eval_system_fd[1]);
@@ -71,31 +77,45 @@ void init_workers(uint32_t *indices_train, uint32_t *indices_eval) {
     close(pipe_train_system_fd[0]);
 }
 
-int get_train_read_fd() {
+int
+get_train_read_fd()
+{
     return pipe_train_result_fd[0];
 }
 
-int get_eval_read_fd() {
+int
+get_eval_read_fd()
+{
     return pipe_eval_result_fd[0];
 }
 
-int get_train_write_fd() {
+int
+get_train_write_fd()
+{
     return pipe_train_task_fd[1];
 }
 
-int get_eval_write_fd() {
+int
+get_eval_write_fd()
+{
     return pipe_eval_task_fd[1];
 }
 
-int get_train_system_fd() {
+int
+get_train_system_fd()
+{
     return pipe_train_system_fd[1];
 }
 
-int get_eval_system_fd() {
+int
+get_eval_system_fd()
+{
     return pipe_eval_system_fd[1];
 }
 
-void fin_workers() {
+void
+fin_workers()
+{
     close(pipe_train_task_fd[1]);
     close(pipe_train_result_fd[0]);
     close(pipe_train_system_fd[1]);
@@ -104,7 +124,6 @@ void fin_workers() {
         close(pipe_eval_task_fd[1]);
         close(pipe_eval_result_fd[0]);
         close(pipe_eval_system_fd[1]);
-
     }
 
     for (uint32_t i = 0; i < config.READ_THREADS; i++) {
@@ -118,7 +137,9 @@ void fin_workers() {
     }
 }
 
-void force_workers_to_shuffle(int read_fd, int write_fd, int system_fd) {
+void
+force_workers_to_shuffle(int read_fd, int write_fd, int system_fd)
+{
     int32_t shuffle_code = -1;
     for (uint32_t i = 0; i < config.READ_THREADS; i++) {
         write(write_fd, &shuffle_code, sizeof(shuffle_code));
@@ -133,16 +154,20 @@ void force_workers_to_shuffle(int read_fd, int write_fd, int system_fd) {
     }
 }
 
-void run_worker(uint32_t *indices, int pipe_task_fd[2], int pipe_result_fd[2], int pipe_system_fd[2], bool is_train_worker) {
+void
+run_worker(uint32_t *indices, int pipe_task_fd[2], int pipe_result_fd[2], int pipe_system_fd[2],
+           bool is_train_worker)
+{
     int32_t batch = 0, current_epoch = 0;
     while (read(pipe_task_fd[0], &batch, sizeof(batch)) > 0) {
         // A new epoch has begun
         if (batch == -1) {
             if (config.SEED_CHANGE_EPOCH) {
-                srand(config.RANDOM_SEED * (is_train_worker? 1: 2) + current_epoch);
+                srand(config.RANDOM_SEED * (is_train_worker ? 1 : 2) + current_epoch);
             }
             if (config.DO_SHUFFLE) {
-                shuffle(indices, config.NUM_SAMPLES_PER_FILE * (is_train_worker? config.NUM_FILES_TRAIN: config.NUM_FILES_EVAL));
+                shuffle(indices, config.NUM_SAMPLES_PER_FILE *
+                                     (is_train_worker ? config.NUM_FILES_TRAIN : config.NUM_FILES_EVAL));
             }
             current_epoch++;
             write(pipe_result_fd[1], &batch, sizeof(batch));
@@ -150,22 +175,18 @@ void run_worker(uint32_t *indices, int pipe_task_fd[2], int pipe_result_fd[2], i
             continue;
         }
 
-        uint32_t read_from = batch * (is_train_worker? config.BATCH_SIZE: config.BATCH_SIZE_EVAL);
-        uint32_t read_to = (batch + 1) * (is_train_worker? config.BATCH_SIZE: config.BATCH_SIZE_EVAL);
+        uint32_t read_from = batch * (is_train_worker ? config.BATCH_SIZE : config.BATCH_SIZE_EVAL);
+        uint32_t read_to   = (batch + 1) * (is_train_worker ? config.BATCH_SIZE : config.BATCH_SIZE_EVAL);
         uint64_t process_metadata_time = 0, process_read_time = 0;
 
         for (uint32_t i = read_from; i < read_to; i++) {
-            uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1;
+            uint32_t file_num   = indices[i] / config.NUM_SAMPLES_PER_FILE + 1;
             uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
-            char file_path[256];
-            snprintf(
-                file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5",
-                 config.DATA_FOLDER,
-                 is_train_worker? config.TRAIN_DATA_FOLDER: config.VALID_DATA_FOLDER,
-                 config.FILE_PREFIX,
-                 file_num,
-                 is_train_worker? config.NUM_FILES_TRAIN: config.NUM_FILES_EVAL
-            );
+            char     file_path[256];
+            snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER,
+                     is_train_worker ? config.TRAIN_DATA_FOLDER : config.VALID_DATA_FOLDER,
+                     config.FILE_PREFIX, file_num,
+                     is_train_worker ? config.NUM_FILES_TRAIN : config.NUM_FILES_EVAL);
 
             uint64_t metadata_time = 0, read_time = 0;
             read_sample(file_path, sample_num, &metadata_time, &read_time);
@@ -176,7 +197,7 @@ void run_worker(uint32_t *indices, int pipe_task_fd[2], int pipe_result_fd[2], i
 
         execution_time_t data = {
             .metadata_time = process_metadata_time,
-            .read_time = process_read_time,
+            .read_time     = process_read_time,
         };
 
         write(pipe_result_fd[1], &data, sizeof(data));
diff --git a/dlio/workers.h b/dlio/workers.h
index 61ed4ac1..62d0ee82 100644
--- a/dlio/workers.h
+++ b/dlio/workers.h
@@ -26,6 +26,7 @@ void fin_workers();
 
 void force_workers_to_shuffle(int read_fd, int write_fd, int system_fd);
 
-void run_worker(uint32_t *indices, int pipe_task_fd[2], int pipe_result_fd[2], int pipe_system_fd[2], bool is_train_worker);
+void run_worker(uint32_t *indices, int pipe_task_fd[2], int pipe_result_fd[2], int pipe_system_fd[2],
+                bool is_train_worker);
 
 #endif // H5BENCH_WORKERS_H

From 5b702e16f661ab8eff42a86958ff1b91b74e7ecc Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Sat, 22 Jun 2024 18:36:31 +0200
Subject: [PATCH 04/37] Add Subfiling VFD support

---
 dlio/README.md         |  3 ++
 dlio/h5bench_dlio.c    | 78 ++++++++++++++++++++++++++-----------
 dlio/stats.c           | 87 ++++++------------------------------------
 dlio/stats.h           |  6 ---
 dlio/utils.c           | 13 +++++++
 dlio/utils.h           |  3 ++
 dlio/workers.c         |  2 +
 samples/sync-dlio.json |  8 ++--
 src/h5bench.py         |  3 ++
 9 files changed, 96 insertions(+), 107 deletions(-)

diff --git a/dlio/README.md b/dlio/README.md
index 460cce4e..68cc9ed4 100644
--- a/dlio/README.md
+++ b/dlio/README.md
@@ -38,6 +38,9 @@ The benchmark is designed to measure the performance of training and evaluation
 - ``--labels-dataset-name <x>``: Name of the dataset with labels [default: *labels*]
 - ``--seed-change-epoch``: Enable seed changes every epoch [default: *false*]
 - ``--read-threads``: The number of workers used to read the data [default: *4*]
+- ``--collective-meta``: Enable collective HDF5 metadata operations [default: *false*]
+- ``--collective-data``: Enable collective HDF5 data operations [default: *false*]
+- ``--subfiling``: Enable HDF5 Subfiling Virtual File Driver [default: *false*]
 
 ### Exerciser Basics
 
diff --git a/dlio/h5bench_dlio.c b/dlio/h5bench_dlio.c
index 5bf4eee1..0117cdb1 100644
--- a/dlio/h5bench_dlio.c
+++ b/dlio/h5bench_dlio.c
@@ -1,7 +1,7 @@
 // TODO:
 // - Add logging
 // - Add vol-async support
-// - Add subfiling support
+// - Add subfiling settings
 // - Add more DLIO features
 // - Add more data loaders: Tensorflow & dali
 // - Add prefetcher configuration?
@@ -9,6 +9,8 @@
 // - Add file shuffle configuration
 // - Add more compression filters
 // - Add drop_last = False setting
+// - Replace fork() with MPI_Comm_spawn()
+// - Add Cache VOL connector support
 
 #include <assert.h>
 #include <hdf5.h>
@@ -26,10 +28,10 @@
 #include "utils.h"
 #include "workers.h"
 
-//#ifdef HAVE_SUBFILING
-//#include "H5FDsubfiling.h"
-//#include "H5FDioc.h"
-//#endif
+#ifdef HAVE_SUBFILING
+#include "H5FDsubfiling.h"
+#include "H5FDioc.h"
+#endif
 
 #define GENERATION_BUFFER_SIZE 2 * 1073741824lu
 
@@ -137,10 +139,13 @@ generate_data()
     hid_t   extra_records_memspace = H5Screate_simple(3, extra_records_count, NULL);
     assert(extra_records_memspace >= 0);
 
-    for (uint32_t i = MY_RANK; i < config.NUM_FILES_TRAIN; i += NUM_RANKS) {
+    uint32_t from = config.SUBFILING? 0: MY_RANK;
+    uint32_t increment = config.SUBFILING? 1: NUM_RANKS;
+
+    for (uint32_t i = from; i < config.NUM_FILES_TRAIN; i += increment) {
         srand(config.RANDOM_SEED + i);
 
-        printf("Generate train file %u / %u\n", i + 1, config.NUM_FILES_TRAIN);
+        if (!config.SUBFILING || config.SUBFILING && (MY_RANK == 0)) printf("Generate train file %u / %u\n", i + 1, config.NUM_FILES_TRAIN);
         char file_name[256];
         snprintf(file_name, sizeof(file_name), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER,
                  config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, i + 1, config.NUM_FILES_TRAIN);
@@ -148,10 +153,10 @@ generate_data()
                       extra_records_memspace);
     }
 
-    for (uint32_t i = MY_RANK; i < config.NUM_FILES_EVAL; i += NUM_RANKS) {
+    for (uint32_t i = from; i < config.NUM_FILES_EVAL; i += increment) {
         srand(config.RANDOM_SEED + config.NUM_FILES_TRAIN + i);
 
-        printf("Generate valid file %u / %u\n", i + 1, config.NUM_FILES_EVAL);
+        if (!config.SUBFILING || config.SUBFILING && (MY_RANK == 0)) printf("Generate valid file %u / %u\n", i + 1, config.NUM_FILES_EVAL);
         char file_name[256];
         snprintf(file_name, sizeof(file_name), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER,
                  config.VALID_DATA_FOLDER, config.FILE_PREFIX, i + 1, config.NUM_FILES_EVAL);
@@ -174,7 +179,7 @@ read_sample(const char *file_path, uint32_t sample, uint64_t *metadata_time_out,
 
     uint64_t t1         = get_time_usec();
     hid_t    file_id    = H5Fopen(file_path, H5F_ACC_RDONLY, FAPL);
-    hid_t    dataset_id = H5Dopen(file_id, config.RECORDS_DATASET_NAME, DXPL);
+    hid_t    dataset_id = H5Dopen(file_id, config.RECORDS_DATASET_NAME, DAPL);
     hid_t    filespace  = H5Dget_space(dataset_id);
     hid_t    memspace   = H5Screate_simple(3, count, NULL);
     H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, count, NULL);
@@ -582,28 +587,55 @@ init_global_variables()
 
     // check if read_threads < batch size and print warning
 
+#ifndef HAVE_SUBFILING
+    config.SUBFILING = false;
+#endif
+
+    FAPL = H5Pcreate(H5P_FILE_ACCESS);
     DCPL = H5Pcreate(H5P_DATASET_CREATE);
-    if (config.DO_CHUNKING) {
+    DAPL = H5Pcreate(H5P_DATASET_ACCESS);
+    DXPL = H5Pcreate(H5P_DATASET_XFER);
+
+    if (config.SUBFILING) {
+        H5Pset_fapl_subfiling(FAPL, NULL);
+        if (config.COLLECTIVE_DATA) {
+            if (MY_RANK == 0) printf("Warning: Collective mode can't be used with subfiling\n");
+            config.COLLECTIVE_DATA = false;
+        }
+        if (config.DO_CHUNKING) {
+            if (MY_RANK == 0) printf("Warning: Chunking can't be used with subfiling\n");
+            config.DO_CHUNKING = false;
+        }
+        if (config.READ_THREADS > 0) {
+            if (MY_RANK == 0) printf("Warning: Multiprocessing can't be used with subfiling. READ_THREADS is set to 0...\n");
+            config.READ_THREADS = 0;
+        }
+    } else if (config.DO_CHUNKING) {
         hsize_t chunk_dims[3] = {1, chunk_dimension, chunk_dimension};
         H5Pset_chunk(DCPL, 3, chunk_dims);
         if (config.DO_COMPRESSION) {
             H5Pset_deflate(DCPL, config.COMPRESSION_LEVEL);
         }
+        if (config.COLLECTIVE_DATA) {
+            if (MY_RANK == 0) printf("Warning: Collective mode can't be used with subfiling\n");
+            config.COLLECTIVE_DATA = false;
+        }
+    } else {
+        H5Pset_fapl_mpio(FAPL, MPI_COMM_SELF, MPI_INFO_NULL);
+        if (config.COLLECTIVE_DATA) {
+            H5Pset_dxpl_mpio(DXPL, H5FD_MPIO_COLLECTIVE);
+        } else {
+            H5Pset_dxpl_mpio(DXPL, H5FD_MPIO_INDEPENDENT);
+        }
     }
 
-    FAPL = H5Pcreate(H5P_FILE_ACCESS);
-//    H5Pset_fapl_mpio(fapl, MPI_COMM_WORLD, MPI_INFO_NULL);
-#if H5_VERSION_GE(1, 10, 0)
-    H5Pset_all_coll_metadata_ops(FAPL, true);
-    H5Pset_coll_metadata_write(FAPL, true);
-#endif
-
-    hid_t DAPL = H5Pcreate(H5P_DATASET_ACCESS);
 #if H5_VERSION_GE(1, 10, 0)
-    H5Pset_all_coll_metadata_ops(DAPL, true);
+    if (config.COLLECTIVE_META) {
+        H5Pset_all_coll_metadata_ops(FAPL, true);
+        H5Pset_coll_metadata_write(FAPL, true);
+        H5Pset_all_coll_metadata_ops(DAPL, true);
+    }
 #endif
-
-    hid_t DXPL = H5Pcreate(H5P_DATASET_XFER);
 }
 
 int
@@ -634,6 +666,7 @@ main(int argc, char *argv[])
         printf("OK\n");
     }
     init_global_variables();
+    MPI_Barrier(MPI_COMM_WORLD);
 
     if (config.DO_DATA_GENERATION) {
         generate_data();
@@ -678,6 +711,7 @@ main(int argc, char *argv[])
     H5Pclose(DXPL);
     H5Pclose(DAPL);
     H5Pclose(FAPL);
+    H5close();
     MPI_Finalize();
     return 0;
 }
diff --git a/dlio/stats.c b/dlio/stats.c
index 0d54811f..29c8e258 100644
--- a/dlio/stats.c
+++ b/dlio/stats.c
@@ -20,13 +20,9 @@ uint32_t *last_proc_eval;
 uint32_t *last_compute_train;
 uint32_t *last_compute_eval;
 
-double AU;
-
 void
 stats_initialize()
 {
-    AU = 0.90;
-
     uint32_t train_steps_count =
         config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE / NUM_RANKS;
     uint32_t train_steps_count_remainder =
@@ -67,8 +63,6 @@ stats_initialize()
         }
         stats[i].throughput.train = 0.0;
         stats[i].throughput.eval  = 0.0;
-        stats[i].au.train         = 0.0;
-        stats[i].au.eval          = 0.0;
         stats[i].compute.train    = (uint64_t *)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
         if (stats[i].compute.train == NULL) {
             exit(1);
@@ -179,8 +173,6 @@ prepare_data()
                    MPI_SUM, 0, MPI_COMM_WORLD);
         MPI_Reduce(stats[i].proc.eval, global_stats[i].proc.eval, EVAL_MAX_STEPS, MPI_UNSIGNED_LONG_LONG,
                    MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(&stats[i].au.train, &global_stats[i].au.train, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(&stats[i].au.eval, &global_stats[i].au.eval, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
         MPI_Reduce(&stats[i].throughput.train, &global_stats[i].throughput.train, 1, MPI_DOUBLE, MPI_SUM, 0,
                    MPI_COMM_WORLD);
         MPI_Reduce(&stats[i].throughput.eval, &global_stats[i].throughput.eval, 1, MPI_DOUBLE, MPI_SUM, 0,
@@ -206,8 +198,6 @@ prepare_data()
             global_stats[i].compute.eval[j] /= NUM_RANKS;
         }
 
-        global_stats[i].au.train /= NUM_RANKS;
-        global_stats[i].au.eval /= NUM_RANKS;
         global_stats[i].throughput.train /= NUM_RANKS;
         global_stats[i].throughput.eval /= NUM_RANKS;
         global_stats[i].observed_time.train /= NUM_RANKS;
@@ -219,12 +209,15 @@ void
 print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eval_metadata_time,
            uint64_t *eval_read_time)
 {
-
     printf("metric, value\n");
     printf("operation, dlio\n");
     printf("ranks, %d\n", NUM_RANKS);
-    //    printf("collective meta");
-    //    printf("collective data");
+    printf("read threads, %d\n", config.READ_THREADS);
+    printf("subfiling, %s\n", config.SUBFILING? "YES": "NO");
+    printf("chunking, %s\n", config.DO_CHUNKING? "YES": "NO");
+    printf("collective meta, %s\n", config.COLLECTIVE_META? "YES": "NO");
+    printf("collective data, %s\n", config.COLLECTIVE_DATA? "YES": "NO");
+
     // Train
     printf("train compute time, \"");
     uint64_t train_total_compute_time = 0;
@@ -275,26 +268,6 @@ print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *e
     }
     printf("\"\n");
 
-    printf("train au percentage, \"");
-    double train_au_mean_percentage = 0.0;
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        train_au_mean_percentage += global_stats[i].au.train;
-        printf("%lf", global_stats[i].au.train);
-        if (i != config.EPOCHS - 1)
-            printf(", ");
-    }
-    train_au_mean_percentage = train_au_mean_percentage / (double)config.EPOCHS;
-    printf("\"\ntrain au mean percentage, %lf\n", train_au_mean_percentage);
-    printf("train au meet expectation, %s\n", train_au_mean_percentage >= 100 * AU ? "success" : "fail");
-
-    double train_au_stdev_percentage = 0.0;
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        train_au_stdev_percentage += (global_stats[i].au.train - train_au_mean_percentage) *
-                                     (global_stats[i].au.train - train_au_mean_percentage);
-    }
-    train_au_stdev_percentage = sqrt(train_au_stdev_percentage / (double)config.EPOCHS);
-    printf("train au stdev percentage, %lf\n", train_au_stdev_percentage);
-
     printf("train throughput samples per second, \"");
     double train_throughput_mean_samples_per_second = 0.0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
@@ -374,26 +347,6 @@ print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *e
     }
     printf("\"\n");
 
-    printf("eval au percentage, \"");
-    double eval_au_mean_percentage = 0.0;
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        eval_au_mean_percentage += global_stats[i].au.eval;
-        printf("%lf", global_stats[i].au.eval);
-        if (i != config.EPOCHS - 1)
-            printf(", ");
-    }
-    eval_au_mean_percentage = eval_au_mean_percentage / (double)config.EPOCHS;
-    printf("\"\neval au mean percentage, %lf\n", eval_au_mean_percentage);
-    printf("eval au meet expectation, %s\n", eval_au_mean_percentage >= 100 * AU ? "success" : "fail");
-
-    double eval_au_stdev_percentage = 0.0;
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        eval_au_stdev_percentage += (global_stats[i].au.eval - eval_au_mean_percentage) *
-                                    (global_stats[i].au.eval - eval_au_mean_percentage);
-    }
-    eval_au_stdev_percentage = sqrt(eval_au_stdev_percentage / (double)config.EPOCHS);
-    printf("eval au stdev percentage, %lf\n", eval_au_stdev_percentage);
-
     printf("eval throughput samples per second, \"");
     double eval_throughput_mean_samples_per_second = 0.0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
@@ -422,6 +375,9 @@ print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *e
     double eval_io_stdev_MB_per_second =
         eval_throughput_stdev_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
     printf("eval io stdev MB per second, %lf\n", eval_io_stdev_MB_per_second);
+
+    // TODO: EVALUATION_TIME & PREPROCESS_TIME
+    printf("total compute time, %lf", (train_total_compute_time + eval_total_compute_time) / 1000000.0);
 }
 
 void
@@ -460,18 +416,7 @@ void
 end_train(uint32_t epoch)
 {
     uint64_t end_time           = get_time_usec();
-    uint64_t total_compute_time = 0;
-    double   au                 = 0.0;
-
-    for (int i = 0; i < TRAIN_MAX_STEPS; i++) {
-        total_compute_time += stats[epoch].compute.train[i];
-    }
-    if (total_compute_time > 0) {
-        stats[epoch].observed_time.train = end_time - stats[epoch].start_time.train;
-        au                               = (double)total_compute_time / stats[epoch].observed_time.train;
-    }
-
-    stats[epoch].au.train = au * 100;
+    stats[epoch].observed_time.train = end_time - stats[epoch].start_time.train;
     stats[epoch].throughput.train =
         (double)TRAIN_MAX_STEPS * config.BATCH_SIZE * 1000000.0 / (end_time - stats[epoch].start_time.train);
 }
@@ -486,17 +431,7 @@ void
 end_eval(uint32_t epoch)
 {
     uint64_t end_time           = get_time_usec();
-    uint64_t total_compute_time = 0;
-    double   au                 = 0.0;
-
-    for (int i = 0; i < EVAL_MAX_STEPS; i++) {
-        total_compute_time += stats[epoch].compute.eval[i];
-    }
-    if (total_compute_time > 0) {
-        stats[epoch].observed_time.eval = end_time - stats[epoch].start_time.eval;
-        au                              = (double)total_compute_time / stats[epoch].observed_time.eval;
-    }
-    stats[epoch].au.eval         = au * 100;
+    stats[epoch].observed_time.eval = end_time - stats[epoch].start_time.eval;
     stats[epoch].throughput.eval = (double)EVAL_MAX_STEPS * config.BATCH_SIZE_EVAL * 1000000.0 /
                                    (end_time - stats[epoch].start_time.eval);
 }
diff --git a/dlio/stats.h b/dlio/stats.h
index 20c817c3..f0f3fd14 100644
--- a/dlio/stats.h
+++ b/dlio/stats.h
@@ -16,11 +16,6 @@ struct throughput_data {
     double eval;
 };
 
-struct au_data {
-    double train;
-    double eval;
-};
-
 struct compute_data {
     uint64_t *train;
     uint64_t *eval;
@@ -41,7 +36,6 @@ typedef struct epoch_data {
     struct load_data          load;
     struct proc_data          proc;
     struct throughput_data    throughput;
-    struct au_data            au;
     struct compute_data       compute;
     struct observed_time_data observed_time;
 } epoch_data_t;
diff --git a/dlio/utils.c b/dlio/utils.c
index 9f5a9e3c..90b8a6f4 100644
--- a/dlio/utils.c
+++ b/dlio/utils.c
@@ -41,6 +41,9 @@ config_datatype_t config = {
     .DO_CHUNKING       = false,
     .CHUNK_SIZE        = 1024, // should be greater than 120 on CLAIX23
     .KEEP_FILES        = false,
+    .COLLECTIVE_META      = false,
+    .COLLECTIVE_DATA      = false,
+    .SUBFILING            = false,
 
     // Reader
     //    .DATA_LOADER = PYTORCH,
@@ -53,6 +56,7 @@ config_datatype_t config = {
                          //    .TRANSFER_SIZE = 262144,
     .PREPROCESS_TIME       = 0.0f,
     .PREPROCESS_TIME_STDEV = 0.000f,
+
     // Train
     .EPOCHS                 = 5,
     .COMPUTATION_TIME       = 0.323f,
@@ -270,6 +274,15 @@ parse_args(int argc, char *argv[])
             i++;
             config.READ_THREADS = atoi(argv[i]);
         }
+        else if (strcmp(argv[i], "--collective-meta") == 0) {
+            config.COLLECTIVE_META = true;
+        }
+        else if (strcmp(argv[i], "--collective-data") == 0) {
+            config.COLLECTIVE_DATA = true;
+        }
+        else if (strcmp(argv[i], "--subfiling") == 0) {
+            config.SUBFILING = true;
+        }
         else {
             printf("WARNING: %s not found\n", argv[i]);
         }
diff --git a/dlio/utils.h b/dlio/utils.h
index cb861262..f6c2b6fe 100644
--- a/dlio/utils.h
+++ b/dlio/utils.h
@@ -32,6 +32,9 @@ typedef struct config_datatype {
     bool     DO_CHUNKING;
     uint32_t CHUNK_SIZE; // should be a square number
     bool     KEEP_FILES;
+    bool COLLECTIVE_META;
+    bool COLLECTIVE_DATA;
+    bool SUBFILING;
 
     // Reader
     //    DATA_LOADER;
diff --git a/dlio/workers.c b/dlio/workers.c
index 8f6e2008..7e9f544d 100644
--- a/dlio/workers.c
+++ b/dlio/workers.c
@@ -1,3 +1,5 @@
+// TODO: handle errors in child processes
+
 #include <stdint.h>
 #include <stdlib.h>
 #include <sys/types.h>
diff --git a/samples/sync-dlio.json b/samples/sync-dlio.json
index d454cd37..c50f3a5a 100644
--- a/samples/sync-dlio.json
+++ b/samples/sync-dlio.json
@@ -15,7 +15,7 @@
       "benchmark": "dlio",
       "configuration": {
         "generate-data": "true",
-        "chunking": "true",
+        "chunking": "false",
         "keep-files": "true",
         "compression": "true",
         "record-length": "67108864",
@@ -24,7 +24,6 @@
         "num-samples-per-file": "4",
         "data-folder": "data",
         "file-prefix": "img",
-        "chunk-size": "1024",
         "compression-level": "4",
         "random-seed": "42",
         "train-data-folder": "train",
@@ -62,7 +61,10 @@
         "train-data-folder": "train",
         "valid-data-folder": "valid",
         "records-dataset-name": "records",
-        "labels-dataset-name": "labels"
+        "labels-dataset-name": "labels",
+        "collective-meta": "true",
+        "collective-data": "true",
+        "subfiling": "false"
       }
     }
   ]
diff --git a/src/h5bench.py b/src/h5bench.py
index f44d19f2..ac65536f 100755
--- a/src/h5bench.py
+++ b/src/h5bench.py
@@ -978,6 +978,9 @@ def run_dlio(self, id, setup):
                 'compression',
                 'shuffle',
                 'seed-change-epoch',
+                'collective-meta',
+                'collective-data',
+                'subfiling',
             ]
 
             # Create the configuration parameter list

From b2d829bd2c31041b05dad60068074c426305c150 Mon Sep 17 00:00:00 2001
From: github-actions <github-actions[bot]@users.noreply.github.com>
Date: Sat, 22 Jun 2024 16:36:58 +0000
Subject: [PATCH 05/37] Committing clang-format changes

---
 dlio/h5bench_dlio.c | 32 +++++++++++++++++++++-----------
 dlio/stats.c        | 14 +++++++-------
 dlio/utils.c        |  6 +++---
 dlio/utils.h        |  6 +++---
 4 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/dlio/h5bench_dlio.c b/dlio/h5bench_dlio.c
index 0117cdb1..69447da1 100644
--- a/dlio/h5bench_dlio.c
+++ b/dlio/h5bench_dlio.c
@@ -139,13 +139,14 @@ generate_data()
     hid_t   extra_records_memspace = H5Screate_simple(3, extra_records_count, NULL);
     assert(extra_records_memspace >= 0);
 
-    uint32_t from = config.SUBFILING? 0: MY_RANK;
-    uint32_t increment = config.SUBFILING? 1: NUM_RANKS;
+    uint32_t from      = config.SUBFILING ? 0 : MY_RANK;
+    uint32_t increment = config.SUBFILING ? 1 : NUM_RANKS;
 
     for (uint32_t i = from; i < config.NUM_FILES_TRAIN; i += increment) {
         srand(config.RANDOM_SEED + i);
 
-        if (!config.SUBFILING || config.SUBFILING && (MY_RANK == 0)) printf("Generate train file %u / %u\n", i + 1, config.NUM_FILES_TRAIN);
+        if (!config.SUBFILING || (MY_RANK == 0))
+            printf("Generate train file %u / %u\n", i + 1, config.NUM_FILES_TRAIN);
         char file_name[256];
         snprintf(file_name, sizeof(file_name), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER,
                  config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, i + 1, config.NUM_FILES_TRAIN);
@@ -156,7 +157,8 @@ generate_data()
     for (uint32_t i = from; i < config.NUM_FILES_EVAL; i += increment) {
         srand(config.RANDOM_SEED + config.NUM_FILES_TRAIN + i);
 
-        if (!config.SUBFILING || config.SUBFILING && (MY_RANK == 0)) printf("Generate valid file %u / %u\n", i + 1, config.NUM_FILES_EVAL);
+        if (!config.SUBFILING || (MY_RANK == 0))
+            printf("Generate valid file %u / %u\n", i + 1, config.NUM_FILES_EVAL);
         char file_name[256];
         snprintf(file_name, sizeof(file_name), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER,
                  config.VALID_DATA_FOLDER, config.FILE_PREFIX, i + 1, config.NUM_FILES_EVAL);
@@ -599,32 +601,40 @@ init_global_variables()
     if (config.SUBFILING) {
         H5Pset_fapl_subfiling(FAPL, NULL);
         if (config.COLLECTIVE_DATA) {
-            if (MY_RANK == 0) printf("Warning: Collective mode can't be used with subfiling\n");
+            if (MY_RANK == 0)
+                printf("Warning: Collective mode can't be used with subfiling\n");
             config.COLLECTIVE_DATA = false;
         }
         if (config.DO_CHUNKING) {
-            if (MY_RANK == 0) printf("Warning: Chunking can't be used with subfiling\n");
+            if (MY_RANK == 0)
+                printf("Warning: Chunking can't be used with subfiling\n");
             config.DO_CHUNKING = false;
         }
         if (config.READ_THREADS > 0) {
-            if (MY_RANK == 0) printf("Warning: Multiprocessing can't be used with subfiling. READ_THREADS is set to 0...\n");
+            if (MY_RANK == 0)
+                printf(
+                    "Warning: Multiprocessing can't be used with subfiling. READ_THREADS is set to 0...\n");
             config.READ_THREADS = 0;
         }
-    } else if (config.DO_CHUNKING) {
+    }
+    else if (config.DO_CHUNKING) {
         hsize_t chunk_dims[3] = {1, chunk_dimension, chunk_dimension};
         H5Pset_chunk(DCPL, 3, chunk_dims);
         if (config.DO_COMPRESSION) {
             H5Pset_deflate(DCPL, config.COMPRESSION_LEVEL);
         }
         if (config.COLLECTIVE_DATA) {
-            if (MY_RANK == 0) printf("Warning: Collective mode can't be used with subfiling\n");
+            if (MY_RANK == 0)
+                printf("Warning: Collective mode can't be used with chunking\n");
             config.COLLECTIVE_DATA = false;
         }
-    } else {
+    }
+    else {
         H5Pset_fapl_mpio(FAPL, MPI_COMM_SELF, MPI_INFO_NULL);
         if (config.COLLECTIVE_DATA) {
             H5Pset_dxpl_mpio(DXPL, H5FD_MPIO_COLLECTIVE);
-        } else {
+        }
+        else {
             H5Pset_dxpl_mpio(DXPL, H5FD_MPIO_INDEPENDENT);
         }
     }
diff --git a/dlio/stats.c b/dlio/stats.c
index 29c8e258..d63e2405 100644
--- a/dlio/stats.c
+++ b/dlio/stats.c
@@ -213,10 +213,10 @@ print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *e
     printf("operation, dlio\n");
     printf("ranks, %d\n", NUM_RANKS);
     printf("read threads, %d\n", config.READ_THREADS);
-    printf("subfiling, %s\n", config.SUBFILING? "YES": "NO");
-    printf("chunking, %s\n", config.DO_CHUNKING? "YES": "NO");
-    printf("collective meta, %s\n", config.COLLECTIVE_META? "YES": "NO");
-    printf("collective data, %s\n", config.COLLECTIVE_DATA? "YES": "NO");
+    printf("subfiling, %s\n", config.SUBFILING ? "YES" : "NO");
+    printf("chunking, %s\n", config.DO_CHUNKING ? "YES" : "NO");
+    printf("collective meta, %s\n", config.COLLECTIVE_META ? "YES" : "NO");
+    printf("collective data, %s\n", config.COLLECTIVE_DATA ? "YES" : "NO");
 
     // Train
     printf("train compute time, \"");
@@ -415,7 +415,7 @@ start_train(uint32_t epoch)
 void
 end_train(uint32_t epoch)
 {
-    uint64_t end_time           = get_time_usec();
+    uint64_t end_time                = get_time_usec();
     stats[epoch].observed_time.train = end_time - stats[epoch].start_time.train;
     stats[epoch].throughput.train =
         (double)TRAIN_MAX_STEPS * config.BATCH_SIZE * 1000000.0 / (end_time - stats[epoch].start_time.train);
@@ -430,8 +430,8 @@ start_eval(uint32_t epoch)
 void
 end_eval(uint32_t epoch)
 {
-    uint64_t end_time           = get_time_usec();
+    uint64_t end_time               = get_time_usec();
     stats[epoch].observed_time.eval = end_time - stats[epoch].start_time.eval;
-    stats[epoch].throughput.eval = (double)EVAL_MAX_STEPS * config.BATCH_SIZE_EVAL * 1000000.0 /
+    stats[epoch].throughput.eval    = (double)EVAL_MAX_STEPS * config.BATCH_SIZE_EVAL * 1000000.0 /
                                    (end_time - stats[epoch].start_time.eval);
 }
diff --git a/dlio/utils.c b/dlio/utils.c
index 90b8a6f4..47b5e938 100644
--- a/dlio/utils.c
+++ b/dlio/utils.c
@@ -41,9 +41,9 @@ config_datatype_t config = {
     .DO_CHUNKING       = false,
     .CHUNK_SIZE        = 1024, // should be greater than 120 on CLAIX23
     .KEEP_FILES        = false,
-    .COLLECTIVE_META      = false,
-    .COLLECTIVE_DATA      = false,
-    .SUBFILING            = false,
+    .COLLECTIVE_META   = false,
+    .COLLECTIVE_DATA   = false,
+    .SUBFILING         = false,
 
     // Reader
     //    .DATA_LOADER = PYTORCH,
diff --git a/dlio/utils.h b/dlio/utils.h
index f6c2b6fe..f2fcc338 100644
--- a/dlio/utils.h
+++ b/dlio/utils.h
@@ -32,9 +32,9 @@ typedef struct config_datatype {
     bool     DO_CHUNKING;
     uint32_t CHUNK_SIZE; // should be a square number
     bool     KEEP_FILES;
-    bool COLLECTIVE_META;
-    bool COLLECTIVE_DATA;
-    bool SUBFILING;
+    bool     COLLECTIVE_META;
+    bool     COLLECTIVE_DATA;
+    bool     SUBFILING;
 
     // Reader
     //    DATA_LOADER;

From 4cf969c093cb54d32e1be76a946d83d39b95acab Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Thu, 27 Jun 2024 19:09:06 +0200
Subject: [PATCH 06/37] Update the method of performance calculation

---
 dlio/h5bench_dlio.c | 112 +++++++-------------
 dlio/h5bench_dlio.h |   9 +-
 dlio/stats.c        | 253 ++++++++++++++++++++++++++++----------------
 dlio/stats.h        |  20 +++-
 dlio/utils.c        |  10 ++
 dlio/utils.h        |   8 ++
 6 files changed, 240 insertions(+), 172 deletions(-)

diff --git a/dlio/h5bench_dlio.c b/dlio/h5bench_dlio.c
index 69447da1..33f1626d 100644
--- a/dlio/h5bench_dlio.c
+++ b/dlio/h5bench_dlio.c
@@ -231,12 +231,10 @@ void
 eval_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out,
                      uint64_t *local_read_time_out)
 {
-    uint32_t batches_per_rank =
-        config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE_EVAL / NUM_RANKS;
-    uint32_t offset = MY_RANK * batches_per_rank;
+    uint32_t offset = MY_RANK * config.NUM_EVAL_BATCHES_PER_RANK;
 
     uint64_t t0 = get_time_usec();
-    for (uint32_t i = 0; i < batches_per_rank; i++) {
+    for (uint32_t i = 0; i < config.NUM_EVAL_BATCHES_PER_RANK; i++) {
         for (uint32_t j = 0; j < config.BATCH_SIZE_EVAL; j++) {
             uint32_t file_num =
                 indices[offset + i * config.BATCH_SIZE_EVAL + j] / config.NUM_SAMPLES_PER_FILE + 1;
@@ -303,17 +301,15 @@ eval_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *
 {
     force_workers_to_shuffle(get_eval_read_fd(), get_eval_write_fd(), get_eval_system_fd());
 
-    uint32_t batches_per_rank =
-        config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE_EVAL / NUM_RANKS;
-    uint32_t offset = MY_RANK * batches_per_rank;
+    uint32_t offset = MY_RANK * config.NUM_EVAL_BATCHES_PER_RANK;
 
     for (uint32_t i = 0;
-         i < (config.READ_THREADS > batches_per_rank ? batches_per_rank : config.READ_THREADS); i++) {
+         i < (config.READ_THREADS > config.NUM_EVAL_BATCHES_PER_RANK ? config.NUM_EVAL_BATCHES_PER_RANK : config.READ_THREADS); i++) {
         int32_t batch = offset + i;
         write(get_eval_write_fd(), &batch, sizeof(batch));
     }
 
-    for (uint32_t i = config.READ_THREADS; i < batches_per_rank; i++) {
+    for (uint32_t i = config.READ_THREADS; i < config.NUM_EVAL_BATCHES_PER_RANK; i++) {
         execution_time_t data_from_child_process;
         uint64_t         t0 = get_time_usec();
         read(get_eval_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
@@ -326,13 +322,13 @@ eval_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *
         int32_t batch = offset + i;
         write(get_eval_write_fd(), &batch, sizeof(batch));
 
-        uint64_t t = compute(config.EVAL_TIME_STDEV, config.EVAL_TIME_STDEV);
+        uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV);
         batch_processed_eval(epoch, t, t0);
         MPI_Barrier(MPI_COMM_WORLD);
     }
 
     for (uint32_t i = 0;
-         i < (config.READ_THREADS > batches_per_rank ? batches_per_rank : config.READ_THREADS); i++) {
+         i < (config.READ_THREADS > config.NUM_EVAL_BATCHES_PER_RANK ? config.NUM_EVAL_BATCHES_PER_RANK : config.READ_THREADS); i++) {
         execution_time_t data_from_child_process;
         uint64_t         t0 = get_time_usec();
         read(get_eval_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
@@ -342,20 +338,20 @@ eval_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *
         *local_metadata_time_out += data_from_child_process.metadata_time;
         *local_read_time_out += data_from_child_process.read_time;
 
-        uint64_t t = compute(config.EVAL_TIME_STDEV, config.EVAL_TIME_STDEV);
+        uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV);
         batch_processed_eval(epoch, t, t0);
         MPI_Barrier(MPI_COMM_WORLD);
     }
 }
 
 void
-eval(uint32_t epoch, uint32_t *indices, uint64_t *local_eval_metadata_time, uint64_t *local_eval_read_time,
-     bool enable_multiprocessing)
+eval(uint32_t epoch, uint32_t *indices, bool enable_multiprocessing)
 {
+    uint64_t eval_metadata_time = 0, eval_read_time = 0;
     if (enable_multiprocessing) {
         start_eval(epoch);
-        eval_using_workers(epoch, local_eval_metadata_time, local_eval_read_time);
-        end_eval(epoch);
+        eval_using_workers(epoch, &eval_metadata_time, &eval_read_time);
+        end_eval(epoch, eval_metadata_time, eval_read_time);
         return;
     }
 
@@ -365,20 +361,18 @@ eval(uint32_t epoch, uint32_t *indices, uint64_t *local_eval_metadata_time, uint
         shuffle(indices, config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE);
 
     start_eval(epoch);
-    eval_without_workers(epoch, indices, local_eval_metadata_time, local_eval_read_time);
-    end_eval(epoch);
+    eval_without_workers(epoch, indices, &eval_metadata_time, &eval_read_time);
+    end_eval(epoch, eval_metadata_time, eval_read_time);
 }
 
 void
 train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out,
                       uint64_t *local_read_time_out)
 {
-    uint32_t batches_per_rank =
-        config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE / NUM_RANKS;
-    uint32_t offset = MY_RANK * batches_per_rank;
+    uint32_t offset = MY_RANK * config.NUM_TRAIN_BATCHES_PER_RANK;
 
     uint64_t t0 = get_time_usec();
-    for (uint32_t i = 0; i < batches_per_rank; i++) {
+    for (uint32_t i = 0; i < config.NUM_TRAIN_BATCHES_PER_RANK; i++) {
         for (uint32_t j = 0; j < config.BATCH_SIZE; j++) {
             uint32_t file_num = indices[offset + i * config.BATCH_SIZE + j] / config.NUM_SAMPLES_PER_FILE + 1;
             uint32_t sample_num = indices[offset + i * config.BATCH_SIZE + j] % config.NUM_SAMPLES_PER_FILE;
@@ -442,17 +436,14 @@ void
 train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out)
 {
     force_workers_to_shuffle(get_train_read_fd(), get_train_write_fd(), get_train_system_fd());
-    uint32_t batches_per_rank =
-        config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE / NUM_RANKS;
-    uint32_t offset = MY_RANK * batches_per_rank;
+    uint32_t offset = MY_RANK * config.NUM_TRAIN_BATCHES_PER_RANK;
 
-    for (uint32_t i = 0;
-         i < (config.READ_THREADS > batches_per_rank ? batches_per_rank : config.READ_THREADS); i++) {
+    for (uint32_t i = 0; i < config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN; i++) {
         int32_t batch = offset + i;
         write(get_train_write_fd(), &batch, sizeof(batch));
     }
 
-    for (uint32_t i = config.READ_THREADS; i < batches_per_rank; i++) {
+    for (uint32_t i = config.READ_THREADS; i < config.NUM_TRAIN_BATCHES_PER_RANK; i++) {
         execution_time_t data_from_child_process;
         uint64_t         t0 = get_time_usec();
         read(get_train_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
@@ -470,8 +461,7 @@ train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t
         MPI_Barrier(MPI_COMM_WORLD);
     }
 
-    for (uint32_t i = 0;
-         i < (config.READ_THREADS > batches_per_rank ? batches_per_rank : config.READ_THREADS); i++) {
+    for (uint32_t i = 0; i < config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN; i++) {
         execution_time_t data_from_child_process;
         uint64_t         t0 = get_time_usec();
         read(get_train_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
@@ -488,13 +478,13 @@ train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t
 }
 
 void
-train(uint32_t epoch, uint32_t *indices, uint64_t *local_train_metadata_time, uint64_t *local_train_read_time,
-      bool enable_multiprocessing)
+train(uint32_t epoch, uint32_t *indices, bool enable_multiprocessing)
 {
+    uint64_t train_metadata_time = 0, train_read_time = 0;
     if (enable_multiprocessing) {
         start_train(epoch);
-        train_using_workers(epoch, local_train_metadata_time, local_train_read_time);
-        end_train(epoch);
+        train_using_workers(epoch, &train_metadata_time, &train_read_time);
+        end_train(epoch, train_metadata_time, train_read_time);
         return;
     }
 
@@ -504,13 +494,12 @@ train(uint32_t epoch, uint32_t *indices, uint64_t *local_train_metadata_time, ui
         shuffle(indices, config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE);
 
     start_train(epoch);
-    train_without_workers(epoch, indices, local_train_metadata_time, local_train_read_time);
-    end_train(epoch);
+    train_without_workers(epoch, indices, &train_metadata_time, &train_read_time);
+    end_train(epoch, train_metadata_time, train_read_time);
 }
 
 void
-run(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eval_metadata_time,
-    uint64_t *eval_read_time)
+run()
 {
     uint32_t  total_train_samples = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE;
     uint32_t *indices_train       = (uint32_t *)malloc(total_train_samples * sizeof(uint32_t));
@@ -530,10 +519,7 @@ run(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eval_met
         indices_eval[i] = i;
     }
 
-    uint64_t local_train_metadata_time = 0, local_train_read_time = 0, local_eval_metadata_time = 0,
-             local_eval_read_time = 0;
-    uint32_t next_eval_epoch      = config.EPOCHS_BETWEEN_EVALS;
-
+    uint32_t next_eval_epoch    = config.EPOCHS_BETWEEN_EVALS;
     bool enable_multiprocessing = config.READ_THREADS > 0;
     if (enable_multiprocessing) {
         init_workers(indices_train, indices_eval);
@@ -542,15 +528,13 @@ run(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eval_met
     MPI_Barrier(MPI_COMM_WORLD);
 
     for (uint32_t epoch = 0; epoch < config.EPOCHS; epoch++) {
-        //        if (MY_RANK == 0) printf("New Epoch %u\n", epoch + 1);
+        if (MY_RANK == 0) printf("New Epoch %u\n", epoch + 1);
 
-        train(epoch, indices_train, &local_train_metadata_time, &local_train_read_time,
-              enable_multiprocessing);
+        train(epoch, indices_train, enable_multiprocessing);
         MPI_Barrier(MPI_COMM_WORLD);
 
         if (config.DO_EVALUATION && (epoch + 1 >= next_eval_epoch)) {
-            eval(epoch, indices_eval, &local_eval_metadata_time, &local_eval_read_time,
-                 enable_multiprocessing);
+            eval(epoch, indices_eval, enable_multiprocessing);
             next_eval_epoch += config.EPOCHS_BETWEEN_EVALS;
             MPI_Barrier(MPI_COMM_WORLD);
         }
@@ -559,13 +543,6 @@ run(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eval_met
         fin_workers();
     }
 
-    MPI_Reduce(&local_train_metadata_time, train_metadata_time, 1, MPI_UNSIGNED_LONG, MPI_SUM, 0,
-               MPI_COMM_WORLD);
-    MPI_Reduce(&local_train_read_time, train_read_time, 1, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
-    MPI_Reduce(&local_eval_metadata_time, eval_metadata_time, 1, MPI_UNSIGNED_LONG, MPI_SUM, 0,
-               MPI_COMM_WORLD);
-    MPI_Reduce(&local_eval_read_time, eval_read_time, 1, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
-
     free(indices_train);
     free(indices_eval);
 
@@ -585,9 +562,16 @@ init_global_variables()
     uint32_t data_length = config.RECORD_LENGTH * config.NUM_SAMPLES_PER_FILE;
     GENERATION_SIZE      = data_length > GENERATION_BUFFER_SIZE ? GENERATION_BUFFER_SIZE : data_length;
 
+    config.NUM_TRAIN_BATCHES_PER_RANK = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / NUM_RANKS / config.BATCH_SIZE;
+    config.NUM_EVAL_BATCHES_PER_RANK = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / NUM_RANKS / config.BATCH_SIZE_EVAL;
+
+    config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN = config.READ_THREADS > config.NUM_TRAIN_BATCHES_PER_RANK ? config.NUM_TRAIN_BATCHES_PER_RANK : config.READ_THREADS;
+    config.NUM_OF_ACTUALLY_USED_PROCESSES_EVAL = config.READ_THREADS > config.NUM_EVAL_BATCHES_PER_RANK ? config.NUM_EVAL_BATCHES_PER_RANK : config.READ_THREADS;
+
     srand(config.RANDOM_SEED);
 
     // check if read_threads < batch size and print warning
+    // drop last warning
 
 #ifndef HAVE_SUBFILING
     config.SUBFILING = false;
@@ -686,28 +670,12 @@ main(int argc, char *argv[])
         // TODO: check files dimension if generate=no
         stats_initialize();
 
-        uint64_t train_metadata_time = 0, train_read_time = 0, eval_metadata_time = 0, eval_read_time = 0;
-        run(&train_metadata_time, &train_read_time, &eval_metadata_time, &eval_read_time);
+        run();
         prepare_data();
-
-        MPI_Reduce(MY_RANK == 0 ? MPI_IN_PLACE : &train_metadata_time, &train_metadata_time, 1,
-                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(MY_RANK == 0 ? MPI_IN_PLACE : &train_read_time, &train_read_time, 1,
-                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(MY_RANK == 0 ? MPI_IN_PLACE : &eval_metadata_time, &eval_metadata_time, 1,
-                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(MY_RANK == 0 ? MPI_IN_PLACE : &eval_read_time, &eval_read_time, 1, MPI_UNSIGNED_LONG_LONG,
-                   MPI_SUM, 0, MPI_COMM_WORLD);
-
-        train_metadata_time /= NUM_RANKS / (config.READ_THREADS > 0 ? config.READ_THREADS : 1);
-        train_read_time /= NUM_RANKS / (config.READ_THREADS > 0 ? config.READ_THREADS : 1);
-        eval_metadata_time /= NUM_RANKS / (config.READ_THREADS > 0 ? config.READ_THREADS : 1);
-        eval_read_time /= NUM_RANKS / (config.READ_THREADS > 0 ? config.READ_THREADS : 1);
-
         MPI_Barrier(MPI_COMM_WORLD);
 
         if (MY_RANK == 0) {
-            print_data(&train_metadata_time, &train_read_time, &eval_metadata_time, &eval_read_time);
+            print_data();
         }
 
         stats_finalize();
diff --git a/dlio/h5bench_dlio.h b/dlio/h5bench_dlio.h
index ca38e895..61ca8ce8 100644
--- a/dlio/h5bench_dlio.h
+++ b/dlio/h5bench_dlio.h
@@ -24,19 +24,16 @@ void eval_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_met
 
 void eval_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out);
 
-void eval(uint32_t epoch, uint32_t *indices, uint64_t *local_eval_metadata_time,
-          uint64_t *local_eval_read_time, bool enable_multiprocessing);
+void eval(uint32_t epoch, uint32_t *indices, bool enable_multiprocessing);
 
 void train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out,
                            uint64_t *local_read_time_out);
 
 void train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out);
 
-void train(uint32_t epoch, uint32_t *indices, uint64_t *local_train_metadata_time,
-           uint64_t *local_train_read_time, bool enable_multiprocessing);
+void train(uint32_t epoch, uint32_t *indices, bool enable_multiprocessing);
 
-void run(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eval_metadata_time,
-         uint64_t *eval_read_time);
+void run();
 
 void init_global_variables();
 
diff --git a/dlio/stats.c b/dlio/stats.c
index d63e2405..0ce2bcae 100644
--- a/dlio/stats.c
+++ b/dlio/stats.c
@@ -8,8 +8,6 @@
 #include "stats.h"
 #include "utils.h"
 
-uint32_t      TRAIN_MAX_STEPS;
-uint32_t      EVAL_MAX_STEPS;
 epoch_data_t *stats;
 epoch_data_t *global_stats;
 
@@ -23,56 +21,45 @@ uint32_t *last_compute_eval;
 void
 stats_initialize()
 {
-    uint32_t train_steps_count =
-        config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE / NUM_RANKS;
-    uint32_t train_steps_count_remainder =
-        config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE % (config.BATCH_SIZE * NUM_RANKS);
-    uint32_t eval_steps_count =
-        config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE_EVAL / NUM_RANKS;
-    uint32_t eval_steps_count_remainder =
-        config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE % (config.BATCH_SIZE_EVAL * NUM_RANKS);
-
-    TRAIN_MAX_STEPS = train_steps_count;
-    EVAL_MAX_STEPS  = eval_steps_count;
-
     //    TODO: drop_last = False
-    //    TRAIN_MAX_STEPS = train_steps_count + (train_steps_count_remainder > 0);
-    //    EVAL_MAX_STEPS = eval_steps_count + (eval_steps_count_remainder > 0);
-
     stats = (struct epoch_data *)malloc(config.EPOCHS * sizeof(struct epoch_data));
     if (stats == NULL) {
         exit(1);
     }
 
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        stats[i].load.train = (uint64_t *)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        stats[i].load.train = (uint64_t *)calloc(config.NUM_TRAIN_BATCHES_PER_RANK, sizeof(uint64_t));
         if (stats[i].load.train == NULL) {
             exit(1);
         }
-        stats[i].load.eval = (uint64_t *)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        stats[i].load.eval = (uint64_t *)calloc(config.NUM_EVAL_BATCHES_PER_RANK, sizeof(uint64_t));
         if (stats[i].load.eval == NULL) {
             exit(1);
         }
-        stats[i].proc.train = (uint64_t *)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        stats[i].proc.train = (uint64_t *)calloc(config.NUM_TRAIN_BATCHES_PER_RANK, sizeof(uint64_t));
         if (stats[i].proc.train == NULL) {
             exit(1);
         }
-        stats[i].proc.eval = (uint64_t *)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        stats[i].proc.eval = (uint64_t *)calloc(config.NUM_EVAL_BATCHES_PER_RANK, sizeof(uint64_t));
         if (stats[i].proc.eval == NULL) {
             exit(1);
         }
         stats[i].throughput.train = 0.0;
         stats[i].throughput.eval  = 0.0;
-        stats[i].compute.train    = (uint64_t *)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        stats[i].compute.train    = (uint64_t *)calloc(config.NUM_TRAIN_BATCHES_PER_RANK, sizeof(uint64_t));
         if (stats[i].compute.train == NULL) {
             exit(1);
         }
-        stats[i].compute.eval = (uint64_t *)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        stats[i].compute.eval = (uint64_t *)calloc(config.NUM_EVAL_BATCHES_PER_RANK, sizeof(uint64_t));
         if (stats[i].compute.eval == NULL) {
             exit(1);
         }
         stats[i].observed_time.train = 0;
         stats[i].observed_time.eval  = 0;
+        stats[i].metadata_time.train = 0;
+        stats[i].metadata_time.eval  = 0;
+        stats[i].raw_read_time.train = 0;
+        stats[i].raw_read_time.eval  = 0;
     }
 
     last_load_train = calloc(config.EPOCHS, sizeof(uint32_t));
@@ -140,59 +127,67 @@ prepare_data()
     }
 
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        global_stats[i].load.train = (uint64_t *)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        global_stats[i].load.train = (uint64_t *)calloc(config.NUM_TRAIN_BATCHES_PER_RANK, sizeof(uint64_t));
         if (global_stats[i].load.train == NULL) {
             exit(1);
         }
-        global_stats[i].load.eval = (uint64_t *)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        global_stats[i].load.eval = (uint64_t *)calloc(config.NUM_EVAL_BATCHES_PER_RANK, sizeof(uint64_t));
         if (global_stats[i].load.eval == NULL) {
             exit(1);
         }
-        global_stats[i].proc.train = (uint64_t *)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        global_stats[i].proc.train = (uint64_t *)calloc(config.NUM_TRAIN_BATCHES_PER_RANK, sizeof(uint64_t));
         if (global_stats[i].proc.train == NULL) {
             exit(1);
         }
-        global_stats[i].proc.eval = (uint64_t *)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        global_stats[i].proc.eval = (uint64_t *)calloc(config.NUM_EVAL_BATCHES_PER_RANK, sizeof(uint64_t));
         if (global_stats[i].proc.eval == NULL) {
             exit(1);
         }
-        global_stats[i].compute.train = (uint64_t *)calloc(TRAIN_MAX_STEPS, sizeof(uint64_t));
+        global_stats[i].compute.train = (uint64_t *)calloc(config.NUM_TRAIN_BATCHES_PER_RANK, sizeof(uint64_t));
         if (global_stats[i].compute.train == NULL) {
             exit(1);
         }
-        global_stats[i].compute.eval = (uint64_t *)calloc(EVAL_MAX_STEPS, sizeof(uint64_t));
+        global_stats[i].compute.eval = (uint64_t *)calloc(config.NUM_EVAL_BATCHES_PER_RANK, sizeof(uint64_t));
         if (global_stats[i].compute.eval == NULL) {
             exit(1);
         }
 
-        MPI_Reduce(stats[i].load.train, global_stats[i].load.train, TRAIN_MAX_STEPS, MPI_UNSIGNED_LONG_LONG,
+        MPI_Reduce(stats[i].load.train, global_stats[i].load.train, config.NUM_TRAIN_BATCHES_PER_RANK, MPI_UNSIGNED_LONG_LONG,
                    MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(stats[i].load.eval, global_stats[i].load.eval, EVAL_MAX_STEPS, MPI_UNSIGNED_LONG_LONG,
+        MPI_Reduce(stats[i].load.eval, global_stats[i].load.eval, config.NUM_EVAL_BATCHES_PER_RANK, MPI_UNSIGNED_LONG_LONG,
                    MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(stats[i].proc.train, global_stats[i].proc.train, TRAIN_MAX_STEPS, MPI_UNSIGNED_LONG_LONG,
+        MPI_Reduce(stats[i].proc.train, global_stats[i].proc.train, config.NUM_TRAIN_BATCHES_PER_RANK, MPI_UNSIGNED_LONG_LONG,
                    MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(stats[i].proc.eval, global_stats[i].proc.eval, EVAL_MAX_STEPS, MPI_UNSIGNED_LONG_LONG,
+        MPI_Reduce(stats[i].proc.eval, global_stats[i].proc.eval, config.NUM_EVAL_BATCHES_PER_RANK, MPI_UNSIGNED_LONG_LONG,
                    MPI_SUM, 0, MPI_COMM_WORLD);
         MPI_Reduce(&stats[i].throughput.train, &global_stats[i].throughput.train, 1, MPI_DOUBLE, MPI_SUM, 0,
                    MPI_COMM_WORLD);
         MPI_Reduce(&stats[i].throughput.eval, &global_stats[i].throughput.eval, 1, MPI_DOUBLE, MPI_SUM, 0,
                    MPI_COMM_WORLD);
-        MPI_Reduce(stats[i].compute.train, global_stats[i].compute.train, TRAIN_MAX_STEPS,
+        MPI_Reduce(stats[i].compute.train, global_stats[i].compute.train, config.NUM_TRAIN_BATCHES_PER_RANK,
                    MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(stats[i].compute.eval, global_stats[i].compute.eval, EVAL_MAX_STEPS,
+        MPI_Reduce(stats[i].compute.eval, global_stats[i].compute.eval, config.NUM_EVAL_BATCHES_PER_RANK,
                    MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
         MPI_Reduce(&stats[i].observed_time.train, &global_stats[i].observed_time.train, 1,
                    MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
         MPI_Reduce(&stats[i].observed_time.eval, &global_stats[i].observed_time.eval, 1,
                    MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(&stats[i].metadata_time.train, &global_stats[i].metadata_time.train, 1,
+                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(&stats[i].metadata_time.eval, &global_stats[i].metadata_time.eval, 1,
+                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(&stats[i].raw_read_time.train, &global_stats[i].raw_read_time.train, 1,
+                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(&stats[i].raw_read_time.eval, &global_stats[i].raw_read_time.eval, 1,
+                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
 
-        for (int j = 0; j < TRAIN_MAX_STEPS; j++) {
+        for (int j = 0; j < config.NUM_TRAIN_BATCHES_PER_RANK; j++) {
             global_stats[i].load.train[j] /= NUM_RANKS;
             global_stats[i].proc.train[j] /= NUM_RANKS;
             global_stats[i].compute.train[j] /= NUM_RANKS;
         }
 
-        for (int j = 0; j < EVAL_MAX_STEPS; j++) {
+        for (int j = 0; j < config.NUM_EVAL_BATCHES_PER_RANK; j++) {
             global_stats[i].load.eval[j] /= NUM_RANKS;
             global_stats[i].proc.eval[j] /= NUM_RANKS;
             global_stats[i].compute.eval[j] /= NUM_RANKS;
@@ -202,12 +197,24 @@ prepare_data()
         global_stats[i].throughput.eval /= NUM_RANKS;
         global_stats[i].observed_time.train /= NUM_RANKS;
         global_stats[i].observed_time.eval /= NUM_RANKS;
+        global_stats[i].metadata_time.train /= NUM_RANKS;
+        global_stats[i].metadata_time.eval /= NUM_RANKS;
+        global_stats[i].raw_read_time.train /= NUM_RANKS;
+        global_stats[i].raw_read_time.eval /= NUM_RANKS;
+
+//        if (config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN > 0) {
+//            global_stats[i].metadata_time.train /= config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN;
+//            global_stats[i].raw_read_time.train /= config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN;
+//        }
+//        if (config.NUM_OF_ACTUALLY_USED_PROCESSES_EVAL > 0) {
+//            global_stats[i].metadata_time.eval /= config.NUM_OF_ACTUALLY_USED_PROCESSES_EVAL;
+//            global_stats[i].raw_read_time.eval /= config.NUM_OF_ACTUALLY_USED_PROCESSES_EVAL;
+//        }
     }
 }
 
 void
-print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eval_metadata_time,
-           uint64_t *eval_read_time)
+print_data()
 {
     printf("metric, value\n");
     printf("operation, dlio\n");
@@ -219,11 +226,17 @@ print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *e
     printf("collective data, %s\n", config.COLLECTIVE_DATA ? "YES" : "NO");
 
     // Train
-    printf("train compute time, \"");
+    // TODO: drop_last = false
+    uint64_t train_total_size_bytes = (uint64_t)config.NUM_TRAIN_BATCHES_PER_RANK * NUM_RANKS * config.BATCH_SIZE * config.RECORD_LENGTH;
+    printf("train total size, %" PRId64 "\n", train_total_size_bytes);
+    uint64_t train_size_bytes_per_rank = (uint64_t)config.NUM_TRAIN_BATCHES_PER_RANK * config.BATCH_SIZE * config.RECORD_LENGTH;
+    printf("train size per rank, %" PRId64 "\n", train_size_bytes_per_rank);
+
+    printf("train emulated compute time per epoch, \"");
     uint64_t train_total_compute_time = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         unsigned long int compute_time = 0;
-        for (uint32_t j = 0; j < TRAIN_MAX_STEPS; j++) {
+        for (uint32_t j = 0; j < config.NUM_TRAIN_BATCHES_PER_RANK; j++) {
             compute_time += global_stats[i].compute.train[j];
         }
         train_total_compute_time += compute_time;
@@ -231,20 +244,43 @@ print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *e
         if (i != config.EPOCHS - 1)
             printf(", ");
     }
-    printf("\"\ntrain total compute time, %lf\n", train_total_compute_time / 1000000.0);
+    printf("\"\ntrain emulated compute time, %lf\n", train_total_compute_time / 1000000.0);
 
-    // TODO: drop_last = False
-    uint64_t train_total_batches = (uint64_t)config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE /
-                                   config.BATCH_SIZE / NUM_RANKS * NUM_RANKS;
-    uint64_t train_total_size_bytes =
-        train_total_batches * config.BATCH_SIZE_EVAL * config.NUM_SAMPLES_PER_FILE * config.RECORD_LENGTH;
-    printf("train total size, %lu\n", train_total_size_bytes);
+    printf("train metadata time per epoch, \"");
+    double train_total_metadata_time = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        double metadata_time = stats[i].metadata_time.train / 1000000.0;
+        train_total_metadata_time += metadata_time;
+        printf("%lf", metadata_time);
+        if (i != config.EPOCHS - 1)
+            printf(", ");
+    }
+
+    printf("\"\ntrain metadata time, %lf\n", train_total_metadata_time);
 
-    printf("train total metadata time, %lf\n", *train_metadata_time / 1000000.0);
-    printf("train total raw read time, %lf\n", *train_read_time / 1000000.0);
-    printf("train total raw read rate, %lf\n", (double)train_total_size_bytes / *train_read_time * 1000000.0);
+    printf("train raw read time per epoch, \"");
+    double train_total_read_time = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        double read_time = stats[i].raw_read_time.train / 1000000.0;
+        train_total_read_time += read_time;
+        printf("%lf", read_time);
+        if (i != config.EPOCHS - 1)
+            printf(", ");
+    }
+    printf("\"\ntrain total raw read time, %lf\n", train_total_read_time);
 
-    printf("train observed time, \"");
+    printf("train raw read rate per epoch, \"");
+    double train_total_avg_read_rate = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        double read_rate = (double)train_size_bytes_per_rank / stats[i].raw_read_time.train * 1000000.0;
+        train_total_avg_read_rate += read_rate;
+        printf("%lf", read_rate);
+        if (i != config.EPOCHS - 1)
+            printf(", ");
+    }
+    printf("\"\ntrain avg raw read rate, %lf\n", train_total_avg_read_rate / config.EPOCHS);
+
+    printf("train observed time per epoch, \"");
     double train_total_observed_time = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         double observed_time = global_stats[i].observed_time.train / 1000000.0;
@@ -253,22 +289,25 @@ print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *e
         if (i != config.EPOCHS - 1)
             printf(", ");
     }
-    printf("\"\ntrain total observed time, %lf\n", train_total_observed_time);
+    printf("\"\ntrain observed time, %lf\n", train_total_observed_time);
 
-    printf("train observed rate, \"");
+    printf("train observed rate per epoch, \"");
+    double train_total_avg_observed_rate = 0.0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         unsigned long int compute_time = 0;
-        for (uint32_t j = 0; j < TRAIN_MAX_STEPS; j++) {
+        for (uint32_t j = 0; j < config.NUM_TRAIN_BATCHES_PER_RANK; j++) {
             compute_time += global_stats[i].compute.train[j];
         }
-        printf("%lf", (double)train_total_size_bytes / (global_stats[i].observed_time.train - compute_time) *
-                          1000000.0);
+        double observed_rate = (double)train_size_bytes_per_rank / (global_stats[i].observed_time.train - compute_time) *
+                               1000000.0;
+        train_total_avg_observed_rate += observed_rate;
+        printf("%lf", observed_rate);
         if (i != config.EPOCHS - 1)
             printf(", ");
     }
-    printf("\"\n");
+    printf("\"\ntrain avg observed rate, %lf\n", train_total_avg_observed_rate / config.EPOCHS);
 
-    printf("train throughput samples per second, \"");
+    printf("train throughput samples per second per epoch, \"");
     double train_throughput_mean_samples_per_second = 0.0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         train_throughput_mean_samples_per_second += global_stats[i].throughput.train;
@@ -278,7 +317,7 @@ print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *e
     }
     train_throughput_mean_samples_per_second =
         train_throughput_mean_samples_per_second / (double)config.EPOCHS;
-    printf("\"\ntrain throughput mean samples per second, %lf\n", train_throughput_mean_samples_per_second);
+    printf("\"\ntrain throughput avg samples per second, %lf\n", train_throughput_mean_samples_per_second);
 
     double train_throughput_stdev_samples_per_second = 0.0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
@@ -292,18 +331,24 @@ print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *e
 
     double train_io_mean_MB_per_second =
         train_throughput_mean_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
-    printf("train io mean MB per second, %lf\n", train_io_mean_MB_per_second);
+    printf("train io avg MB per second, %lf\n", train_io_mean_MB_per_second);
 
     double train_io_stdev_MB_per_second =
         train_throughput_stdev_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
     printf("train io stdev MB per second, %lf\n", train_io_stdev_MB_per_second);
 
     // Evaluation
-    printf("eval compute time, \"");
+    // TODO: drop_last = False
+    uint64_t eval_total_size_bytes = (uint64_t)config.NUM_EVAL_BATCHES_PER_RANK * NUM_RANKS * config.BATCH_SIZE_EVAL * config.RECORD_LENGTH;
+    printf("eval total size, %" PRId64 "\n", eval_total_size_bytes);
+    uint64_t eval_size_bytes_per_rank = (uint64_t)config.NUM_EVAL_BATCHES_PER_RANK * config.BATCH_SIZE_EVAL * config.RECORD_LENGTH;
+    printf("eval size per rank, %" PRId64 "\n", eval_size_bytes_per_rank);
+
+    printf("eval emulated compute time per epoch, \"");
     uint64_t eval_total_compute_time = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         unsigned long int compute_time = 0;
-        for (uint32_t j = 0; j < EVAL_MAX_STEPS; j++) {
+        for (uint32_t j = 0; j < config.NUM_EVAL_BATCHES_PER_RANK; j++) {
             compute_time += global_stats[i].compute.eval[j];
         }
         eval_total_compute_time += compute_time;
@@ -311,19 +356,43 @@ print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *e
         if (i != config.EPOCHS - 1)
             printf(", ");
     }
-    printf("\"\neval total compute time, %lf\n", eval_total_compute_time / 1000000.0);
-    // TODO: drop_last = False
-    uint64_t eval_total_batches = (uint64_t)config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE /
-                                  config.BATCH_SIZE_EVAL / NUM_RANKS * NUM_RANKS;
-    uint64_t eval_total_size_bytes =
-        eval_total_batches * config.BATCH_SIZE_EVAL * config.NUM_SAMPLES_PER_FILE * config.RECORD_LENGTH;
-    printf("eval total size, %lu\n", eval_total_size_bytes);
+    printf("\"\neval emulated compute time, %lf\n", eval_total_compute_time / 1000000.0);
+
+    printf("eval metadata time per epoch, \"");
+    double eval_total_metadata_time = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        double metadata_time = stats[i].metadata_time.eval / 1000000.0;
+        eval_total_metadata_time += metadata_time;
+        printf("%lf", metadata_time);
+        if (i != config.EPOCHS - 1)
+            printf(", ");
+    }
+
+    printf("\"\neval metadata time, %lf\n", eval_total_metadata_time);
 
-    printf("eval metadata time, %lf\n", *eval_metadata_time / 1000000.0);
-    printf("eval raw read time, %lf\n", *eval_read_time / 1000000.0);
-    printf("eval raw read rate, %lf\n", (double)eval_total_size_bytes / *eval_read_time * 1000000.0);
+    printf("eval raw read time per epoch, \"");
+    double eval_total_read_time = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        double read_time = stats[i].raw_read_time.eval / 1000000.0;
+        eval_total_read_time += read_time;
+        printf("%lf", read_time);
+        if (i != config.EPOCHS - 1)
+            printf(", ");
+    }
+    printf("\"\neval total raw read time, %lf\n", eval_total_read_time);
+
+    printf("eval raw read rate per epoch, \"");
+    double eval_total_avg_read_rate = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        double read_rate = (double)eval_size_bytes_per_rank / stats[i].raw_read_time.eval * 1000000.0;
+        eval_total_avg_read_rate += read_rate;
+        printf("%lf", read_rate);
+        if (i != config.EPOCHS - 1)
+            printf(", ");
+    }
+    printf("\"\neval avg raw read rate, %lf\n", eval_total_avg_read_rate / config.EPOCHS);
 
-    printf("eval observed time, \"");
+    printf("eval observed time per epoch, \"");
     double eval_total_observed_time = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         double observed_time = global_stats[i].observed_time.eval / 1000000.0;
@@ -332,22 +401,25 @@ print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *e
         if (i != config.EPOCHS - 1)
             printf(", ");
     }
-    printf("\"\neval total observed time, %lf\n", eval_total_observed_time);
+    printf("\"\neval observed time, %lf\n", eval_total_observed_time);
 
-    printf("eval observed rate, \"");
+    printf("eval observed rate per epoch, \"");
+    double eval_total_avg_observed_rate = 0.0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         unsigned long compute_time = 0;
-        for (uint32_t j = 0; j < EVAL_MAX_STEPS; j++) {
+        for (uint32_t j = 0; j < config.NUM_EVAL_BATCHES_PER_RANK; j++) {
             compute_time += global_stats[i].compute.eval[j];
         }
-        printf("%lf", (double)eval_total_size_bytes / (global_stats[i].observed_time.eval - compute_time) *
-                          1000000.0);
+        double observed_rate = (double)eval_size_bytes_per_rank / (global_stats[i].observed_time.eval - compute_time) *
+                               1000000.0;
+        eval_total_avg_observed_rate += observed_rate;
+        printf("%lf", observed_rate);
         if (i != config.EPOCHS - 1)
             printf(", ");
     }
-    printf("\"\n");
+    printf("\"\neval avg observed rate, %lf\n", eval_total_avg_observed_rate / config.EPOCHS);
 
-    printf("eval throughput samples per second, \"");
+    printf("eval throughput samples per second per epoch, \"");
     double eval_throughput_mean_samples_per_second = 0.0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         eval_throughput_mean_samples_per_second += global_stats[i].throughput.eval;
@@ -356,7 +428,7 @@ print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *e
             printf(", ");
     }
     eval_throughput_mean_samples_per_second = eval_throughput_mean_samples_per_second / (double)config.EPOCHS;
-    printf("\"\neval throughput mean samples per second, %lf\n", eval_throughput_mean_samples_per_second);
+    printf("\"\neval throughput avg samples per second, %lf\n", eval_throughput_mean_samples_per_second);
 
     double eval_throughput_stdev_samples_per_second = 0.0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
@@ -370,14 +442,11 @@ print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *e
 
     double eval_io_mean_MB_per_second =
         eval_throughput_mean_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
-    printf("eval io mean MB per second, %lf\n", eval_io_mean_MB_per_second);
+    printf("eval io avg MB per second, %lf\n", eval_io_mean_MB_per_second);
 
     double eval_io_stdev_MB_per_second =
         eval_throughput_stdev_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
     printf("eval io stdev MB per second, %lf\n", eval_io_stdev_MB_per_second);
-
-    // TODO: EVALUATION_TIME & PREPROCESS_TIME
-    printf("total compute time, %lf", (train_total_compute_time + eval_total_compute_time) / 1000000.0);
 }
 
 void
@@ -413,12 +482,14 @@ start_train(uint32_t epoch)
 }
 
 void
-end_train(uint32_t epoch)
+end_train(uint32_t epoch, uint64_t metadata_time, uint64_t read_time)
 {
     uint64_t end_time                = get_time_usec();
     stats[epoch].observed_time.train = end_time - stats[epoch].start_time.train;
     stats[epoch].throughput.train =
-        (double)TRAIN_MAX_STEPS * config.BATCH_SIZE * 1000000.0 / (end_time - stats[epoch].start_time.train);
+        (double)config.NUM_TRAIN_BATCHES_PER_RANK * config.BATCH_SIZE * 1000000.0 / (end_time - stats[epoch].start_time.train);
+    stats[epoch].metadata_time.train = metadata_time;
+    stats[epoch].raw_read_time.train = read_time;
 }
 
 void
@@ -428,10 +499,12 @@ start_eval(uint32_t epoch)
 }
 
 void
-end_eval(uint32_t epoch)
+end_eval(uint32_t epoch, uint64_t metadata_time, uint64_t read_time)
 {
     uint64_t end_time               = get_time_usec();
     stats[epoch].observed_time.eval = end_time - stats[epoch].start_time.eval;
-    stats[epoch].throughput.eval    = (double)EVAL_MAX_STEPS * config.BATCH_SIZE_EVAL * 1000000.0 /
+    stats[epoch].throughput.eval    = (double)config.NUM_EVAL_BATCHES_PER_RANK * config.BATCH_SIZE_EVAL * 1000000.0 /
                                    (end_time - stats[epoch].start_time.eval);
+    stats[epoch].metadata_time.eval = metadata_time;
+    stats[epoch].raw_read_time.eval = read_time;
 }
diff --git a/dlio/stats.h b/dlio/stats.h
index f0f3fd14..4ec59225 100644
--- a/dlio/stats.h
+++ b/dlio/stats.h
@@ -31,6 +31,17 @@ struct observed_time_data {
     uint64_t eval;
 };
 
+struct metadata_time_data {
+    uint64_t train;
+    uint64_t eval;
+};
+
+
+struct raw_read_time_data {
+    uint64_t train;
+    uint64_t eval;
+};
+
 typedef struct epoch_data {
     struct start_time_data    start_time;
     struct load_data          load;
@@ -38,6 +49,8 @@ typedef struct epoch_data {
     struct throughput_data    throughput;
     struct compute_data       compute;
     struct observed_time_data observed_time;
+    struct metadata_time_data metadata_time;
+    struct raw_read_time_data raw_read_time;
 } epoch_data_t;
 
 void stats_initialize();
@@ -46,8 +59,7 @@ void stats_finalize();
 
 void prepare_data();
 
-void print_data(uint64_t *train_metadata_time, uint64_t *train_read_time, uint64_t *eval_metadata_time,
-                uint64_t *eval_read_time);
+void print_data();
 
 void batch_loaded_train(uint32_t epoch, uint64_t start_time);
 
@@ -59,10 +71,10 @@ void batch_processed_eval(uint32_t epoch, uint64_t computation_time, uint64_t t0
 
 void start_train(uint32_t epoch);
 
-void end_train(uint32_t epoch);
+void end_train(uint32_t epoch, uint64_t metadata_time, uint64_t read_time);
 
 void start_eval(uint32_t epoch);
 
-void end_eval(uint32_t epoch);
+void end_eval(uint32_t epoch, uint64_t metadata_time, uint64_t read_time);
 
 #endif // SANDBOX_STATS_H
diff --git a/dlio/utils.c b/dlio/utils.c
index 47b5e938..d8549609 100644
--- a/dlio/utils.c
+++ b/dlio/utils.c
@@ -56,6 +56,7 @@ config_datatype_t config = {
                          //    .TRANSFER_SIZE = 262144,
     .PREPROCESS_TIME       = 0.0f,
     .PREPROCESS_TIME_STDEV = 0.000f,
+    .DROP_LAST             = true,
 
     // Train
     .EPOCHS                 = 5,
@@ -75,6 +76,12 @@ config_datatype_t config = {
     .VALID_DATA_FOLDER    = "valid",
     .RECORDS_DATASET_NAME = "records",
     .LABELS_DATASET_NAME  = "labels",
+
+    // Internal
+    .NUM_TRAIN_BATCHES_PER_RANK = 0,
+    .NUM_EVAL_BATCHES_PER_RANK = 0,
+    .NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN = 0,
+    .NUM_OF_ACTUALLY_USED_PROCESSES_EVAL = 0,
 };
 
 void
@@ -283,6 +290,9 @@ parse_args(int argc, char *argv[])
         else if (strcmp(argv[i], "--subfiling") == 0) {
             config.SUBFILING = true;
         }
+        else if (strcmp(argv[i], "--drop-last") == 0) {
+            config.DROP_LAST = true;
+        }
         else {
             printf("WARNING: %s not found\n", argv[i]);
         }
diff --git a/dlio/utils.h b/dlio/utils.h
index f2fcc338..720eb99b 100644
--- a/dlio/utils.h
+++ b/dlio/utils.h
@@ -47,6 +47,8 @@ typedef struct config_datatype {
                      //    unsigned int TRANSFER_SIZE;
     float PREPROCESS_TIME;
     float PREPROCESS_TIME_STDEV;
+    bool DROP_LAST;
+
     // Train
     uint32_t EPOCHS;
     float    COMPUTATION_TIME;
@@ -65,6 +67,12 @@ typedef struct config_datatype {
     char *VALID_DATA_FOLDER;
     char *RECORDS_DATASET_NAME;
     char *LABELS_DATASET_NAME;
+
+    // Internal
+    uint32_t NUM_TRAIN_BATCHES_PER_RANK;
+    uint32_t NUM_EVAL_BATCHES_PER_RANK;
+    uint32_t NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN;
+    uint32_t NUM_OF_ACTUALLY_USED_PROCESSES_EVAL;
 } config_datatype_t;
 
 extern config_datatype_t config;

From 40f127acf924fd65d5fd358d90bc3b08c871fa83 Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Tue, 9 Jul 2024 19:22:41 +0200
Subject: [PATCH 07/37] Add process logging and improve the display of
 benchmark results

---
 commons/h5bench_util.h |   1 +
 dlio/h5bench_dlio.c    | 108 ++++++---
 dlio/stats.c           | 502 ++++++++++++++++++++++++++++++-----------
 dlio/stats.h           |   3 +
 dlio/utils.c           |  10 +-
 dlio/utils.h           |   9 +-
 dlio/workers.h         |   1 +
 7 files changed, 454 insertions(+), 180 deletions(-)

diff --git a/commons/h5bench_util.h b/commons/h5bench_util.h
index 5fc8adb4..d787a274 100644
--- a/commons/h5bench_util.h
+++ b/commons/h5bench_util.h
@@ -7,6 +7,7 @@
 
 #ifndef COMMONS_H5BENCH_UTIL_H_
 #define COMMONS_H5BENCH_UTIL_H_
+#include <hdf5.h>
 
 #define DEBUG_PRINT                                                                                          \
     printf("%s:%d\n", __func__, __LINE__);                                                                   \
diff --git a/dlio/h5bench_dlio.c b/dlio/h5bench_dlio.c
index 33f1626d..c9c1ab01 100644
--- a/dlio/h5bench_dlio.c
+++ b/dlio/h5bench_dlio.c
@@ -1,17 +1,3 @@
-// TODO:
-// - Add logging
-// - Add vol-async support
-// - Add subfiling settings
-// - Add more DLIO features
-// - Add more data loaders: Tensorflow & dali
-// - Add prefetcher configuration?
-// - Add computation_threads only for Tensorflow
-// - Add file shuffle configuration
-// - Add more compression filters
-// - Add drop_last = False setting
-// - Replace fork() with MPI_Comm_spawn()
-// - Add Cache VOL connector support
-
 #include <assert.h>
 #include <hdf5.h>
 #include <math.h>
@@ -122,6 +108,12 @@ generate_file(const char *file_name, hid_t labels_filespace, hid_t labels_memspa
 void
 generate_data()
 {
+    if (MY_RANK == 0) {
+        printf("Starting data generation\n");
+        printf("Number of files for training dataset: %u\n", config.NUM_FILES_TRAIN);
+        printf("Number of files for evaluation dataset: %u\n", config.NUM_FILES_EVAL);
+    }
+
     hsize_t labels_dims[1]   = {config.NUM_SAMPLES_PER_FILE};
     hid_t   labels_filespace = H5Screate_simple(1, labels_dims, NULL);
     assert(labels_filespace >= 0);
@@ -145,8 +137,8 @@ generate_data()
     for (uint32_t i = from; i < config.NUM_FILES_TRAIN; i += increment) {
         srand(config.RANDOM_SEED + i);
 
-        if (!config.SUBFILING || config.SUBFILING && (MY_RANK == 0))
-            printf("Generate train file %u / %u\n", i + 1, config.NUM_FILES_TRAIN);
+//        if (!config.SUBFILING || config.SUBFILING && (MY_RANK == 0))
+//            printf("Generate train file %u / %u\n", i + 1, config.NUM_FILES_TRAIN);
         char file_name[256];
         snprintf(file_name, sizeof(file_name), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER,
                  config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, i + 1, config.NUM_FILES_TRAIN);
@@ -157,8 +149,8 @@ generate_data()
     for (uint32_t i = from; i < config.NUM_FILES_EVAL; i += increment) {
         srand(config.RANDOM_SEED + config.NUM_FILES_TRAIN + i);
 
-        if (!config.SUBFILING || config.SUBFILING && (MY_RANK == 0))
-            printf("Generate valid file %u / %u\n", i + 1, config.NUM_FILES_EVAL);
+//        if (!config.SUBFILING || config.SUBFILING && (MY_RANK == 0))
+//            printf("Generate valid file %u / %u\n", i + 1, config.NUM_FILES_EVAL);
         char file_name[256];
         snprintf(file_name, sizeof(file_name), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER,
                  config.VALID_DATA_FOLDER, config.FILE_PREFIX, i + 1, config.NUM_FILES_EVAL);
@@ -171,6 +163,10 @@ generate_data()
     H5Sclose(records_memspace);
     H5Sclose(extra_records_memspace);
     H5Sclose(records_filespace);
+
+    if (MY_RANK == 0) {
+        printf("Generation done\n");
+    }
 }
 
 void
@@ -179,13 +175,13 @@ read_sample(const char *file_path, uint32_t sample, uint64_t *metadata_time_out,
     hsize_t offset[3] = {sample, 0, 0};
     hsize_t count[3]  = {1, DIM, DIM};
 
-    uint64_t t1         = get_time_usec();
+    uint64_t t1         = get_time_usec_return_uint64();
     hid_t    file_id    = H5Fopen(file_path, H5F_ACC_RDONLY, FAPL);
     hid_t    dataset_id = H5Dopen(file_id, config.RECORDS_DATASET_NAME, DAPL);
     hid_t    filespace  = H5Dget_space(dataset_id);
     hid_t    memspace   = H5Screate_simple(3, count, NULL);
     H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, count, NULL);
-    uint64_t t2 = get_time_usec();
+    uint64_t t2 = get_time_usec_return_uint64();
     assert(file_id >= 0);
     assert(dataset_id >= 0);
     assert(filespace >= 0);
@@ -196,19 +192,19 @@ read_sample(const char *file_path, uint32_t sample, uint64_t *metadata_time_out,
         exit(1);
     }
 
-    uint64_t t3     = get_time_usec();
+    uint64_t t3     = get_time_usec_return_uint64();
     herr_t   status = H5Dread(dataset_id, H5T_STD_U8LE, memspace, filespace, DXPL, data);
-    uint64_t t4     = get_time_usec();
+    uint64_t t4     = get_time_usec_return_uint64();
     assert(status >= 0);
 
     free(data); // TODO: free memory only after compute() call?
 
-    uint64_t t5 = get_time_usec();
+    uint64_t t5 = get_time_usec_return_uint64();
     H5Sclose(memspace);
     H5Sclose(filespace);
     H5Dclose(dataset_id);
     H5Fclose(file_id);
-    uint64_t t6 = get_time_usec();
+    uint64_t t6 = get_time_usec_return_uint64();
 
     *metadata_time_out = (t2 - t1) + (t6 - t5);
     *read_time_out     = t4 - t3;
@@ -233,7 +229,7 @@ eval_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata
 {
     uint32_t offset = MY_RANK * config.NUM_EVAL_BATCHES_PER_RANK;
 
-    uint64_t t0 = get_time_usec();
+    uint64_t t0 = get_time_usec_return_uint64();
     for (uint32_t i = 0; i < config.NUM_EVAL_BATCHES_PER_RANK; i++) {
         for (uint32_t j = 0; j < config.BATCH_SIZE_EVAL; j++) {
             uint32_t file_num =
@@ -257,7 +253,7 @@ eval_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata
         batch_processed_eval(epoch, t, t0);
         MPI_Barrier(MPI_COMM_WORLD);
 
-        t0 = get_time_usec();
+        t0 = get_time_usec_return_uint64();
     }
 
     //    TODO: drop_data = False
@@ -284,7 +280,7 @@ eval_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata
     //            batch_processed_eval(epoch, t, t0);
     //            read_counter = 0;
     //
-    //            t0 = get_time_usec();
+    //            t0 = get_time_usec_return_uint64();
     //        }
     //    }
     //
@@ -311,7 +307,7 @@ eval_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *
 
     for (uint32_t i = config.READ_THREADS; i < config.NUM_EVAL_BATCHES_PER_RANK; i++) {
         execution_time_t data_from_child_process;
-        uint64_t         t0 = get_time_usec();
+        uint64_t         t0 = get_time_usec_return_uint64();
         read(get_eval_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
 
         batch_loaded_eval(epoch, t0);
@@ -330,7 +326,7 @@ eval_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *
     for (uint32_t i = 0;
          i < (config.READ_THREADS > config.NUM_EVAL_BATCHES_PER_RANK ? config.NUM_EVAL_BATCHES_PER_RANK : config.READ_THREADS); i++) {
         execution_time_t data_from_child_process;
-        uint64_t         t0 = get_time_usec();
+        uint64_t         t0 = get_time_usec_return_uint64();
         read(get_eval_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
 
         batch_loaded_eval(epoch, t0);
@@ -371,7 +367,7 @@ train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadat
 {
     uint32_t offset = MY_RANK * config.NUM_TRAIN_BATCHES_PER_RANK;
 
-    uint64_t t0 = get_time_usec();
+    uint64_t t0 = get_time_usec_return_uint64();
     for (uint32_t i = 0; i < config.NUM_TRAIN_BATCHES_PER_RANK; i++) {
         for (uint32_t j = 0; j < config.BATCH_SIZE; j++) {
             uint32_t file_num = indices[offset + i * config.BATCH_SIZE + j] / config.NUM_SAMPLES_PER_FILE + 1;
@@ -393,7 +389,7 @@ train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadat
         batch_processed_train(epoch, t, t0);
         MPI_Barrier(MPI_COMM_WORLD);
 
-        t0 = get_time_usec();
+        t0 = get_time_usec_return_uint64();
     }
 
     //    TODO: drop_data = True
@@ -420,7 +416,7 @@ train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadat
     //            batch_processed_train(epoch, t, t0);
     //
     //            read_counter = 0;
-    //            t0 = get_time_usec();
+    //            t0 = get_time_usec_return_uint64();
     //        }
     //    }
     //
@@ -445,7 +441,7 @@ train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t
 
     for (uint32_t i = config.READ_THREADS; i < config.NUM_TRAIN_BATCHES_PER_RANK; i++) {
         execution_time_t data_from_child_process;
-        uint64_t         t0 = get_time_usec();
+        uint64_t         t0 = get_time_usec_return_uint64();
         read(get_train_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
 
         batch_loaded_train(epoch, t0);
@@ -463,7 +459,7 @@ train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t
 
     for (uint32_t i = 0; i < config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN; i++) {
         execution_time_t data_from_child_process;
-        uint64_t         t0 = get_time_usec();
+        uint64_t         t0 = get_time_usec_return_uint64();
         read(get_train_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
 
         batch_loaded_train(epoch, t0);
@@ -528,12 +524,13 @@ run()
     MPI_Barrier(MPI_COMM_WORLD);
 
     for (uint32_t epoch = 0; epoch < config.EPOCHS; epoch++) {
-        if (MY_RANK == 0) printf("New Epoch %u\n", epoch + 1);
+        if (MY_RANK == 0) printf("Starting epoch %u...\n", epoch + 1);
 
         train(epoch, indices_train, enable_multiprocessing);
         MPI_Barrier(MPI_COMM_WORLD);
 
         if (config.DO_EVALUATION && (epoch + 1 >= next_eval_epoch)) {
+            if (MY_RANK == 0) printf("Starting evaluation...\n");
             eval(epoch, indices_eval, enable_multiprocessing);
             next_eval_epoch += config.EPOCHS_BETWEEN_EVALS;
             MPI_Barrier(MPI_COMM_WORLD);
@@ -552,6 +549,39 @@ run()
 void
 init_global_variables()
 {
+    // Validate the parameters that later code uses as loop bounds or divisors
+    // (e.g. sample indices are divided by NUM_SAMPLES_PER_FILE to pick a file).
+    // Every rank evaluates the checks so all of them stop together; only rank 0
+    // reports, to avoid duplicated messages.
+    bool config_is_valid = true;
+    if (config.EPOCHS == 0) {
+        if (MY_RANK == 0)
+            printf("The value of parameter \"epochs\" must be greater than 0\n");
+        config_is_valid = false;
+    }
+    if (config.NUM_SAMPLES_PER_FILE == 0) {
+        if (MY_RANK == 0)
+            printf("The value of parameter \"num-samples-per-file\" must be greater than 0\n");
+        config_is_valid = false;
+    }
+    if (config.BATCH_SIZE == 0) {
+        if (MY_RANK == 0)
+            printf("The value of parameter \"batch-size\" must be greater than 0\n");
+        config_is_valid = false;
+    }
+    if (config.BATCH_SIZE_EVAL == 0) {
+        if (MY_RANK == 0)
+            printf("The value of parameter \"batch-size-eval\" must be greater than 0\n");
+        config_is_valid = false;
+    }
+    if (!config_is_valid) {
+        // Flush and synchronize first so rank 0's diagnostics are emitted before
+        // any rank tears the job down. NOTE(review): assumes every rank parsed
+        // the same arguments and that MPI is already initialized here - confirm.
+        fflush(stdout);
+        MPI_Barrier(MPI_COMM_WORLD);
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
     DIM                  = (uint32_t)sqrt(config.RECORD_LENGTH);
     config.RECORD_LENGTH = DIM * DIM;
 
@@ -570,7 +582,6 @@ init_global_variables()
 
     srand(config.RANDOM_SEED);
 
-    // check if read_threads < batch size and print warning
     // drop last warning
 
 #ifndef HAVE_SUBFILING
@@ -583,6 +594,8 @@ init_global_variables()
     DXPL = H5Pcreate(H5P_DATASET_XFER);
 
     if (config.SUBFILING) {
+        if (MY_RANK == 0)
+            printf("Using Subfiling VFD\n");
         H5Pset_fapl_subfiling(FAPL, NULL);
         if (config.COLLECTIVE_DATA) {
             if (MY_RANK == 0)
@@ -602,11 +615,17 @@ init_global_variables()
         }
     }
     else if (config.DO_CHUNKING) {
+        if (MY_RANK == 0)
+            printf("Using chunking with the chunk shape (1, %u, %u)", chunk_dimension, chunk_dimension);
         hsize_t chunk_dims[3] = {1, chunk_dimension, chunk_dimension};
         H5Pset_chunk(DCPL, 3, chunk_dims);
         if (config.DO_COMPRESSION) {
+            if (MY_RANK == 0)
+                printf(" and compression (level %u)", config.COMPRESSION_LEVEL);
             H5Pset_deflate(DCPL, config.COMPRESSION_LEVEL);
         }
+        if (MY_RANK == 0)
+            printf("\n");
         if (config.COLLECTIVE_DATA) {
             if (MY_RANK == 0)
                 printf("Warning: Collective mode can't be used with subfiling\n");
@@ -616,20 +635,37 @@ init_global_variables()
     else {
         H5Pset_fapl_mpio(FAPL, MPI_COMM_SELF, MPI_INFO_NULL);
         if (config.COLLECTIVE_DATA) {
+            if (MY_RANK == 0)
+                printf("Using collective I/O mode\n");
             H5Pset_dxpl_mpio(DXPL, H5FD_MPIO_COLLECTIVE);
         }
         else {
+            if (MY_RANK == 0)
+                printf("Using independent I/O mode\n");
             H5Pset_dxpl_mpio(DXPL, H5FD_MPIO_INDEPENDENT);
         }
     }
 
 #if H5_VERSION_GE(1, 10, 0)
     if (config.COLLECTIVE_META) {
+        if (MY_RANK == 0)
+            printf("Using collective meta-data I/O mode\n");
         H5Pset_all_coll_metadata_ops(FAPL, true);
         H5Pset_coll_metadata_write(FAPL, true);
         H5Pset_all_coll_metadata_ops(DAPL, true);
     }
 #endif
+
+    if (MY_RANK == 0) {
+        printf("The number of training batches per rank: %u\n", config.NUM_TRAIN_BATCHES_PER_RANK);
+        if (config.READ_THREADS > config.NUM_TRAIN_BATCHES_PER_RANK) {
+            printf("Warning: The number of requested read threads (%u) is greater than the number of training batches per rank (%u)!\n", config.READ_THREADS, config.NUM_TRAIN_BATCHES_PER_RANK);
+        }
+        printf("The number of evaluation batches per rank: %u\n", config.NUM_EVAL_BATCHES_PER_RANK);
+        if (config.READ_THREADS > config.NUM_EVAL_BATCHES_PER_RANK) {
+            printf("Warning: The number of requested read threads (%u) is greater than the number of evaluation batches per rank (%u)!\n", config.READ_THREADS, config.NUM_EVAL_BATCHES_PER_RANK);
+        }
+    }
 }
 
 int
diff --git a/dlio/stats.c b/dlio/stats.c
index 0ce2bcae..7cd52be2 100644
--- a/dlio/stats.c
+++ b/dlio/stats.c
@@ -4,6 +4,7 @@
 #include <stdlib.h>
 #include <math.h>
 
+#include "../commons/h5bench_util.h"
 #include "h5bench_dlio.h"
 #include "stats.h"
 #include "utils.h"
@@ -216,108 +217,70 @@ prepare_data()
 void
 print_data()
 {
-    printf("metric, value\n");
-    printf("operation, dlio\n");
-    printf("ranks, %d\n", NUM_RANKS);
-    printf("read threads, %d\n", config.READ_THREADS);
-    printf("subfiling, %s\n", config.SUBFILING ? "YES" : "NO");
-    printf("chunking, %s\n", config.DO_CHUNKING ? "YES" : "NO");
-    printf("collective meta, %s\n", config.COLLECTIVE_META ? "YES" : "NO");
-    printf("collective data, %s\n", config.COLLECTIVE_DATA ? "YES" : "NO");
-
     // Train
     // TODO: drop_last = false
     uint64_t train_total_size_bytes = (uint64_t)config.NUM_TRAIN_BATCHES_PER_RANK * NUM_RANKS * config.BATCH_SIZE * config.RECORD_LENGTH;
-    printf("train total size, %" PRId64 "\n", train_total_size_bytes);
     uint64_t train_size_bytes_per_rank = (uint64_t)config.NUM_TRAIN_BATCHES_PER_RANK * config.BATCH_SIZE * config.RECORD_LENGTH;
-    printf("train size per rank, %" PRId64 "\n", train_size_bytes_per_rank);
 
-    printf("train emulated compute time per epoch, \"");
     uint64_t train_total_compute_time = 0;
+    uint64_t *train_compute_time_per_epoch = (uint64_t *)malloc(config.EPOCHS * sizeof(uint64_t));
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         unsigned long int compute_time = 0;
         for (uint32_t j = 0; j < config.NUM_TRAIN_BATCHES_PER_RANK; j++) {
             compute_time += global_stats[i].compute.train[j];
         }
         train_total_compute_time += compute_time;
-        printf("%lf", compute_time / 1000000.0);
-        if (i != config.EPOCHS - 1)
-            printf(", ");
+        train_compute_time_per_epoch[i] = compute_time;
+
     }
-    printf("\"\ntrain emulated compute time, %lf\n", train_total_compute_time / 1000000.0);
 
-    printf("train metadata time per epoch, \"");
-    double train_total_metadata_time = 0;
+    uint64_t train_total_metadata_time = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        double metadata_time = stats[i].metadata_time.train / 1000000.0;
-        train_total_metadata_time += metadata_time;
-        printf("%lf", metadata_time);
-        if (i != config.EPOCHS - 1)
-            printf(", ");
+        train_total_metadata_time += stats[i].metadata_time.train;
     }
 
-    printf("\"\ntrain metadata time, %lf\n", train_total_metadata_time);
-
-    printf("train raw read time per epoch, \"");
-    double train_total_read_time = 0;
+    uint64_t train_total_read_time = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        double read_time = stats[i].raw_read_time.train / 1000000.0;
-        train_total_read_time += read_time;
-        printf("%lf", read_time);
-        if (i != config.EPOCHS - 1)
-            printf(", ");
+        train_total_read_time += stats[i].raw_read_time.train;
     }
-    printf("\"\ntrain total raw read time, %lf\n", train_total_read_time);
 
-    printf("train raw read rate per epoch, \"");
     double train_total_avg_read_rate = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        double read_rate = (double)train_size_bytes_per_rank / stats[i].raw_read_time.train * 1000000.0;
-        train_total_avg_read_rate += read_rate;
-        printf("%lf", read_rate);
-        if (i != config.EPOCHS - 1)
-            printf(", ");
+        if (stats[i].raw_read_time.train == 0) {
+            continue;
+        }
+        train_total_avg_read_rate += (double)train_size_bytes_per_rank / stats[i].raw_read_time.train * 1000000.0;
     }
-    printf("\"\ntrain avg raw read rate, %lf\n", train_total_avg_read_rate / config.EPOCHS);
+    train_total_avg_read_rate /= config.EPOCHS;
 
-    printf("train observed time per epoch, \"");
-    double train_total_observed_time = 0;
+    uint64_t train_total_observed_time = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        double observed_time = global_stats[i].observed_time.train / 1000000.0;
-        train_total_observed_time += observed_time;
-        printf("%lf", observed_time);
-        if (i != config.EPOCHS - 1)
-            printf(", ");
+        train_total_observed_time += global_stats[i].observed_time.train;
     }
-    printf("\"\ntrain observed time, %lf\n", train_total_observed_time);
 
-    printf("train observed rate per epoch, \"");
     double train_total_avg_observed_rate = 0.0;
+    double *train_avg_observed_rate_per_epoch = (double *)malloc(config.EPOCHS * sizeof(double));
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         unsigned long int compute_time = 0;
         for (uint32_t j = 0; j < config.NUM_TRAIN_BATCHES_PER_RANK; j++) {
             compute_time += global_stats[i].compute.train[j];
         }
-        double observed_rate = (double)train_size_bytes_per_rank / (global_stats[i].observed_time.train - compute_time) *
+        if ((global_stats[i].observed_time.train - compute_time) == 0) {
+            train_avg_observed_rate_per_epoch[i] = NAN;
+            continue;
+        }
+        train_avg_observed_rate_per_epoch[i] = (double)train_size_bytes_per_rank / (global_stats[i].observed_time.train - compute_time) *
                                1000000.0;
-        train_total_avg_observed_rate += observed_rate;
-        printf("%lf", observed_rate);
-        if (i != config.EPOCHS - 1)
-            printf(", ");
+        train_total_avg_observed_rate += train_avg_observed_rate_per_epoch[i];
     }
-    printf("\"\ntrain avg observed rate, %lf\n", train_total_avg_observed_rate / config.EPOCHS);
+    train_total_avg_observed_rate /= config.EPOCHS;
 
-    printf("train throughput samples per second per epoch, \"");
     double train_throughput_mean_samples_per_second = 0.0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         train_throughput_mean_samples_per_second += global_stats[i].throughput.train;
-        printf("%lf", global_stats[i].throughput.train);
-        if (i != config.EPOCHS - 1)
-            printf(", ");
     }
     train_throughput_mean_samples_per_second =
         train_throughput_mean_samples_per_second / (double)config.EPOCHS;
-    printf("\"\ntrain throughput avg samples per second, %lf\n", train_throughput_mean_samples_per_second);
 
     double train_throughput_stdev_samples_per_second = 0.0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
@@ -327,108 +290,75 @@ print_data()
     }
     train_throughput_stdev_samples_per_second =
         sqrt(train_throughput_stdev_samples_per_second / (double)config.EPOCHS);
-    printf("train throughput stdev samples per second, %lf\n", train_throughput_stdev_samples_per_second);
 
-    double train_io_mean_MB_per_second =
-        train_throughput_mean_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
-    printf("train io avg MB per second, %lf\n", train_io_mean_MB_per_second);
+    double train_io_mean =
+        train_throughput_mean_samples_per_second * config.RECORD_LENGTH;
 
-    double train_io_stdev_MB_per_second =
-        train_throughput_stdev_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
-    printf("train io stdev MB per second, %lf\n", train_io_stdev_MB_per_second);
+    double train_io_stdev =
+        train_throughput_stdev_samples_per_second * config.RECORD_LENGTH;
 
     // Evaluation
     // TODO: drop_last = False
     uint64_t eval_total_size_bytes = (uint64_t)config.NUM_EVAL_BATCHES_PER_RANK * NUM_RANKS * config.BATCH_SIZE_EVAL * config.RECORD_LENGTH;
-    printf("eval total size, %" PRId64 "\n", eval_total_size_bytes);
     uint64_t eval_size_bytes_per_rank = (uint64_t)config.NUM_EVAL_BATCHES_PER_RANK * config.BATCH_SIZE_EVAL * config.RECORD_LENGTH;
-    printf("eval size per rank, %" PRId64 "\n", eval_size_bytes_per_rank);
 
-    printf("eval emulated compute time per epoch, \"");
     uint64_t eval_total_compute_time = 0;
+    uint64_t *eval_compute_time_per_epoch = (uint64_t *)malloc(config.EPOCHS * sizeof(uint64_t));
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         unsigned long int compute_time = 0;
         for (uint32_t j = 0; j < config.NUM_EVAL_BATCHES_PER_RANK; j++) {
             compute_time += global_stats[i].compute.eval[j];
         }
+        eval_compute_time_per_epoch[i] = compute_time;
         eval_total_compute_time += compute_time;
-        printf("%lf", compute_time / 1000000.0);
-        if (i != config.EPOCHS - 1)
-            printf(", ");
     }
-    printf("\"\neval emulated compute time, %lf\n", eval_total_compute_time / 1000000.0);
 
-    printf("eval metadata time per epoch, \"");
-    double eval_total_metadata_time = 0;
+    uint64_t eval_total_metadata_time = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        double metadata_time = stats[i].metadata_time.eval / 1000000.0;
-        eval_total_metadata_time += metadata_time;
-        printf("%lf", metadata_time);
-        if (i != config.EPOCHS - 1)
-            printf(", ");
+        eval_total_metadata_time += stats[i].metadata_time.eval;
     }
 
-    printf("\"\neval metadata time, %lf\n", eval_total_metadata_time);
-
-    printf("eval raw read time per epoch, \"");
-    double eval_total_read_time = 0;
+    uint64_t eval_total_read_time = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        double read_time = stats[i].raw_read_time.eval / 1000000.0;
-        eval_total_read_time += read_time;
-        printf("%lf", read_time);
-        if (i != config.EPOCHS - 1)
-            printf(", ");
+        eval_total_read_time += stats[i].raw_read_time.eval;
     }
-    printf("\"\neval total raw read time, %lf\n", eval_total_read_time);
 
-    printf("eval raw read rate per epoch, \"");
     double eval_total_avg_read_rate = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        double read_rate = (double)eval_size_bytes_per_rank / stats[i].raw_read_time.eval * 1000000.0;
-        eval_total_avg_read_rate += read_rate;
-        printf("%lf", read_rate);
-        if (i != config.EPOCHS - 1)
-            printf(", ");
+        if (stats[i].raw_read_time.eval == 0) {
+            continue;
+        }
+        eval_total_avg_read_rate += (double)eval_size_bytes_per_rank / stats[i].raw_read_time.eval * 1000000.0;
     }
-    printf("\"\neval avg raw read rate, %lf\n", eval_total_avg_read_rate / config.EPOCHS);
+    eval_total_avg_read_rate /= config.EPOCHS;
 
-    printf("eval observed time per epoch, \"");
-    double eval_total_observed_time = 0;
+    uint64_t eval_total_observed_time = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        double observed_time = global_stats[i].observed_time.eval / 1000000.0;
-        eval_total_observed_time += observed_time;
-        printf("%lf", observed_time);
-        if (i != config.EPOCHS - 1)
-            printf(", ");
+        eval_total_observed_time += global_stats[i].observed_time.eval;
     }
-    printf("\"\neval observed time, %lf\n", eval_total_observed_time);
 
-    printf("eval observed rate per epoch, \"");
     double eval_total_avg_observed_rate = 0.0;
+    double *eval_avg_observed_rate_per_epoch = (double *)malloc(config.EPOCHS * sizeof(double));
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         unsigned long compute_time = 0;
         for (uint32_t j = 0; j < config.NUM_EVAL_BATCHES_PER_RANK; j++) {
             compute_time += global_stats[i].compute.eval[j];
         }
-        double observed_rate = (double)eval_size_bytes_per_rank / (global_stats[i].observed_time.eval - compute_time) *
-                               1000000.0;
-        eval_total_avg_observed_rate += observed_rate;
-        printf("%lf", observed_rate);
-        if (i != config.EPOCHS - 1)
-            printf(", ");
+        if ((global_stats[i].observed_time.eval - compute_time) == 0) {
+            eval_avg_observed_rate_per_epoch[i] = NAN;
+            continue;
+        }
+        eval_avg_observed_rate_per_epoch[i] = (double)eval_size_bytes_per_rank / (global_stats[i].observed_time.eval - compute_time) *
+                                              1000000.0;
+        eval_total_avg_observed_rate += eval_avg_observed_rate_per_epoch[i];
     }
-    printf("\"\neval avg observed rate, %lf\n", eval_total_avg_observed_rate / config.EPOCHS);
+    eval_total_avg_observed_rate /= config.EPOCHS;
 
-    printf("eval throughput samples per second per epoch, \"");
     double eval_throughput_mean_samples_per_second = 0.0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         eval_throughput_mean_samples_per_second += global_stats[i].throughput.eval;
-        printf("%lf", global_stats[i].throughput.eval);
-        if (i != config.EPOCHS - 1)
-            printf(", ");
     }
     eval_throughput_mean_samples_per_second = eval_throughput_mean_samples_per_second / (double)config.EPOCHS;
-    printf("\"\neval throughput avg samples per second, %lf\n", eval_throughput_mean_samples_per_second);
 
     double eval_throughput_stdev_samples_per_second = 0.0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
@@ -438,56 +368,348 @@ print_data()
     }
     eval_throughput_stdev_samples_per_second =
         sqrt(eval_throughput_stdev_samples_per_second / (double)config.EPOCHS);
-    printf("eval throughput stdev samples per second, %lf\n", eval_throughput_stdev_samples_per_second);
 
-    double eval_io_mean_MB_per_second =
-        eval_throughput_mean_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
-    printf("eval io avg MB per second, %lf\n", eval_io_mean_MB_per_second);
+    double eval_io_mean =
+        eval_throughput_mean_samples_per_second * config.RECORD_LENGTH;
+
+    double eval_io_stdev =
+        eval_throughput_stdev_samples_per_second * config.RECORD_LENGTH;
+
+    human_readable value;
+
+    printf("\n=================== Performance Results ==================\n");
+    printf("Total number of ranks: %d\n", NUM_RANKS);
+    printf("The number of read threads per rank: %d\n", config.READ_THREADS);
+
+    value = format_human_readable(train_total_size_bytes);
+    printf("Total training set size: %.3lf %cB\n", value.value, value.unit);
+    value = format_human_readable(train_size_bytes_per_rank);
+    printf("Training set size per rank: %.3lf %cB\n", value.value, value.unit);
+    printf("Total training emulated compute time: %.3lf s\n", train_total_compute_time / 1000000.0);
+    printf("Training metadata time: %.3lf s\n", train_total_metadata_time / 1000000.0);
+    printf("Training raw read time: %.3lf s\n", train_total_read_time / 1000000.0);
+    value = format_human_readable(train_total_avg_read_rate);
+    printf("Training average raw read rate: %.3f %cB/s\n", value.value, value.unit);
+    printf("Observed training completion time: %.3lf s\n", train_total_observed_time / 1000000.0);
+    value = format_human_readable(train_total_avg_observed_rate);
+    printf("Observed average training rate: %.3f %cB/s\n", value.value, value.unit);
+    printf("Training average throughput: %.3lf samples/s\n", train_throughput_mean_samples_per_second);
+    printf("Training throughput standard deviation: %.3lf samples/s\n", train_throughput_stdev_samples_per_second);
+    value = format_human_readable(train_io_mean);
+    printf("Training average IO: %.3f %cB/s\n", value.value, value.unit);
+    value = format_human_readable(train_io_stdev);
+    printf("Training IO standard deviation: %.3f %cB/s\n", value.value, value.unit);
+
+    value = format_human_readable(eval_total_size_bytes);
+    printf("Total evaluation set size: %.3lf %cB\n", value.value, value.unit);
+    value = format_human_readable(eval_size_bytes_per_rank);
+    printf("Evaluation set size per rank: %.3lf %cB\n", value.value, value.unit);
+    printf("Total evaluation emulated compute time: %.3lf s\n", eval_total_compute_time / 1000000.0);
+    printf("Evaluation metadata time: %.3lf s\n", eval_total_metadata_time / 1000000.0);
+    printf("Evaluation raw read time: %.3lf s\n", eval_total_read_time / 1000000.0);
+    value = format_human_readable(eval_total_avg_read_rate);
+    printf("Evaluation average raw read rate: %.3lf %cB/s\n", value.value, value.unit);
+    printf("Observed evaluation completion time: %.3lf s\n", eval_total_observed_time / 1000000.0);
+    value = format_human_readable(eval_total_avg_observed_rate);
+    printf("Observed average evaluation rate: %.3lf %cB/s\n", value.value, value.unit);
+    printf("Evaluation average throughput avg: %.3lf samples/s\n", eval_throughput_mean_samples_per_second);
+    printf("Evaluation throughput standard deviation: %.3lf samples/s\n", eval_throughput_stdev_samples_per_second);
+    value = format_human_readable(eval_io_mean);
+    printf("Evaluation average IO: %.3lf %cB/s\n", value.value, value.unit);
+    value = format_human_readable(eval_io_stdev);
+    printf("Evaluation IO standard deviation: %.3lf %cB/s\n", value.value, value.unit);
+
+    printf("===========================================================\n");
+
+    FILE *csv_file = fopen(config.CSV_FILE, "w+");
+
+    char* units = (char *)malloc(config.EPOCHS * sizeof(char));
+
+    fprintf(csv_file, "metric, value, unit\n");
+    fprintf(csv_file, "operation, dlio,\n");
+    fprintf(csv_file, "ranks, %d,\n", NUM_RANKS);
+    fprintf(csv_file, "read threads, %d,\n", config.READ_THREADS);
+    fprintf(csv_file, "subfiling, %s,\n", config.SUBFILING ? "YES" : "NO");
+    fprintf(csv_file, "chunking, %s,\n", config.DO_CHUNKING ? "YES" : "NO");
+    fprintf(csv_file, "collective meta, %s,\n", config.COLLECTIVE_META ? "YES" : "NO");
+    fprintf(csv_file, "collective data, %s,\n", config.COLLECTIVE_DATA ? "YES" : "NO");
+
+    value = format_human_readable(train_total_size_bytes);
+    fprintf(csv_file, "train total size, %.3lf, %cB\n", value.value, value.unit);
+    value = format_human_readable(train_size_bytes_per_rank);
+    fprintf(csv_file, "train size per rank, %.3lf, %cB\n", value.value, value.unit);
+    fprintf(csv_file, "train emulated compute time per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", train_compute_time_per_epoch[i] / 1000000.0);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\ntrain emulated compute time, %.3lf, s\n", train_total_compute_time / 1000000.0);
+    fprintf(csv_file, "train metadata time per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", stats[i].metadata_time.train / 1000000.0);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\ntrain metadata time, %.3lf, s\n", train_total_metadata_time / 1000000.0);
+    fprintf(csv_file, "train raw read time per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", stats[i].raw_read_time.train / 1000000.0);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\ntrain total raw read time, %.3lf, s\n", train_total_read_time / 1000000.0);
+    fprintf(csv_file, "train raw read rate per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        if (stats[i].raw_read_time.train == 0) {
+            units[i] = ' ';
+            fprintf(csv_file, "NaN");
+        } else {
+            value = format_human_readable((double)train_size_bytes_per_rank / stats[i].raw_read_time.train * 1000000.0);
+            units[i] = value.unit;
+            fprintf(csv_file, "%.3lf", value.value);
+        }
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%cB/s", units[i]);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    value = format_human_readable(train_total_avg_read_rate);
+    fprintf(csv_file, "\"\ntrain avg raw read rate, %.3lf, %cB/s\n", value.value, value.unit);
+    fprintf(csv_file, "train observed time per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", global_stats[i].observed_time.train / 1000000.0);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\ntrain observed time, %.3lf, s\n", train_total_observed_time / 1000000.0);
+    fprintf(csv_file, "train observed rate per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        value = format_human_readable(train_avg_observed_rate_per_epoch[i]);
+        units[i] = value.unit;
+        fprintf(csv_file, "%.3lf", value.value);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%cB/s", units[i]);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    value = format_human_readable(train_total_avg_observed_rate);
+    fprintf(csv_file, "\"\ntrain avg observed rate, %.3lf, %cB/s\n", value.value, value.unit);
+    fprintf(csv_file, "train throughput samples per second per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", global_stats[i].throughput.train);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "samples/s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\ntrain throughput avg samples per second, %.3lf, samples/s\n", train_throughput_mean_samples_per_second);
+    fprintf(csv_file, "train throughput stdev samples per second, %.3lf, samples/s\n", train_throughput_stdev_samples_per_second);
+    value = format_human_readable(train_io_mean);
+    fprintf(csv_file, "train io avg, %.3lf, %cB/s\n", value.value, value.unit);
+    value = format_human_readable(train_io_stdev);
+    fprintf(csv_file, "train io stdev, %.3lf, %cB/s\n", value.value, value.unit);
+
+    value = format_human_readable(eval_total_size_bytes);
+    fprintf(csv_file, "eval total size, %.3lf, %cB\n", value.value, value.unit);
+    value = format_human_readable(eval_size_bytes_per_rank);
+    fprintf(csv_file, "eval size per rank, %.3lf, %cB\n", value.value, value.unit);
+    fprintf(csv_file, "eval emulated compute time per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", eval_compute_time_per_epoch[i] / 1000000.0);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\neval emulated compute time, %.3lf, s\n", eval_total_compute_time / 1000000.0);
+    fprintf(csv_file, "eval metadata time per epoch, \"");
+        for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", stats[i].metadata_time.eval / 1000000.0);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\neval metadata time, %.3lf, s\n", eval_total_metadata_time / 1000000.0);
+    fprintf(csv_file, "eval raw read time per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", stats[i].raw_read_time.eval / 1000000.0);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
 
-    double eval_io_stdev_MB_per_second =
-        eval_throughput_stdev_samples_per_second * config.RECORD_LENGTH / 1024 / 1024;
-    printf("eval io stdev MB per second, %lf\n", eval_io_stdev_MB_per_second);
+    fprintf(csv_file, "\"\neval total raw read time, %.3lf, s\n", eval_total_read_time / 1000000.0);
+    fprintf(csv_file, "eval raw read rate per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        if (stats[i].raw_read_time.eval == 0) {
+            units[i] = ' ';
+            fprintf(csv_file, "NaN");
+        } else {
+            value = format_human_readable((double)eval_size_bytes_per_rank / stats[i].raw_read_time.eval * 1000000.0); // cast first so the quotient is not truncated by integer division
+            units[i] = value.unit;
+            fprintf(csv_file, "%.3lf", value.value);
+        }
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%cB/s", units[i]);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    value = format_human_readable(eval_total_avg_read_rate);
+    fprintf(csv_file, "\"\neval avg raw read rate, %.3lf, %cB/s\n", value.value, value.unit);
+    fprintf(csv_file, "eval observed time per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", global_stats[i].observed_time.eval / 1000000.0);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\neval observed time, %.3lf, s\n", eval_total_observed_time / 1000000.0);
+    fprintf(csv_file, "eval observed rate per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        value = format_human_readable(eval_avg_observed_rate_per_epoch[i]);
+        units[i] = value.unit;
+        fprintf(csv_file, "%.3lf", value.value);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%cB/s", units[i]);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    value = format_human_readable(eval_total_avg_observed_rate);
+    fprintf(csv_file, "\"\neval avg observed rate, %.3lf, %cB/s\n", value.value, value.unit);
+    fprintf(csv_file, "eval throughput samples per second per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", global_stats[i].throughput.eval);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "samples/s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\neval throughput avg samples per second, %.3lf, samples/s\n", eval_throughput_mean_samples_per_second);
+    fprintf(csv_file, "eval throughput stdev samples per second, %.3lf, samples/s\n", eval_throughput_stdev_samples_per_second);
+    value = format_human_readable(eval_io_mean);
+    fprintf(csv_file, "eval io avg, %.3lf, %cB/s\n", value.value, value.unit);
+    value = format_human_readable(eval_io_stdev);
+    fprintf(csv_file, "eval io stdev, %.3lf, %cB/s\n", value.value, value.unit);
+
+    fclose(csv_file);
+    free(units);
+    free(train_compute_time_per_epoch);
+    free(eval_compute_time_per_epoch);
+    free(train_avg_observed_rate_per_epoch);
+    free(eval_avg_observed_rate_per_epoch);
 }
 
 void
 batch_loaded_train(uint32_t epoch, uint64_t t0)
 {
-    stats[epoch].load.train[last_load_train[epoch]++] = (get_time_usec() - t0);
+    stats[epoch].load.train[last_load_train[epoch]++] = (get_time_usec_return_uint64() - t0);
 }
 
 void
 batch_processed_train(uint32_t epoch, uint64_t computation_time, uint64_t t0)
 {
-    stats[epoch].proc.train[last_proc_train[epoch]++]       = (get_time_usec() - t0);
+    stats[epoch].proc.train[last_proc_train[epoch]++]       = (get_time_usec_return_uint64() - t0);
     stats[epoch].compute.train[last_compute_train[epoch]++] = computation_time;
 }
 
 void
 batch_loaded_eval(uint32_t epoch, uint64_t t0)
 {
-    stats[epoch].load.eval[last_load_eval[epoch]++] = (get_time_usec() - t0);
+    stats[epoch].load.eval[last_load_eval[epoch]++] = (get_time_usec_return_uint64() - t0);
 }
 
 void
 batch_processed_eval(uint32_t epoch, uint64_t computation_time, uint64_t t0)
 {
-    stats[epoch].proc.eval[last_proc_eval[epoch]++]       = (get_time_usec() - t0);
+    stats[epoch].proc.eval[last_proc_eval[epoch]++]       = (get_time_usec_return_uint64() - t0);
     stats[epoch].compute.eval[last_compute_eval[epoch]++] = computation_time;
 }
 
 void
 start_train(uint32_t epoch)
 {
-    stats[epoch].start_time.train = get_time_usec();
+    stats[epoch].start_time.train = get_time_usec_return_uint64();
 }
 
 void
 end_train(uint32_t epoch, uint64_t metadata_time, uint64_t read_time)
 {
-    uint64_t end_time                = get_time_usec();
+    uint64_t end_time                = get_time_usec_return_uint64();
     stats[epoch].observed_time.train = end_time - stats[epoch].start_time.train;
-    stats[epoch].throughput.train =
-        (double)config.NUM_TRAIN_BATCHES_PER_RANK * config.BATCH_SIZE * 1000000.0 / (end_time - stats[epoch].start_time.train);
+    if ((end_time - stats[epoch].start_time.train) == 0) {
+        stats[epoch].throughput.train = NAN;
+    } else {
+        stats[epoch].throughput.train =
+            (double)config.NUM_TRAIN_BATCHES_PER_RANK * config.BATCH_SIZE * 1000000.0 / (end_time - stats[epoch].start_time.train);
+    }
     stats[epoch].metadata_time.train = metadata_time;
     stats[epoch].raw_read_time.train = read_time;
 }
@@ -495,16 +717,20 @@ end_train(uint32_t epoch, uint64_t metadata_time, uint64_t read_time)
 void
 start_eval(uint32_t epoch)
 {
-    stats[epoch].start_time.eval = get_time_usec();
+    stats[epoch].start_time.eval = get_time_usec_return_uint64();
 }
 
 void
 end_eval(uint32_t epoch, uint64_t metadata_time, uint64_t read_time)
 {
-    uint64_t end_time               = get_time_usec();
+    uint64_t end_time               = get_time_usec_return_uint64();
     stats[epoch].observed_time.eval = end_time - stats[epoch].start_time.eval;
-    stats[epoch].throughput.eval    = (double)config.NUM_EVAL_BATCHES_PER_RANK * config.BATCH_SIZE_EVAL * 1000000.0 /
-                                   (end_time - stats[epoch].start_time.eval);
+    if ((end_time - stats[epoch].start_time.eval) == 0) {
+        stats[epoch].throughput.eval = NAN;
+    } else {
+        stats[epoch].throughput.eval    = (double)config.NUM_EVAL_BATCHES_PER_RANK * config.BATCH_SIZE_EVAL * 1000000.0 /
+                                          (end_time - stats[epoch].start_time.eval);
+    }
     stats[epoch].metadata_time.eval = metadata_time;
     stats[epoch].raw_read_time.eval = read_time;
 }
diff --git a/dlio/stats.h b/dlio/stats.h
index 4ec59225..df41cfb0 100644
--- a/dlio/stats.h
+++ b/dlio/stats.h
@@ -1,6 +1,9 @@
 #ifndef SANDBOX_STATS_H
 #define SANDBOX_STATS_H
 
+#include <stdint.h>
+#include <stdio.h>
+
 struct load_data {
     uint64_t *train;
     uint64_t *eval;
diff --git a/dlio/utils.c b/dlio/utils.c
index d8549609..f5e3d768 100644
--- a/dlio/utils.c
+++ b/dlio/utils.c
@@ -12,7 +12,7 @@
 #include "utils.h"
 
 uint64_t
-get_time_usec()
+get_time_usec_return_uint64()
 {
     struct timeval tv;
     gettimeofday(&tv, NULL);
@@ -76,6 +76,7 @@ config_datatype_t config = {
     .VALID_DATA_FOLDER    = "valid",
     .RECORDS_DATASET_NAME = "records",
     .LABELS_DATASET_NAME  = "labels",
+    .CSV_FILE             = "output.csv",
 
     // Internal
     .NUM_TRAIN_BATCHES_PER_RANK = 0,
@@ -293,6 +294,13 @@ parse_args(int argc, char *argv[])
         else if (strcmp(argv[i], "--drop-last") == 0) {
             config.DROP_LAST = true;
         }
+//        else if (strcmp(argv[i], "--async-mode") == 0) {
+//            config.ASYNC_MODE = true;
+//        }
+        else if (strcmp(argv[i], "--output") == 0) {
+            i++;
+            config.CSV_FILE = argv[i];
+        }
         else {
             printf("WARNING: %s not found\n", argv[i]);
         }
diff --git a/dlio/utils.h b/dlio/utils.h
index 720eb99b..44c4bee8 100644
--- a/dlio/utils.h
+++ b/dlio/utils.h
@@ -3,12 +3,9 @@
 
 #include <stdbool.h>
 #include <stdint.h>
+#include <stdlib.h>
 
-// ------------------------------ H5bench utils ------------------------------
-
-uint64_t get_time_usec();
-
-// ---------------------------------------------------------------------------
+uint64_t get_time_usec_return_uint64();
 
 typedef struct config_datatype {
     // Workflow
@@ -35,6 +32,7 @@ typedef struct config_datatype {
     bool     COLLECTIVE_META;
     bool     COLLECTIVE_DATA;
     bool     SUBFILING;
+//    bool     ASYNC_MODE;
 
     // Reader
     //    DATA_LOADER;
@@ -67,6 +65,7 @@ typedef struct config_datatype {
     char *VALID_DATA_FOLDER;
     char *RECORDS_DATASET_NAME;
     char *LABELS_DATASET_NAME;
+    char *CSV_FILE;
 
     // Internal
     uint32_t NUM_TRAIN_BATCHES_PER_RANK;
diff --git a/dlio/workers.h b/dlio/workers.h
index 62d0ee82..31933616 100644
--- a/dlio/workers.h
+++ b/dlio/workers.h
@@ -2,6 +2,7 @@
 #define H5BENCH_WORKERS_H
 
 #include <stdint.h>
+#include <stdbool.h>
 
 typedef struct execution_time {
     uint64_t metadata_time;

From c42040a86f7a8528f781a8d19421d867df575a10 Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Tue, 9 Jul 2024 19:23:07 +0200
Subject: [PATCH 08/37] Update the documentation

---
 dlio/README.md | 148 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 108 insertions(+), 40 deletions(-)

diff --git a/dlio/README.md b/dlio/README.md
index 68cc9ed4..3b08d3ab 100644
--- a/dlio/README.md
+++ b/dlio/README.md
@@ -1,47 +1,115 @@
 # DLIO benchmark
 
-The benchmark is designed to measure the performance of training and evaluation of deep learning models on data stored as HDF5 files.
+The benchmark is designed to measure the performance of training and evaluation of deep learning models on data stored 
+as HDF5 files. Based on I/O patterns collected and analysed from [DLIO Benchmark](https://github.com/argonne-lcf/dlio_benchmark),
+this benchmark simulates the training and evaluation of deep learning models as performed with the PyTorch and TensorFlow
+frameworks, while gathering valuable information about system performance.
 
-## Overview
+## Configuration
+As with other extensions, the following parameters should be specified in the configuration section of the
+JSON file to configure the benchmark:
 
-### Command-line Arguments (Options)
+| Parameter              | Description                                                         | Type   | Default    |
+|------------------------|---------------------------------------------------------------------|--------|------------|
+| generate-data          | Enable generation of benchmarking data                              | bool   | false      |
+| train                  | Enable model training simulation                                    | bool   | false      |
+| evaluation             | Enable model evaluation simulation                                  | bool   | false      |
+| record-length          | Record size of a single sample in bytes                             | int    | 67108864   |
+| num-files-train        | The number of files used to train the model                         | int    | 64         |
+| num-files-eval         | The number of files used to evaluate the model                      | int    | 8          |
+| num-samples-per-file   | The number of samples in each file                                  | int    | 4          |
+| data-folder            | Name of the directory storing the benchmark data                    | string | ./data     |
+| file-prefix            | Prefix in the name of files containing training and evaluation data | string | img        |
+| chunking               | Enable chunking                                                     | bool   | false      |
+| chunk-size             | Chunk size                                                          | int    | 1024       |
+| keep-files             | Do not delete data after the benchmark is finished                  | bool   | false      |
+| compression            | Enable compression                                                  | bool   | false      |
+| compression-level      | Compression level from 1 to 9                                       | int    | 4          |
+| batch-size             | Training batch size                                                 | int    | 7          |
+| batch-size-eval        | Evaluation batch size                                               | int    | 2          |
+| shuffle                | Enable samples shuffle                                              | bool   | false      |
+| preprocess-time        | Preprocessing time after reading each sample in seconds             | float  | 0.0        |
+| preprocess-time-stdev  | Standard deviation in preprocessing time in seconds                 | float  | 0.0        |
+| epochs                 | The number of epochs                                                | int    | 5          |
+| computation-time       | Computation time after reading each batch in seconds                | float  | 0.323      |
+| computation-time-stdev | Standard deviation in computation time in seconds                   | float  | 0.0        |
+| random-seed            | Random seed to be used                                              | int    | 42         |
+| eval-time              | Evaluation time after reading each batch in seconds                 | float  | 0.323      |
+| eval-time-stdev        | Standard deviation in evaluation time in seconds                    | float  | 0.0        |
+| epochs-between-evals   | The number of epochs between evaluations                            | int    | 1          |
+| train-data-folder      | Name of the directory containing the training data                  | string | train      |
+| valid-data-folder      | Name of the directory containing the validation data                | string | valid      |
+| records-dataset-name   | Name of the dataset with records                                    | string | records    |
+| labels-dataset-name    | Name of the dataset with labels                                     | string | labels     |
+| seed-change-epoch      | Enable seed changes every epoch                                     | bool   | false      |
+| read-threads           | The number of workers used to read the data                         | int    | 4          |
+| collective-meta        | Enable collective HDF5 metadata operations                          | bool   | false      |
+| collective-data        | Enable collective HDF5 data operations                              | bool   | false      |
+| subfiling              | Enable HDF5 Subfiling Virtual File Driver                           | bool   | false      |
+| csv-file               | Name of the output csv file                                         | string | output.csv |
 
-- ``--generate-data``: Enable generation of benchmarking data. [default: *false*]
-- ``--train``: Enable model training simulation [default: *false*]
-- ``--evaluation``: Enable model evaluation simulation [default: *false*]
-- ``--record-length <x>``: Record size of a single sample in bytes [default: *67108864*]
-- ``--num-files-train <x>``: The number of files used to train the model [default: *64*]
-- ``--num-files-eval <x>``: The number of files used to evaluate the model [default: *8*]
-- ``--num-samples-per-file <x>``: The number of samples in each file [default: *4*]
-- ``--data-folder <x>``: Name of the directory storing the benchmark data [default: *./data*]
-- ``--file-prefix <x>``: Prefix in the name of files containing training and evaluation data [default: *img*]
-- ``--chunking``: Enable chunking [default: *false*]
-- ``--chunk-size <x>``: Chunk size [default: *1024*]
-- ``--keep-files``: Does not delete data after the benchmark is finished [default: *1024*]
-- ``--compression``: Enable compression [default: *false*]
-- ``--compression-level <x>``: Compression level from 1 to 9 [default: *4*]
-- ``--batch-size <x>``: Training batch size [default: *7*]
-- ``--batch-size-eval <x>``: Evaluation batch size [default: *2*]
-- ``--shuffle``: Enable samples shuffle [default: *false*]
-- ``--preprocess-time <x>``: Preprocessing time after reading each sample in seconds [default: *0.0*]
-- ``--preprocess-time-stdev <x>``: Standard deviation in preprocessing time in seconds [default: *0.0*]
-- ``--epochs <x>``: The number of epochs [default: *5*]
-- ``--computation-time <x>``: Computation time after reading each batch in seconds [default: *0.323*]
-- ``--computation-time-stdev <x>``:  Standard deviation in computation time in seconds [default: *0.0*]
-- ``--random-seed <x>``: Random seed to be used [default: *42*]
-- ``--eval-time <x>``: Evaluation time after reading each batch in seconds [default: *0.323*]
-- ``--eval-time-stdev <x>``: Standard deviation in evaluation time in seconds [default: *0.0*]
-- ``--epochs-between-evals <x>``: The number of epochs between evaluations [default: *1*]
-- ``--train-data-folder <x>``: Name of the directory containing the training data [default: *train*]
-- ``--valid-data-folder <x>``: Name of the directory containing the validation data [default: *valid*]
-- ``--records-dataset-name <x>``: Name of the dataset with records [default: *records*]
-- ``--labels-dataset-name <x>``: Name of the dataset with labels [default: *labels*]
-- ``--seed-change-epoch``: Enable seed changes every epoch [default: *false*]
-- ``--read-threads``: The number of workers used to read the data [default: *4*]
-- ``--collective-meta``: Enable collective HDF5 metadata operations [default: *false*]
-- ``--collective-data``: Enable collective HDF5 data operations [default: *false*]
-- ``--subfiling``: Enable HDF5 Subfiling Virtual File Driver [default: *false*]
+Note that each parameter has a default value that applies if the parameter is not specified in the configuration
+file. Thus, with no parameters specified the benchmark will not run, because the `generate-data`, `train` and `evaluation`
+parameters all default to false. A sample configuration file can be found in the `samples/` directory.
 
-### Exerciser Basics
+## Understanding the output
+The sample output of the benchmark is as follows:
+```
+=================== Performance Results ==================
+Total number of ranks: 8
+The number of read threads per rank: 0
+Total training set size: 7.000 GB
+Training set size per rank: 896.000 MB
+Total training emulated compute time: 3.229 s
+Training metadata time: 2.808 s
+Training raw read time: 30.905 s
+Training average raw read rate: 145.141 MB/s
+Observed training completion time: 37.432 s
+Observed average training rate: 131.044 MB/s
+Training average throughput: 1.871 samples/s
+Training throughput standard deviation: 0.037 samples/s
+Training average IO: 119.729 MB/s
+Training IO standard deviation: 2.379 MB/s
+Total evaluation set size: 7.000 GB
+Evaluation set size per rank: 896.000 MB
+Total evaluation emulated compute time: 3.206 s
+Evaluation metadata time: 2.805 s
+Evaluation raw read time: 31.699 s
+Evaluation average raw read rate: 141.906 MB/s
+Observed evaluation completion time: 38.424 s
+Observed average evaluation rate: 127.595 MB/s
+Evaluation average throughput avg: 1.826 samples/s
+Evaluation throughput standard deviation: 0.090 samples/s
+Evaluation average IO: 116.883 MB/s
+Evaluation IO standard deviation: 5.735 MB/s
+===========================================================
+```
+Let's take a closer look at it. First, the number of MPI ranks and the number of read processes per MPI rank used in the
+simulation are printed. The same set of metrics is then reported for both training and evaluation, so to avoid
+repetition we describe only the first half of the results, which concerns the training process. Total
+training set size is calculated as the size of all HDF5 files used for training. Accordingly, the training set size per
+rank gives an idea of how much of the load is taken over by one MPI rank. Total training emulated compute time is the
+total time spent on compute emulation over all epochs. Training metadata time and training raw read time are likewise
+totals over all epochs; note that these two do not overlap: the metadata time measures the execution of the
+`H5Fopen`, `H5Dget_space`, `H5Screate_simple` and `H5Sclose` calls, and the raw read time that of the `H5Dread` calls. Training
+average raw read rate is calculated as training set size per rank divided by training raw read time. Observed training
+completion time includes all the time spent on the training process, including, among other things, resource allocation
+and computation simulation. Observed average training rate is equal to training set size per rank divided by the
+difference of observed training completion time and total training emulated compute time, thus showing the data reading
+rate without taking into account emulation costs. Training average throughput and training throughput standard deviation
+give an indication of the number of samples from the training dataset processed in one second. Training average IO and
+training IO standard deviation translate these values into bytes/second by multiplying by the size of one sample.
 
-## Building Exerciser
+## Future work
+
+There are plans to add more configuration options for the extension in the future to increase its flexibility:
+- Add settings for Subfiling VFD. Currently, the default settings are used.
+- Add more features from [DLIO Benchmark](https://github.com/argonne-lcf/dlio_benchmark) such as resizable records.
+- Analyze and add support for other ML frameworks and data loaders, for example DALI.
+- Add support for prefetching.
+- Expand the ability to randomly shuffle samples. At the moment, it is not possible to shuffle only samples in each file
+without changing the order of the files for training.
+- Add more compression filters and thus support different compression algorithms for HDF5 data.
+- Add support for drop_last customization. Currently, by default, any batches left over after distribution among the MPI ranks are not processed.
+- Replace the use of `fork()` with `MPI_Comm_spawn()` when creating new processes, as using `fork()` with MPI may be unsafe.
+- Test support for the Cache VOL connector.

From e3c23f79e6a44fa731f3b4e8ff9e8a3e0b2d32a1 Mon Sep 17 00:00:00 2001
From: github-actions <github-actions[bot]@users.noreply.github.com>
Date: Tue, 9 Jul 2024 17:42:15 +0000
Subject: [PATCH 09/37] Committing clang-format changes

---
 dlio/h5bench_dlio.c |  48 ++++++++++------
 dlio/stats.c        | 132 ++++++++++++++++++++++++--------------------
 dlio/stats.h        |   1 -
 dlio/utils.c        |  12 ++--
 dlio/utils.h        |   4 +-
 5 files changed, 113 insertions(+), 84 deletions(-)

diff --git a/dlio/h5bench_dlio.c b/dlio/h5bench_dlio.c
index c9c1ab01..3e5a3a99 100644
--- a/dlio/h5bench_dlio.c
+++ b/dlio/h5bench_dlio.c
@@ -137,8 +137,8 @@ generate_data()
     for (uint32_t i = from; i < config.NUM_FILES_TRAIN; i += increment) {
         srand(config.RANDOM_SEED + i);
 
-//        if (!config.SUBFILING || config.SUBFILING && (MY_RANK == 0))
-//            printf("Generate train file %u / %u\n", i + 1, config.NUM_FILES_TRAIN);
+        //        if (!config.SUBFILING || config.SUBFILING && (MY_RANK == 0))
+        //            printf("Generate train file %u / %u\n", i + 1, config.NUM_FILES_TRAIN);
         char file_name[256];
         snprintf(file_name, sizeof(file_name), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER,
                  config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, i + 1, config.NUM_FILES_TRAIN);
@@ -149,8 +149,8 @@ generate_data()
     for (uint32_t i = from; i < config.NUM_FILES_EVAL; i += increment) {
         srand(config.RANDOM_SEED + config.NUM_FILES_TRAIN + i);
 
-//        if (!config.SUBFILING || config.SUBFILING && (MY_RANK == 0))
-//            printf("Generate valid file %u / %u\n", i + 1, config.NUM_FILES_EVAL);
+        //        if (!config.SUBFILING || config.SUBFILING && (MY_RANK == 0))
+        //            printf("Generate valid file %u / %u\n", i + 1, config.NUM_FILES_EVAL);
         char file_name[256];
         snprintf(file_name, sizeof(file_name), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER,
                  config.VALID_DATA_FOLDER, config.FILE_PREFIX, i + 1, config.NUM_FILES_EVAL);
@@ -300,7 +300,9 @@ eval_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *
     uint32_t offset = MY_RANK * config.NUM_EVAL_BATCHES_PER_RANK;
 
     for (uint32_t i = 0;
-         i < (config.READ_THREADS > config.NUM_EVAL_BATCHES_PER_RANK ? config.NUM_EVAL_BATCHES_PER_RANK : config.READ_THREADS); i++) {
+         i < (config.READ_THREADS > config.NUM_EVAL_BATCHES_PER_RANK ? config.NUM_EVAL_BATCHES_PER_RANK
+                                                                     : config.READ_THREADS);
+         i++) {
         int32_t batch = offset + i;
         write(get_eval_write_fd(), &batch, sizeof(batch));
     }
@@ -324,7 +326,9 @@ eval_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *
     }
 
     for (uint32_t i = 0;
-         i < (config.READ_THREADS > config.NUM_EVAL_BATCHES_PER_RANK ? config.NUM_EVAL_BATCHES_PER_RANK : config.READ_THREADS); i++) {
+         i < (config.READ_THREADS > config.NUM_EVAL_BATCHES_PER_RANK ? config.NUM_EVAL_BATCHES_PER_RANK
+                                                                     : config.READ_THREADS);
+         i++) {
         execution_time_t data_from_child_process;
         uint64_t         t0 = get_time_usec_return_uint64();
         read(get_eval_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
@@ -515,8 +519,8 @@ run()
         indices_eval[i] = i;
     }
 
-    uint32_t next_eval_epoch    = config.EPOCHS_BETWEEN_EVALS;
-    bool enable_multiprocessing = config.READ_THREADS > 0;
+    uint32_t next_eval_epoch        = config.EPOCHS_BETWEEN_EVALS;
+    bool     enable_multiprocessing = config.READ_THREADS > 0;
     if (enable_multiprocessing) {
         init_workers(indices_train, indices_eval);
     }
@@ -524,13 +528,15 @@ run()
     MPI_Barrier(MPI_COMM_WORLD);
 
     for (uint32_t epoch = 0; epoch < config.EPOCHS; epoch++) {
-        if (MY_RANK == 0) printf("Starting epoch %u...\n", epoch + 1);
+        if (MY_RANK == 0)
+            printf("Starting epoch %u...\n", epoch + 1);
 
         train(epoch, indices_train, enable_multiprocessing);
         MPI_Barrier(MPI_COMM_WORLD);
 
         if (config.DO_EVALUATION && (epoch + 1 >= next_eval_epoch)) {
-            if (MY_RANK == 0) printf("Starting evaluation...\n");
+            if (MY_RANK == 0)
+                printf("Starting evaluation...\n");
             eval(epoch, indices_eval, enable_multiprocessing);
             next_eval_epoch += config.EPOCHS_BETWEEN_EVALS;
             MPI_Barrier(MPI_COMM_WORLD);
@@ -574,11 +580,17 @@ init_global_variables()
     uint32_t data_length = config.RECORD_LENGTH * config.NUM_SAMPLES_PER_FILE;
     GENERATION_SIZE      = data_length > GENERATION_BUFFER_SIZE ? GENERATION_BUFFER_SIZE : data_length;
 
-    config.NUM_TRAIN_BATCHES_PER_RANK = config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / NUM_RANKS / config.BATCH_SIZE;
-    config.NUM_EVAL_BATCHES_PER_RANK = config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / NUM_RANKS / config.BATCH_SIZE_EVAL;
+    config.NUM_TRAIN_BATCHES_PER_RANK =
+        config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / NUM_RANKS / config.BATCH_SIZE;
+    config.NUM_EVAL_BATCHES_PER_RANK =
+        config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / NUM_RANKS / config.BATCH_SIZE_EVAL;
 
-    config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN = config.READ_THREADS > config.NUM_TRAIN_BATCHES_PER_RANK ? config.NUM_TRAIN_BATCHES_PER_RANK : config.READ_THREADS;
-    config.NUM_OF_ACTUALLY_USED_PROCESSES_EVAL = config.READ_THREADS > config.NUM_EVAL_BATCHES_PER_RANK ? config.NUM_EVAL_BATCHES_PER_RANK : config.READ_THREADS;
+    config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN = config.READ_THREADS > config.NUM_TRAIN_BATCHES_PER_RANK
+                                                      ? config.NUM_TRAIN_BATCHES_PER_RANK
+                                                      : config.READ_THREADS;
+    config.NUM_OF_ACTUALLY_USED_PROCESSES_EVAL = config.READ_THREADS > config.NUM_EVAL_BATCHES_PER_RANK
+                                                     ? config.NUM_EVAL_BATCHES_PER_RANK
+                                                     : config.READ_THREADS;
 
     srand(config.RANDOM_SEED);
 
@@ -659,11 +671,15 @@ init_global_variables()
     if (MY_RANK == 0) {
         printf("The number of training batches per rank: %u\n", config.NUM_TRAIN_BATCHES_PER_RANK);
         if (config.READ_THREADS > config.NUM_TRAIN_BATCHES_PER_RANK) {
-            printf("Warning: The number of requested read threads (%u) is greater than the number of training batches per rank (%u)!\n", config.READ_THREADS, config.NUM_TRAIN_BATCHES_PER_RANK);
+            printf("Warning: The number of requested read threads (%u) is greater than the number of "
+                   "training batches per rank (%u)!\n",
+                   config.READ_THREADS, config.NUM_TRAIN_BATCHES_PER_RANK);
         }
         printf("The number of evaluation batches per rank: %u\n", config.NUM_EVAL_BATCHES_PER_RANK);
         if (config.READ_THREADS > config.NUM_EVAL_BATCHES_PER_RANK) {
-            printf("Warning: The number of requested read threads (%u) is greater than the number of evaluation batches per rank (%u)!\n", config.READ_THREADS, config.NUM_EVAL_BATCHES_PER_RANK);
+            printf("Warning: The number of requested read threads (%u) is greater than the number of "
+                   "evaluation batches per rank (%u)!\n",
+                   config.READ_THREADS, config.NUM_EVAL_BATCHES_PER_RANK);
         }
     }
 }
diff --git a/dlio/stats.c b/dlio/stats.c
index 7cd52be2..da54553d 100644
--- a/dlio/stats.c
+++ b/dlio/stats.c
@@ -144,7 +144,8 @@ prepare_data()
         if (global_stats[i].proc.eval == NULL) {
             exit(1);
         }
-        global_stats[i].compute.train = (uint64_t *)calloc(config.NUM_TRAIN_BATCHES_PER_RANK, sizeof(uint64_t));
+        global_stats[i].compute.train =
+            (uint64_t *)calloc(config.NUM_TRAIN_BATCHES_PER_RANK, sizeof(uint64_t));
         if (global_stats[i].compute.train == NULL) {
             exit(1);
         }
@@ -153,14 +154,14 @@ prepare_data()
             exit(1);
         }
 
-        MPI_Reduce(stats[i].load.train, global_stats[i].load.train, config.NUM_TRAIN_BATCHES_PER_RANK, MPI_UNSIGNED_LONG_LONG,
-                   MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(stats[i].load.eval, global_stats[i].load.eval, config.NUM_EVAL_BATCHES_PER_RANK, MPI_UNSIGNED_LONG_LONG,
-                   MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(stats[i].proc.train, global_stats[i].proc.train, config.NUM_TRAIN_BATCHES_PER_RANK, MPI_UNSIGNED_LONG_LONG,
-                   MPI_SUM, 0, MPI_COMM_WORLD);
-        MPI_Reduce(stats[i].proc.eval, global_stats[i].proc.eval, config.NUM_EVAL_BATCHES_PER_RANK, MPI_UNSIGNED_LONG_LONG,
-                   MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(stats[i].load.train, global_stats[i].load.train, config.NUM_TRAIN_BATCHES_PER_RANK,
+                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(stats[i].load.eval, global_stats[i].load.eval, config.NUM_EVAL_BATCHES_PER_RANK,
+                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(stats[i].proc.train, global_stats[i].proc.train, config.NUM_TRAIN_BATCHES_PER_RANK,
+                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+        MPI_Reduce(stats[i].proc.eval, global_stats[i].proc.eval, config.NUM_EVAL_BATCHES_PER_RANK,
+                   MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
         MPI_Reduce(&stats[i].throughput.train, &global_stats[i].throughput.train, 1, MPI_DOUBLE, MPI_SUM, 0,
                    MPI_COMM_WORLD);
         MPI_Reduce(&stats[i].throughput.eval, &global_stats[i].throughput.eval, 1, MPI_DOUBLE, MPI_SUM, 0,
@@ -203,14 +204,14 @@ prepare_data()
         global_stats[i].raw_read_time.train /= NUM_RANKS;
         global_stats[i].raw_read_time.eval /= NUM_RANKS;
 
-//        if (config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN > 0) {
-//            global_stats[i].metadata_time.train /= config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN;
-//            global_stats[i].raw_read_time.train /= config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN;
-//        }
-//        if (config.NUM_OF_ACTUALLY_USED_PROCESSES_EVAL > 0) {
-//            global_stats[i].metadata_time.eval /= config.NUM_OF_ACTUALLY_USED_PROCESSES_EVAL;
-//            global_stats[i].raw_read_time.eval /= config.NUM_OF_ACTUALLY_USED_PROCESSES_EVAL;
-//        }
+        //        if (config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN > 0) {
+        //            global_stats[i].metadata_time.train /= config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN;
+        //            global_stats[i].raw_read_time.train /= config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN;
+        //        }
+        //        if (config.NUM_OF_ACTUALLY_USED_PROCESSES_EVAL > 0) {
+        //            global_stats[i].metadata_time.eval /= config.NUM_OF_ACTUALLY_USED_PROCESSES_EVAL;
+        //            global_stats[i].raw_read_time.eval /= config.NUM_OF_ACTUALLY_USED_PROCESSES_EVAL;
+        //        }
     }
 }
 
@@ -219,10 +220,12 @@ print_data()
 {
     // Train
     // TODO: drop_last = false
-    uint64_t train_total_size_bytes = (uint64_t)config.NUM_TRAIN_BATCHES_PER_RANK * NUM_RANKS * config.BATCH_SIZE * config.RECORD_LENGTH;
-    uint64_t train_size_bytes_per_rank = (uint64_t)config.NUM_TRAIN_BATCHES_PER_RANK * config.BATCH_SIZE * config.RECORD_LENGTH;
+    uint64_t train_total_size_bytes =
+        (uint64_t)config.NUM_TRAIN_BATCHES_PER_RANK * NUM_RANKS * config.BATCH_SIZE * config.RECORD_LENGTH;
+    uint64_t train_size_bytes_per_rank =
+        (uint64_t)config.NUM_TRAIN_BATCHES_PER_RANK * config.BATCH_SIZE * config.RECORD_LENGTH;
 
-    uint64_t train_total_compute_time = 0;
+    uint64_t  train_total_compute_time     = 0;
     uint64_t *train_compute_time_per_epoch = (uint64_t *)malloc(config.EPOCHS * sizeof(uint64_t));
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         unsigned long int compute_time = 0;
@@ -231,7 +234,6 @@ print_data()
         }
         train_total_compute_time += compute_time;
         train_compute_time_per_epoch[i] = compute_time;
-
     }
 
     uint64_t train_total_metadata_time = 0;
@@ -249,7 +251,8 @@ print_data()
         if (stats[i].raw_read_time.train == 0) {
             continue;
         }
-        train_total_avg_read_rate += (double)train_size_bytes_per_rank / stats[i].raw_read_time.train * 1000000.0;
+        train_total_avg_read_rate +=
+            (double)train_size_bytes_per_rank / stats[i].raw_read_time.train * 1000000.0;
     }
     train_total_avg_read_rate /= config.EPOCHS;
 
@@ -258,7 +261,7 @@ print_data()
         train_total_observed_time += global_stats[i].observed_time.train;
     }
 
-    double train_total_avg_observed_rate = 0.0;
+    double  train_total_avg_observed_rate     = 0.0;
     double *train_avg_observed_rate_per_epoch = (double *)malloc(config.EPOCHS * sizeof(double));
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         unsigned long int compute_time = 0;
@@ -269,8 +272,9 @@ print_data()
             train_avg_observed_rate_per_epoch[i] = NAN;
             continue;
         }
-        train_avg_observed_rate_per_epoch[i] = (double)train_size_bytes_per_rank / (global_stats[i].observed_time.train - compute_time) *
-                               1000000.0;
+        train_avg_observed_rate_per_epoch[i] = (double)train_size_bytes_per_rank /
+                                               (global_stats[i].observed_time.train - compute_time) *
+                                               1000000.0;
         train_total_avg_observed_rate += train_avg_observed_rate_per_epoch[i];
     }
     train_total_avg_observed_rate /= config.EPOCHS;
@@ -291,18 +295,18 @@ print_data()
     train_throughput_stdev_samples_per_second =
         sqrt(train_throughput_stdev_samples_per_second / (double)config.EPOCHS);
 
-    double train_io_mean =
-        train_throughput_mean_samples_per_second * config.RECORD_LENGTH;
+    double train_io_mean = train_throughput_mean_samples_per_second * config.RECORD_LENGTH;
 
-    double train_io_stdev =
-        train_throughput_stdev_samples_per_second * config.RECORD_LENGTH;
+    double train_io_stdev = train_throughput_stdev_samples_per_second * config.RECORD_LENGTH;
 
     // Evaluation
     // TODO: drop_last = False
-    uint64_t eval_total_size_bytes = (uint64_t)config.NUM_EVAL_BATCHES_PER_RANK * NUM_RANKS * config.BATCH_SIZE_EVAL * config.RECORD_LENGTH;
-    uint64_t eval_size_bytes_per_rank = (uint64_t)config.NUM_EVAL_BATCHES_PER_RANK * config.BATCH_SIZE_EVAL * config.RECORD_LENGTH;
+    uint64_t eval_total_size_bytes = (uint64_t)config.NUM_EVAL_BATCHES_PER_RANK * NUM_RANKS *
+                                     config.BATCH_SIZE_EVAL * config.RECORD_LENGTH;
+    uint64_t eval_size_bytes_per_rank =
+        (uint64_t)config.NUM_EVAL_BATCHES_PER_RANK * config.BATCH_SIZE_EVAL * config.RECORD_LENGTH;
 
-    uint64_t eval_total_compute_time = 0;
+    uint64_t  eval_total_compute_time     = 0;
     uint64_t *eval_compute_time_per_epoch = (uint64_t *)malloc(config.EPOCHS * sizeof(uint64_t));
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         unsigned long int compute_time = 0;
@@ -328,7 +332,8 @@ print_data()
         if (stats[i].raw_read_time.eval == 0) {
             continue;
         }
-        eval_total_avg_read_rate += (double)eval_size_bytes_per_rank / stats[i].raw_read_time.eval * 1000000.0;
+        eval_total_avg_read_rate +=
+            (double)eval_size_bytes_per_rank / stats[i].raw_read_time.eval * 1000000.0;
     }
     eval_total_avg_read_rate /= config.EPOCHS;
 
@@ -337,7 +342,7 @@ print_data()
         eval_total_observed_time += global_stats[i].observed_time.eval;
     }
 
-    double eval_total_avg_observed_rate = 0.0;
+    double  eval_total_avg_observed_rate     = 0.0;
     double *eval_avg_observed_rate_per_epoch = (double *)malloc(config.EPOCHS * sizeof(double));
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
         unsigned long compute_time = 0;
@@ -348,8 +353,8 @@ print_data()
             eval_avg_observed_rate_per_epoch[i] = NAN;
             continue;
         }
-        eval_avg_observed_rate_per_epoch[i] = (double)eval_size_bytes_per_rank / (global_stats[i].observed_time.eval - compute_time) *
-                                              1000000.0;
+        eval_avg_observed_rate_per_epoch[i] = (double)eval_size_bytes_per_rank /
+                                              (global_stats[i].observed_time.eval - compute_time) * 1000000.0;
         eval_total_avg_observed_rate += eval_avg_observed_rate_per_epoch[i];
     }
     eval_total_avg_observed_rate /= config.EPOCHS;
@@ -369,11 +374,9 @@ print_data()
     eval_throughput_stdev_samples_per_second =
         sqrt(eval_throughput_stdev_samples_per_second / (double)config.EPOCHS);
 
-    double eval_io_mean =
-        eval_throughput_mean_samples_per_second * config.RECORD_LENGTH;
+    double eval_io_mean = eval_throughput_mean_samples_per_second * config.RECORD_LENGTH;
 
-    double eval_io_stdev =
-        eval_throughput_stdev_samples_per_second * config.RECORD_LENGTH;
+    double eval_io_stdev = eval_throughput_stdev_samples_per_second * config.RECORD_LENGTH;
 
     human_readable value;
 
@@ -394,7 +397,8 @@ print_data()
     value = format_human_readable(train_total_avg_observed_rate);
     printf("Observed average training rate: %.3f %cB/s\n", value.value, value.unit);
     printf("Training average throughput: %.3lf samples/s\n", train_throughput_mean_samples_per_second);
-    printf("Training throughput standard deviation: %.3lf samples/s\n", train_throughput_stdev_samples_per_second);
+    printf("Training throughput standard deviation: %.3lf samples/s\n",
+           train_throughput_stdev_samples_per_second);
     value = format_human_readable(train_io_mean);
     printf("Training average IO: %.3f %cB/s\n", value.value, value.unit);
     value = format_human_readable(train_io_stdev);
@@ -413,7 +417,8 @@ print_data()
     value = format_human_readable(eval_total_avg_observed_rate);
     printf("Observed average evaluation rate: %.3lf %cB/s\n", value.value, value.unit);
     printf("Evaluation average throughput avg: %.3lf samples/s\n", eval_throughput_mean_samples_per_second);
-    printf("Evaluation throughput standard deviation: %.3lf samples/s\n", eval_throughput_stdev_samples_per_second);
+    printf("Evaluation throughput standard deviation: %.3lf samples/s\n",
+           eval_throughput_stdev_samples_per_second);
     value = format_human_readable(eval_io_mean);
     printf("Evaluation average IO: %.3lf %cB/s\n", value.value, value.unit);
     value = format_human_readable(eval_io_stdev);
@@ -423,7 +428,7 @@ print_data()
 
     FILE *csv_file = fopen(config.CSV_FILE, "w+");
 
-    char* units = (char *)malloc(config.EPOCHS * sizeof(char));
+    char *units = (char *)malloc(config.EPOCHS * sizeof(char));
 
     fprintf(csv_file, "metric, value, unit\n");
     fprintf(csv_file, "operation, dlio,\n");
@@ -482,8 +487,10 @@ print_data()
         if (stats[i].raw_read_time.train == 0) {
             units[i] = ' ';
             fprintf(csv_file, "NaN");
-        } else {
-            value = format_human_readable((double)train_size_bytes_per_rank / stats[i].raw_read_time.train * 1000000.0);
+        }
+        else {
+            value = format_human_readable((double)train_size_bytes_per_rank / stats[i].raw_read_time.train *
+                                          1000000.0);
             units[i] = value.unit;
             fprintf(csv_file, "%.3lf", value.value);
         }
@@ -513,7 +520,7 @@ print_data()
     fprintf(csv_file, "\"\ntrain observed time, %.3lf, s\n", train_total_observed_time / 1000000.0);
     fprintf(csv_file, "train observed rate per epoch, \"");
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        value = format_human_readable(train_avg_observed_rate_per_epoch[i]);
+        value    = format_human_readable(train_avg_observed_rate_per_epoch[i]);
         units[i] = value.unit;
         fprintf(csv_file, "%.3lf", value.value);
         if (i != config.EPOCHS - 1)
@@ -539,8 +546,10 @@ print_data()
         if (i != config.EPOCHS - 1)
             fprintf(csv_file, ", ");
     }
-    fprintf(csv_file, "\"\ntrain throughput avg samples per second, %.3lf, samples/s\n", train_throughput_mean_samples_per_second);
-    fprintf(csv_file, "train throughput stdev samples per second, %.3lf, samples/s\n", train_throughput_stdev_samples_per_second);
+    fprintf(csv_file, "\"\ntrain throughput avg samples per second, %.3lf, samples/s\n",
+            train_throughput_mean_samples_per_second);
+    fprintf(csv_file, "train throughput stdev samples per second, %.3lf, samples/s\n",
+            train_throughput_stdev_samples_per_second);
     value = format_human_readable(train_io_mean);
     fprintf(csv_file, "train io avg, %.3lf, %cB/s\n", value.value, value.unit);
     value = format_human_readable(train_io_stdev);
@@ -564,7 +573,7 @@ print_data()
     }
     fprintf(csv_file, "\"\neval emulated compute time, %.3lf, s\n", eval_total_compute_time / 1000000.0);
     fprintf(csv_file, "eval metadata time per epoch, \"");
-        for (uint32_t i = 0; i < config.EPOCHS; i++) {
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
         fprintf(csv_file, "%.3lf", stats[i].metadata_time.eval / 1000000.0);
         if (i != config.EPOCHS - 1)
             fprintf(csv_file, ", ");
@@ -595,7 +604,8 @@ print_data()
         if (stats[i].raw_read_time.eval == 0) {
             units[i] = ' ';
             fprintf(csv_file, "NaN");
-        } else {
+        }
+        else {
             value = format_human_readable(eval_size_bytes_per_rank / stats[i].raw_read_time.eval * 1000000.0);
             units[i] = value.unit;
             fprintf(csv_file, "%.3lf", value.value);
@@ -626,7 +636,7 @@ print_data()
     fprintf(csv_file, "\"\neval observed time, %.3lf, s\n", eval_total_observed_time / 1000000.0);
     fprintf(csv_file, "eval observed rate per epoch, \"");
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        value = format_human_readable(eval_avg_observed_rate_per_epoch[i]);
+        value    = format_human_readable(eval_avg_observed_rate_per_epoch[i]);
         units[i] = value.unit;
         fprintf(csv_file, "%.3lf", value.value);
         if (i != config.EPOCHS - 1)
@@ -652,8 +662,10 @@ print_data()
         if (i != config.EPOCHS - 1)
             fprintf(csv_file, ", ");
     }
-    fprintf(csv_file, "\"\neval throughput avg samples per second, %.3lf, samples/s\n", eval_throughput_mean_samples_per_second);
-    fprintf(csv_file, "eval throughput stdev samples per second, %.3lf, samples/s\n", eval_throughput_stdev_samples_per_second);
+    fprintf(csv_file, "\"\neval throughput avg samples per second, %.3lf, samples/s\n",
+            eval_throughput_mean_samples_per_second);
+    fprintf(csv_file, "eval throughput stdev samples per second, %.3lf, samples/s\n",
+            eval_throughput_stdev_samples_per_second);
     value = format_human_readable(eval_io_mean);
     fprintf(csv_file, "eval io avg, %.3lf, %cB/s\n", value.value, value.unit);
     value = format_human_readable(eval_io_stdev);
@@ -706,9 +718,10 @@ end_train(uint32_t epoch, uint64_t metadata_time, uint64_t read_time)
     stats[epoch].observed_time.train = end_time - stats[epoch].start_time.train;
     if ((end_time - stats[epoch].start_time.train) == 0) {
         stats[epoch].throughput.train = NAN;
-    } else {
-        stats[epoch].throughput.train =
-            (double)config.NUM_TRAIN_BATCHES_PER_RANK * config.BATCH_SIZE * 1000000.0 / (end_time - stats[epoch].start_time.train);
+    }
+    else {
+        stats[epoch].throughput.train = (double)config.NUM_TRAIN_BATCHES_PER_RANK * config.BATCH_SIZE *
+                                        1000000.0 / (end_time - stats[epoch].start_time.train);
     }
     stats[epoch].metadata_time.train = metadata_time;
     stats[epoch].raw_read_time.train = read_time;
@@ -727,9 +740,10 @@ end_eval(uint32_t epoch, uint64_t metadata_time, uint64_t read_time)
     stats[epoch].observed_time.eval = end_time - stats[epoch].start_time.eval;
     if ((end_time - stats[epoch].start_time.eval) == 0) {
         stats[epoch].throughput.eval = NAN;
-    } else {
-        stats[epoch].throughput.eval    = (double)config.NUM_EVAL_BATCHES_PER_RANK * config.BATCH_SIZE_EVAL * 1000000.0 /
-                                          (end_time - stats[epoch].start_time.eval);
+    }
+    else {
+        stats[epoch].throughput.eval = (double)config.NUM_EVAL_BATCHES_PER_RANK * config.BATCH_SIZE_EVAL *
+                                       1000000.0 / (end_time - stats[epoch].start_time.eval);
     }
     stats[epoch].metadata_time.eval = metadata_time;
     stats[epoch].raw_read_time.eval = read_time;
diff --git a/dlio/stats.h b/dlio/stats.h
index df41cfb0..f82fbaeb 100644
--- a/dlio/stats.h
+++ b/dlio/stats.h
@@ -39,7 +39,6 @@ struct metadata_time_data {
     uint64_t eval;
 };
 
-
 struct raw_read_time_data {
     uint64_t train;
     uint64_t eval;
diff --git a/dlio/utils.c b/dlio/utils.c
index f5e3d768..dd80338a 100644
--- a/dlio/utils.c
+++ b/dlio/utils.c
@@ -79,10 +79,10 @@ config_datatype_t config = {
     .CSV_FILE             = "output.csv",
 
     // Internal
-    .NUM_TRAIN_BATCHES_PER_RANK = 0,
-    .NUM_EVAL_BATCHES_PER_RANK = 0,
+    .NUM_TRAIN_BATCHES_PER_RANK           = 0,
+    .NUM_EVAL_BATCHES_PER_RANK            = 0,
     .NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN = 0,
-    .NUM_OF_ACTUALLY_USED_PROCESSES_EVAL = 0,
+    .NUM_OF_ACTUALLY_USED_PROCESSES_EVAL  = 0,
 };
 
 void
@@ -294,9 +294,9 @@ parse_args(int argc, char *argv[])
         else if (strcmp(argv[i], "--drop-last") == 0) {
             config.DROP_LAST = true;
         }
-//        else if (strcmp(argv[i], "--async-mode") == 0) {
-//            config.ASYNC_MODE = true;
-//        }
+        //        else if (strcmp(argv[i], "--async-mode") == 0) {
+        //            config.ASYNC_MODE = true;
+        //        }
         else if (strcmp(argv[i], "--output") == 0) {
             i++;
             config.CSV_FILE = argv[i];
diff --git a/dlio/utils.h b/dlio/utils.h
index 44c4bee8..7c94cc0e 100644
--- a/dlio/utils.h
+++ b/dlio/utils.h
@@ -32,7 +32,7 @@ typedef struct config_datatype {
     bool     COLLECTIVE_META;
     bool     COLLECTIVE_DATA;
     bool     SUBFILING;
-//    bool     ASYNC_MODE;
+    //    bool     ASYNC_MODE;
 
     // Reader
     //    DATA_LOADER;
@@ -45,7 +45,7 @@ typedef struct config_datatype {
                      //    unsigned int TRANSFER_SIZE;
     float PREPROCESS_TIME;
     float PREPROCESS_TIME_STDEV;
-    bool DROP_LAST;
+    bool  DROP_LAST;
 
     // Train
     uint32_t EPOCHS;

From 55a8b55c9826ec732eb92eb65a00c105e6fb7e3d Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Tue, 9 Jul 2024 20:09:33 +0200
Subject: [PATCH 10/37] Correct where the csv file is created

---
 dlio/utils.c   | 3 ++-
 src/h5bench.py | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/dlio/utils.c b/dlio/utils.c
index f5e3d768..ede4fea4 100644
--- a/dlio/utils.c
+++ b/dlio/utils.c
@@ -44,6 +44,7 @@ config_datatype_t config = {
     .COLLECTIVE_META   = false,
     .COLLECTIVE_DATA   = false,
     .SUBFILING         = false,
+//    .ASYNC_MODE        = false,
 
     // Reader
     //    .DATA_LOADER = PYTORCH,
@@ -297,7 +298,7 @@ parse_args(int argc, char *argv[])
 //        else if (strcmp(argv[i], "--async-mode") == 0) {
 //            config.ASYNC_MODE = true;
 //        }
-        else if (strcmp(argv[i], "--output") == 0) {
+        else if (strcmp(argv[i], "--csv-file") == 0) {
             i++;
             config.CSV_FILE = argv[i];
         }
diff --git a/src/h5bench.py b/src/h5bench.py
index ac65536f..a346b2d2 100755
--- a/src/h5bench.py
+++ b/src/h5bench.py
@@ -988,6 +988,10 @@ def run_dlio(self, id, setup):
                 if key in parameters_binary:
                     if configuration[key].lower() == 'true':
                         parameters.append('--{} '.format(key))
+
+                # Make sure the CSV file is generated in the temporary path
+                elif key == 'csv-file':
+                    parameters.append('--{} {} '.format(key, '{}/{}/{}'.format(self.directory, id, configuration[key])))
                 else:
                     parameters.append('--{} {} '.format(key, configuration[key]))
 

From 9d80ffc0bfa6f60187992f4226039f238b0d3542 Mon Sep 17 00:00:00 2001
From: github-actions <github-actions[bot]@users.noreply.github.com>
Date: Tue, 9 Jul 2024 18:10:57 +0000
Subject: [PATCH 11/37] Committing clang-format changes

---
 dlio/utils.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dlio/utils.c b/dlio/utils.c
index 06c64e39..15ddd45b 100644
--- a/dlio/utils.c
+++ b/dlio/utils.c
@@ -44,7 +44,7 @@ config_datatype_t config = {
     .COLLECTIVE_META   = false,
     .COLLECTIVE_DATA   = false,
     .SUBFILING         = false,
-//    .ASYNC_MODE        = false,
+    //    .ASYNC_MODE        = false,
 
     // Reader
     //    .DATA_LOADER = PYTORCH,
@@ -295,9 +295,9 @@ parse_args(int argc, char *argv[])
         else if (strcmp(argv[i], "--drop-last") == 0) {
             config.DROP_LAST = true;
         }
-//        else if (strcmp(argv[i], "--async-mode") == 0) {
-//            config.ASYNC_MODE = true;
-//        }
+        //        else if (strcmp(argv[i], "--async-mode") == 0) {
+        //            config.ASYNC_MODE = true;
+        //        }
         else if (strcmp(argv[i], "--csv-file") == 0) {
             i++;
             config.CSV_FILE = argv[i];

From 923af9488197528f20480cb550952e0808108a3d Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Tue, 9 Jul 2024 21:03:30 +0200
Subject: [PATCH 12/37] Attempt number 1 to fix CI/CD

---
 dlio/h5bench_dlio.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dlio/h5bench_dlio.c b/dlio/h5bench_dlio.c
index 3e5a3a99..08bfd435 100644
--- a/dlio/h5bench_dlio.c
+++ b/dlio/h5bench_dlio.c
@@ -608,7 +608,9 @@ init_global_variables()
     if (config.SUBFILING) {
         if (MY_RANK == 0)
             printf("Using Subfiling VFD\n");
+#ifdef HAVE_SUBFILING
         H5Pset_fapl_subfiling(FAPL, NULL);
+#endif
         if (config.COLLECTIVE_DATA) {
             if (MY_RANK == 0)
                 printf("Warning: Collective mode can't be used with subfiling\n");

From 78f4e9a1cbd3ca7a9bdc05f2e9f83e000418aed6 Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Tue, 9 Jul 2024 21:36:42 +0200
Subject: [PATCH 13/37] Fix CI/CD (Attempt number 2)

---
 .gitlab-ci.yml          |  3 +++
 src/h5bench.py          |  1 +
 tests/CMakeLists.txt    |  8 ++++++++
 tests/test_sync_dlio.py | 33 +++++++++++++++++++++++++++++++++
 4 files changed, 45 insertions(+)
 create mode 100644 tests/test_sync_dlio.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 2360447c..55b39a87 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -80,6 +80,9 @@ test-perlmutter:
         - echo "Test h5bench SYNC openpmd"
         - python3 ../../samples/update-perlmutter.py ../../samples/sync-openpmd.json
         - ./h5bench --debug --abort-on-failure ../../samples/sync-openpmd.json
+        - echo "Test h5bench SYNC dlio"
+        - python3 ../../samples/update-perlmutter.py ../../samples/sync-dlio.json
+        - ./h5bench --debug --abort-on-failure ../../samples/sync-dlio.json
 
 build-perlmutter-metrics:
     stage: build
diff --git a/src/h5bench.py b/src/h5bench.py
index a346b2d2..0b4b98bd 100755
--- a/src/h5bench.py
+++ b/src/h5bench.py
@@ -1039,6 +1039,7 @@ def run_dlio(self, id, setup):
 
             sys.exit(os.EX_SOFTWARE)
 
+
 def main():
     PARSER = argparse.ArgumentParser(
         description='H5bench: a Parallel I/O Benchmark Suite for HDF5: '
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index acb5c416..4d8bb4f3 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -52,4 +52,12 @@ if(Python3_Interpreter_FOUND)
 			WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
 		)
 	endif()
+
+	if(H5BENCH_DLIO)
+		add_test(
+				NAME "h5bench-sync-dlio"
+				COMMAND Python3::Interpreter -m pytest --verbose --rootdir ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/test_sync_dlio.py
+				WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+		)
+	endif()
 endif()
\ No newline at end of file
diff --git a/tests/test_sync_dlio.py b/tests/test_sync_dlio.py
new file mode 100644
index 00000000..b3f6f346
--- /dev/null
+++ b/tests/test_sync_dlio.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+
+import os
+import glob
+import pytest
+
+from src import h5bench
+
+DEBUG = True
+ABORT = True
+VALIDATE = True
+
+BINARY = 'h5bench_dlio'
+
+samples = glob.glob('sync-dlio*.json')
+
+@pytest.mark.parametrize('configuration', samples)
+@pytest.mark.skipif(
+    os.path.isfile(BINARY) == False,
+    reason="DLIO is disabled"
+)
+def test_benchmark(configuration):
+    assert os.path.isfile(configuration) is True
+
+    benchmark = h5bench.H5bench(
+        configuration,
+        None,
+        DEBUG,
+        ABORT,
+        VALIDATE
+    )
+
+    benchmark.run()

From 6394c0c4f0f000b269108bf231750de7cce017de Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Tue, 9 Jul 2024 22:07:35 +0200
Subject: [PATCH 14/37] Fix CI/CD (Attempt number 3)

---
 workflows/h5bench-hdf5-1.14.0.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/workflows/h5bench-hdf5-1.14.0.yml b/workflows/h5bench-hdf5-1.14.0.yml
index 4d26a71c..435d0efc 100644
--- a/workflows/h5bench-hdf5-1.14.0.yml
+++ b/workflows/h5bench-hdf5-1.14.0.yml
@@ -541,3 +541,8 @@ jobs:
           name: test
           path: build*/storage/**/std*
           retention-days: 1
+
+      - name: Test h5bench SYNC dlio
+        run: |
+          cd build-sync
+          ./h5bench --debug --abort-on-failure ../samples/sync-dlio.json

From 6c9124bc55cbc1eb80e37f95c5a572d5e6acebe6 Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Wed, 10 Jul 2024 00:41:43 +0200
Subject: [PATCH 15/37] Fix CI/CD (Attempt number 4)

---
 .github/workflows/h5bench-hdf5-1.10.4.yml  |  7 +++++++
 .github/workflows/h5bench-hdf5-1.10.7.yml  |  7 +++++++
 .github/workflows/h5bench-hdf5-1.10.8.yml  |  7 +++++++
 .github/workflows/h5bench-hdf5-1.12.0.yml  |  7 +++++++
 .github/workflows/h5bench-hdf5-1.14.0.yml  |  7 +++++++
 .github/workflows/h5bench-hdf5-1.14.1.yml  |  7 +++++++
 .github/workflows/h5bench-hdf5-develop.yml | 11 +++++++++++
 dlio/h5bench_dlio.c                        |  2 +-
 samples/sync-dlio.json                     |  4 ++--
 src/h5bench.py                             |  4 +++-
 workflows/h5bench-hdf5-1.14.0.yml          | 10 +++++-----
 11 files changed, 64 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/h5bench-hdf5-1.10.4.yml b/.github/workflows/h5bench-hdf5-1.10.4.yml
index 712eb4e9..03610de3 100644
--- a/.github/workflows/h5bench-hdf5-1.10.4.yml
+++ b/.github/workflows/h5bench-hdf5-1.10.4.yml
@@ -217,6 +217,13 @@ jobs:
           cd build
           ./h5bench --debug --abort-on-failure ../samples/sync-macsio.json
 
+      - name: Test h5bench SYNC dlio
+        run: |
+          export LD_LIBRARY_PATH=/opt/hdf5/lib:$LD_LIBRARY_PATH
+          
+          cd build-sync
+          ./h5bench --debug --abort-on-failure ../samples/sync-dlio.json
+
       - name: Setup tmate session
         if: ${{ failure() }}
         uses: mxschmitt/action-tmate@v3
diff --git a/.github/workflows/h5bench-hdf5-1.10.7.yml b/.github/workflows/h5bench-hdf5-1.10.7.yml
index 8834b81b..c51a31f0 100644
--- a/.github/workflows/h5bench-hdf5-1.10.7.yml
+++ b/.github/workflows/h5bench-hdf5-1.10.7.yml
@@ -217,6 +217,13 @@ jobs:
           cd build
           ./h5bench --debug --abort-on-failure ../samples/sync-macsio.json
 
+      - name: Test h5bench SYNC dlio
+        run: |
+          export LD_LIBRARY_PATH=/opt/hdf5/lib:$LD_LIBRARY_PATH
+          
+          cd build-sync
+          ./h5bench --debug --abort-on-failure ../samples/sync-dlio.json
+
       - name: Setup tmate session
         if: ${{ failure() }}
         uses: mxschmitt/action-tmate@v3
diff --git a/.github/workflows/h5bench-hdf5-1.10.8.yml b/.github/workflows/h5bench-hdf5-1.10.8.yml
index 3c2fc5f1..697cb037 100644
--- a/.github/workflows/h5bench-hdf5-1.10.8.yml
+++ b/.github/workflows/h5bench-hdf5-1.10.8.yml
@@ -217,6 +217,13 @@ jobs:
           cd build
           ./h5bench --debug --abort-on-failure ../samples/sync-macsio.json
 
+      - name: Test h5bench SYNC dlio
+        run: |
+          export LD_LIBRARY_PATH=/opt/hdf5/lib:$LD_LIBRARY_PATH
+          
+          cd build-sync
+          ./h5bench --debug --abort-on-failure ../samples/sync-dlio.json
+
       - name: Setup tmate session
         if: ${{ failure() }}
         uses: mxschmitt/action-tmate@v3
diff --git a/.github/workflows/h5bench-hdf5-1.12.0.yml b/.github/workflows/h5bench-hdf5-1.12.0.yml
index d194ae41..1584e39d 100644
--- a/.github/workflows/h5bench-hdf5-1.12.0.yml
+++ b/.github/workflows/h5bench-hdf5-1.12.0.yml
@@ -252,6 +252,13 @@ jobs:
           cd build
           ./h5bench --debug --abort-on-failure ../samples/sync-macsio.json
 
+      - name: Test h5bench SYNC dlio
+        run: |
+          export LD_LIBRARY_PATH=/opt/hdf5/lib:$LD_LIBRARY_PATH
+          
+          cd build-sync
+          ./h5bench --debug --abort-on-failure ../samples/sync-dlio.json
+
       - name: Setup tmate session
         if: ${{ failure() }}
         uses: mxschmitt/action-tmate@v3
diff --git a/.github/workflows/h5bench-hdf5-1.14.0.yml b/.github/workflows/h5bench-hdf5-1.14.0.yml
index b3be7393..361961cf 100644
--- a/.github/workflows/h5bench-hdf5-1.14.0.yml
+++ b/.github/workflows/h5bench-hdf5-1.14.0.yml
@@ -284,6 +284,13 @@ jobs:
           cd build-sync
           ./h5bench --debug --abort-on-failure ../samples/sync-macsio.json
 
+      - name: Test h5bench SYNC dlio
+        run: |
+          export LD_LIBRARY_PATH=/opt/hdf5/lib:$LD_LIBRARY_PATH
+          
+          cd build-sync
+          ./h5bench --debug --abort-on-failure ../samples/sync-dlio.json
+
       - name: Test h5bench ASYNC write/read
         run: |
           export HDF5_DIR=/opt/hdf5
diff --git a/.github/workflows/h5bench-hdf5-1.14.1.yml b/.github/workflows/h5bench-hdf5-1.14.1.yml
index 2de7aceb..c0e46e2e 100644
--- a/.github/workflows/h5bench-hdf5-1.14.1.yml
+++ b/.github/workflows/h5bench-hdf5-1.14.1.yml
@@ -284,6 +284,13 @@ jobs:
           cd build-sync
           ./h5bench --debug --abort-on-failure ../samples/sync-macsio.json
 
+      - name: Test h5bench SYNC dlio
+        run: |
+          export LD_LIBRARY_PATH=/opt/hdf5/lib:$LD_LIBRARY_PATH
+          
+          cd build-sync
+          ./h5bench --debug --abort-on-failure ../samples/sync-dlio.json
+
       - name: Test h5bench ASYNC write/read
         run: |
           export HDF5_DIR=/opt/hdf5
diff --git a/.github/workflows/h5bench-hdf5-develop.yml b/.github/workflows/h5bench-hdf5-develop.yml
index b68c92e2..2d6cd0ac 100644
--- a/.github/workflows/h5bench-hdf5-develop.yml
+++ b/.github/workflows/h5bench-hdf5-develop.yml
@@ -416,6 +416,17 @@ jobs:
           cd build-sync
           ./h5bench --debug --abort-on-failure ../samples/sync-macsio.json
 
+      - name: Test h5bench SYNC dlio
+        run: |
+          export HDF5_HOME=/opt/hdf5
+          export ABT_HOME=/opt/argobots
+          export ASYNC_HOME=/opt/vol-async
+
+          export LD_LIBRARY_PATH=$ASYNC_HOME/lib:$HDF5_HOME/lib:$ABT_HOME/lib:$LD_LIBRARY_PATH
+          
+          cd build-sync
+          ./h5bench --debug --abort-on-failure ../samples/sync-dlio.json
+
       - name: Test h5bench ASYNC write/read
         run: |
           export HDF5_DIR=/opt/hdf5
diff --git a/dlio/h5bench_dlio.c b/dlio/h5bench_dlio.c
index 08bfd435..279316e8 100644
--- a/dlio/h5bench_dlio.c
+++ b/dlio/h5bench_dlio.c
@@ -670,7 +670,7 @@ init_global_variables()
     }
 #endif
 
-    if (MY_RANK == 0) {
+    if ((MY_RANK == 0) && config.DO_TRAIN) {
         printf("The number of training batches per rank: %u\n", config.NUM_TRAIN_BATCHES_PER_RANK);
         if (config.READ_THREADS > config.NUM_TRAIN_BATCHES_PER_RANK) {
             printf("Warning: The number of requested read threads (%u) is greater than the number of "
diff --git a/samples/sync-dlio.json b/samples/sync-dlio.json
index c50f3a5a..bdde505c 100644
--- a/samples/sync-dlio.json
+++ b/samples/sync-dlio.json
@@ -20,7 +20,7 @@
         "compression": "true",
         "record-length": "67108864",
         "num-files-train": "64",
-        "num-files-eval": "8",
+        "num-files-eval": "16",
         "num-samples-per-file": "4",
         "data-folder": "data",
         "file-prefix": "img",
@@ -42,7 +42,7 @@
         "seed-change-epoch": "true",
         "record-length": "67108864",
         "num-files-train": "64",
-        "num-files-eval": "8",
+        "num-files-eval": "16",
         "num-samples-per-file": "4",
         "data-folder": "./data",
         "file-prefix": "img",
diff --git a/src/h5bench.py b/src/h5bench.py
index 0b4b98bd..69c8f255 100755
--- a/src/h5bench.py
+++ b/src/h5bench.py
@@ -989,9 +989,11 @@ def run_dlio(self, id, setup):
                     if configuration[key].lower() == 'true':
                         parameters.append('--{} '.format(key))
 
-                # Make sure the CSV file is generated in the temporary path
+                # Make sure the CSV file and datasets are generated in the temporary path
                 elif key == 'csv-file':
                     parameters.append('--{} {} '.format(key, '{}/{}/{}'.format(self.directory, id, configuration[key])))
+                elif key == 'data-folder':
+                    parameters.append('--{} {} '.format(key, '{}/{}'.format(self.directory, configuration[key])))
                 else:
                     parameters.append('--{} {} '.format(key, configuration[key]))
 
diff --git a/workflows/h5bench-hdf5-1.14.0.yml b/workflows/h5bench-hdf5-1.14.0.yml
index 435d0efc..1d15a38a 100644
--- a/workflows/h5bench-hdf5-1.14.0.yml
+++ b/workflows/h5bench-hdf5-1.14.0.yml
@@ -243,6 +243,11 @@ jobs:
           cd build-sync
           ./h5bench --debug --abort-on-failure ../samples/sync-e3sm.json
 
+      - name: Test h5bench SYNC dlio
+        run: |
+          cd build-sync
+          ./h5bench --debug --abort-on-failure ../samples/sync-dlio.json
+          
       - name: Test h5bench ASYNC write/read
         run: |
           current="$PWD"
@@ -541,8 +546,3 @@ jobs:
           name: test
           path: build*/storage/**/std*
           retention-days: 1
-
-      - name: Test h5bench SYNC dlio
-        run: |
-          cd build-sync
-          ./h5bench --debug --abort-on-failure ../samples/sync-dlio.json

From 3b2675bbdc5edfad49a88f99565605df7248c9f8 Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Wed, 10 Jul 2024 00:42:57 +0200
Subject: [PATCH 16/37] Delete the Async VOL references in the code

---
 dlio/utils.c | 4 ----
 dlio/utils.h | 1 -
 2 files changed, 5 deletions(-)

diff --git a/dlio/utils.c b/dlio/utils.c
index 15ddd45b..dfe185df 100644
--- a/dlio/utils.c
+++ b/dlio/utils.c
@@ -44,7 +44,6 @@ config_datatype_t config = {
     .COLLECTIVE_META   = false,
     .COLLECTIVE_DATA   = false,
     .SUBFILING         = false,
-    //    .ASYNC_MODE        = false,
 
     // Reader
     //    .DATA_LOADER = PYTORCH,
@@ -295,9 +294,6 @@ parse_args(int argc, char *argv[])
         else if (strcmp(argv[i], "--drop-last") == 0) {
             config.DROP_LAST = true;
         }
-        //        else if (strcmp(argv[i], "--async-mode") == 0) {
-        //            config.ASYNC_MODE = true;
-        //        }
         else if (strcmp(argv[i], "--csv-file") == 0) {
             i++;
             config.CSV_FILE = argv[i];
diff --git a/dlio/utils.h b/dlio/utils.h
index 7c94cc0e..1aab60df 100644
--- a/dlio/utils.h
+++ b/dlio/utils.h
@@ -32,7 +32,6 @@ typedef struct config_datatype {
     bool     COLLECTIVE_META;
     bool     COLLECTIVE_DATA;
     bool     SUBFILING;
-    //    bool     ASYNC_MODE;
 
     // Reader
     //    DATA_LOADER;

From 15e697bed6094bd1aab6b0aa17b9af4dde4b11fa Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Wed, 10 Jul 2024 01:04:57 +0200
Subject: [PATCH 17/37] Fix CI/CD (Attempt number 5)

---
 .github/workflows/h5bench-hdf5-1.10.4.yml | 2 +-
 .github/workflows/h5bench-hdf5-1.10.7.yml | 2 +-
 .github/workflows/h5bench-hdf5-1.10.8.yml | 2 +-
 .github/workflows/h5bench-hdf5-1.12.0.yml | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/h5bench-hdf5-1.10.4.yml b/.github/workflows/h5bench-hdf5-1.10.4.yml
index 03610de3..25ae373e 100644
--- a/.github/workflows/h5bench-hdf5-1.10.4.yml
+++ b/.github/workflows/h5bench-hdf5-1.10.4.yml
@@ -221,7 +221,7 @@ jobs:
         run: |
           export LD_LIBRARY_PATH=/opt/hdf5/lib:$LD_LIBRARY_PATH
           
-          cd build-sync
+          cd build
           ./h5bench --debug --abort-on-failure ../samples/sync-dlio.json
 
       - name: Setup tmate session
diff --git a/.github/workflows/h5bench-hdf5-1.10.7.yml b/.github/workflows/h5bench-hdf5-1.10.7.yml
index c51a31f0..68bee711 100644
--- a/.github/workflows/h5bench-hdf5-1.10.7.yml
+++ b/.github/workflows/h5bench-hdf5-1.10.7.yml
@@ -221,7 +221,7 @@ jobs:
         run: |
           export LD_LIBRARY_PATH=/opt/hdf5/lib:$LD_LIBRARY_PATH
           
-          cd build-sync
+          cd build
           ./h5bench --debug --abort-on-failure ../samples/sync-dlio.json
 
       - name: Setup tmate session
diff --git a/.github/workflows/h5bench-hdf5-1.10.8.yml b/.github/workflows/h5bench-hdf5-1.10.8.yml
index 697cb037..4df604ec 100644
--- a/.github/workflows/h5bench-hdf5-1.10.8.yml
+++ b/.github/workflows/h5bench-hdf5-1.10.8.yml
@@ -221,7 +221,7 @@ jobs:
         run: |
           export LD_LIBRARY_PATH=/opt/hdf5/lib:$LD_LIBRARY_PATH
           
-          cd build-sync
+          cd build
           ./h5bench --debug --abort-on-failure ../samples/sync-dlio.json
 
       - name: Setup tmate session
diff --git a/.github/workflows/h5bench-hdf5-1.12.0.yml b/.github/workflows/h5bench-hdf5-1.12.0.yml
index 1584e39d..a445af9e 100644
--- a/.github/workflows/h5bench-hdf5-1.12.0.yml
+++ b/.github/workflows/h5bench-hdf5-1.12.0.yml
@@ -256,7 +256,7 @@ jobs:
         run: |
           export LD_LIBRARY_PATH=/opt/hdf5/lib:$LD_LIBRARY_PATH
           
-          cd build-sync
+          cd build
           ./h5bench --debug --abort-on-failure ../samples/sync-dlio.json
 
       - name: Setup tmate session

From 11c793050ccbd911b38c8a074f8eb9b61c2dd50c Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Wed, 10 Jul 2024 15:27:06 +0200
Subject: [PATCH 18/37] Fix CI/CD (Attempt number 6)

---
 samples/sync-dlio.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/samples/sync-dlio.json b/samples/sync-dlio.json
index bdde505c..c0c0ccc7 100644
--- a/samples/sync-dlio.json
+++ b/samples/sync-dlio.json
@@ -1,7 +1,7 @@
 {
   "mpi": {
-    "command": "srun",
-    "ranks": "8"
+    "command": "mpirun",
+    "ranks": "24"
   },
   "vol": {
 

From 3a2e7d7181e249fab73f18f23cc23b0c032ce15b Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Wed, 10 Jul 2024 15:56:18 +0200
Subject: [PATCH 19/37] Fix CI/CD (Attempt number 7)

---
 samples/sync-dlio.json | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/samples/sync-dlio.json b/samples/sync-dlio.json
index c0c0ccc7..5569944c 100644
--- a/samples/sync-dlio.json
+++ b/samples/sync-dlio.json
@@ -1,7 +1,8 @@
 {
   "mpi": {
     "command": "mpirun",
-    "ranks": "24"
+    "ranks": "4",
+    "configuration": "--allow-run-as-root --oversubscribe -np 24"
   },
   "vol": {
 
@@ -19,8 +20,8 @@
         "keep-files": "true",
         "compression": "true",
         "record-length": "67108864",
-        "num-files-train": "64",
-        "num-files-eval": "16",
+        "num-files-train": "32",
+        "num-files-eval": "8",
         "num-samples-per-file": "4",
         "data-folder": "data",
         "file-prefix": "img",

From 8fc0dd28da035b951d79e060db6c2ec092bff517 Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Wed, 10 Jul 2024 17:06:56 +0200
Subject: [PATCH 20/37] Fix CI/CD (Attempt number 8)

---
 samples/sync-dlio.json | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/samples/sync-dlio.json b/samples/sync-dlio.json
index 5569944c..894eefd1 100644
--- a/samples/sync-dlio.json
+++ b/samples/sync-dlio.json
@@ -2,7 +2,7 @@
   "mpi": {
     "command": "mpirun",
     "ranks": "4",
-    "configuration": "--allow-run-as-root --oversubscribe -np 24"
+    "configuration": "--allow-run-as-root --oversubscribe -np 4"
   },
   "vol": {
 
@@ -20,8 +20,8 @@
         "keep-files": "true",
         "compression": "true",
         "record-length": "67108864",
-        "num-files-train": "32",
-        "num-files-eval": "8",
+        "num-files-train": "8",
+        "num-files-eval": "2",
         "num-samples-per-file": "4",
         "data-folder": "data",
         "file-prefix": "img",
@@ -42,8 +42,8 @@
         "shuffle": "true",
         "seed-change-epoch": "true",
         "record-length": "67108864",
-        "num-files-train": "64",
-        "num-files-eval": "16",
+        "num-files-train": "8",
+        "num-files-eval": "2",
         "num-samples-per-file": "4",
         "data-folder": "./data",
         "file-prefix": "img",

From cb3994a97b383f5cccb419b93dc9d0eb992a9259 Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Wed, 10 Jul 2024 21:23:34 +0200
Subject: [PATCH 21/37] Add the readthedocs documentation page

---
 docs/source/buildinstructions.rst |   3 +-
 docs/source/dlio.rst              | 124 ++++++++++++++++++++++++++++++
 docs/source/index.rst             |   4 +-
 3 files changed, 129 insertions(+), 2 deletions(-)
 create mode 100644 docs/source/dlio.rst

diff --git a/docs/source/buildinstructions.rst b/docs/source/buildinstructions.rst
index 9cebf08c..a6af0d2c 100644
--- a/docs/source/buildinstructions.rst
+++ b/docs/source/buildinstructions.rst
@@ -104,7 +104,8 @@ Exerciser            ``h5bench_exerciser``       ``-DH5BENCH_EXERCISER=ON``
 OpenPMD (write)      ``h5bench_openpmd_write``   ``-DH5BENCH_OPENPMD=ON``
 OpenPMD (read)       ``h5bench_openpmd_read``    ``-DH5BENCH_OPENPMD=ON``
 E3SM-IO              ``h5bench_e3sm``            ``-DH5BENCH_E3SM=ON`` 
-MACSio               ``h5bench_macsio``          ``-DH5BENCH_MACSIO=ON`` 
+MACSio               ``h5bench_macsio``          ``-DH5BENCH_MACSIO=ON``
+DLIO                 ``h5bench_dlio``            ``-DH5BENCH_DLIO=ON``
 ==================== =========================== ===============================
 
 .. warning::
diff --git a/docs/source/dlio.rst b/docs/source/dlio.rst
new file mode 100644
index 00000000..d80190be
--- /dev/null
+++ b/docs/source/dlio.rst
@@ -0,0 +1,124 @@
+DLIO
+====
+
+The benchmark is designed to measure the performance of training and evaluating deep learning models on data stored
+as HDF5 files. Based on I/O patterns collected and analysed from `DLIO Benchmark <https://github.com/argonne-lcf/dlio_benchmark>`_,
+this benchmark simulates the training and evaluation of deep learning models with the PyTorch and TensorFlow
+frameworks, while gathering valuable information about system performance.
+
+Configuration
+-------------
+
+As with the other extensions, the following parameters should be specified in the configuration section of the JSON file to configure the benchmark:
+
+========================== ===================================================================== ======== ==============
+**Parameter**              **Description**                                                       **Type** **Default**
+========================== ===================================================================== ======== ==============
+``generate-data``          Enable generation of benchmarking data                                bool     false
+``train``                  Enable model training simulation                                      bool     false
+``evaluation``             Enable model evaluation simulation                                    bool     false
+``record-length``          Record size of a single sample in bytes                               int      67108864
+``num-files-train``        The number of files used to train the model                           int      64
+``num-files-eval``         The number of files used to evaluate the model                        int      8
+``num-samples-per-file``   The number of samples in each file                                    int      4
+``data-folder``            Name of the directory storing the benchmark data                      string   ./data
+``file-prefix``            Prefix in the name of files containing training and evaluation data   string   img
+``chunking``               Enable chunking                                                       bool     false
+``chunk-size``             Chunk size                                                            int      1024
+``keep-files``             Does not delete data after the benchmark is finished                  bool     false
+``compression``            Enable compression                                                    bool     false
+``compression-level``      Compression level from 1 to 9                                         int      4
+``batch-size``             Training batch size                                                   int      7
+``batch-size-eval``        Evaluation batch size                                                 int      2
+``shuffle``                Enable samples shuffle                                                bool     false
+``preprocess-time``        Preprocessing time after reading each sample in seconds               float    0.0
+``preprocess-time-stdev``  Standard deviation in preprocessing time in seconds                   float    0.0
+``epochs``                 The number of epochs                                                  int      5
+``computation-time``       Computation time after reading each batch in seconds                  float    0.323
+``computation-time-stdev`` Standard deviation in computation time in seconds                     float    0.0
+``random-seed``            Random seed to be used                                                int      42
+``eval-time``              Evaluation time after reading each batch in seconds                   float    0.323
+``eval-time-stdev``        Standard deviation in evaluation time in seconds                      float    0.0
+``epochs-between-evals``   The number of epochs between evaluations                              int      1
+``train-data-folder``      Name of the directory containing the training data                    string   train
+``valid-data-folder``      Name of the directory containing the validation data                  string   valid
+``records-dataset-name``   Name of the dataset with records                                      string   records
+``labels-dataset-name``    Name of the dataset with labels                                       string   labels
+``seed-change-epoch``      Enable seed changes every epoch                                       bool     false
+``read-threads``           The number of workers used to read the data                           int      4
+``collective-meta``        Enable collective HDF5 metadata operations                            bool     false
+``collective-data``        Enable collective HDF5 data operations                                bool     false
+``subfiling``              Enable HDF5 Subfiling Virtual File Driver                             bool     false
+``csv-file``               Name of the output csv file                                           string   output.csv
+========================== ===================================================================== ======== ==============
+
+It should be noted that for each parameter there is a default value that applies if the parameter has not been specified
+in the configuration file. Thus, by default the benchmark will not run because the generate-data, train and evaluation
+parameters are false. A sample configuration file can be found in the ``samples/`` directory.
+
+Understanding the output
+------------------------
+The sample output of the benchmark is as follows:
+
+.. code-block::
+
+    =================== Performance Results ==================
+    Total number of ranks: 8
+    The number of read threads per rank: 0
+    Total training set size: 7.000 GB
+    Training set size per rank: 896.000 MB
+    Total training emulated compute time: 3.229 s
+    Training metadata time: 2.808 s
+    Training raw read time: 30.905 s
+    Training average raw read rate: 145.141 MB/s
+    Observed training completion time: 37.432 s
+    Observed average training rate: 131.044 MB/s
+    Training average throughput: 1.871 samples/s
+    Training throughput standard deviation: 0.037 samples/s
+    Training average IO: 119.729 MB/s
+    Training IO standard deviation: 2.379 MB/s
+    Total evaluation set size: 7.000 GB
+    Evaluation set size per rank: 896.000 MB
+    Total evaluation emulated compute time: 3.206 s
+    Evaluation metadata time: 2.805 s
+    Evaluation raw read time: 31.699 s
+    Evaluation average raw read rate: 141.906 MB/s
+    Observed evaluation completion time: 38.424 s
+    Observed average evaluation rate: 127.595 MB/s
+    Evaluation average throughput avg: 1.826 samples/s
+    Evaluation throughput standard deviation: 0.090 samples/s
+    Evaluation average IO: 116.883 MB/s
+    Evaluation IO standard deviation: 5.735 MB/s
+    ===========================================================
+
+Let's take a closer look at it. First, the number of MPI ranks and the number of read processes per MPI rank used in
+the simulation are reported. The training and evaluation phases are then described by the same set of metrics, so to
+avoid repetition only the first half of the results, concerning the training process, is explained here. Total
+training set size is calculated as the size of all HDF5 files used for training. Accordingly, the training set size per
+rank gives an idea of how much of the load is handled by a single MPI rank. Total training emulated compute time is
+the time spent on compute emulation summed over all epochs. Training metadata time and training raw read time are
+totals over all epochs as well; note that the two are measured separately (they are not interleaved): the former is
+the execution time of ``H5Fopen``, ``H5Dget_space``, ``H5Screate_simple`` and ``H5Sclose``, the latter that of ``H5Dread``.
+Training average raw read rate is calculated as training set size per rank divided by training raw read time. Observed
+training completion time includes all the time spent on the training process, including, among other things, resource
+allocation and computation simulation. Observed average training rate is equal to training set size per rank divided by
+the difference between observed training completion time and total training emulated compute time, thus showing the
+data reading rate without taking into account emulation costs. Training average throughput and training throughput
+standard deviation indicate the number of samples from the training dataset processed in one second. Training
+average IO and Training IO standard deviation translate these values into bytes/second by multiplying by the size of
+one sample.
+
+Future work
+-----------
+
+There are plans to add more configuration options for the extension in the future to increase its flexibility:
+
+* Add settings for Subfiling VFD. Currently, the default settings are used.
+* Add more features from `DLIO Benchmark <https://github.com/argonne-lcf/dlio_benchmark>`_ such as resizable records.
+* Analyze and add support for other ML frameworks and data loaders, for example DALI.
+* Add support for prefetching.
+* Expand the ability to randomly shuffle samples. At the moment, it is not possible to shuffle only samples in each file without changing the order of the files for training.
+* Add more compression filters and thus support different compression algorithms for HDF5 data.
+* Add support for drop_last customization. Currently, by default, all batches left after MPI ranks distribution are not processed.
+* Replace the use of ``fork()`` with ``MPI_Comm_spawn()`` when creating new processes, as using ``fork()`` with MPI may be unsafe
+* Test support for the Cache VOL connector.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index bb3d56a0..ca8e47df 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -35,7 +35,8 @@ Exerciser            ``h5bench_exerciser``       |:white_check_mark:| |:white_la
 OpenPMD (write)      ``h5bench_openpmd_write``   |:white_check_mark:| |:white_large_square:|   |:white_large_square:|   |:white_large_square:| 
 OpenPMD (read)       ``h5bench_openpmd_read``    |:white_check_mark:| |:white_large_square:|   |:white_large_square:|   |:white_large_square:| 
 E3SM-IO              ``h5bench_e3sm``            |:white_check_mark:| |:white_large_square:|   |:white_large_square:|   |:white_check_mark:| 
-MACSio               ``h5bench_macsio``          |:white_check_mark:| |:white_large_square:|   |:white_large_square:|   |:white_check_mark:| 
+MACSio               ``h5bench_macsio``          |:white_check_mark:| |:white_large_square:|   |:white_large_square:|   |:white_check_mark:|
+DLIO                 ``h5bench_dlio``            |:white_check_mark:| |:white_large_square:|   |:white_large_square:|   |:white_large_square:|
 ==================== =========================== ==================== ======================== ======================== ========================
 
 .. toctree::
@@ -56,6 +57,7 @@ MACSio               ``h5bench_macsio``          |:white_check_mark:| |:white_la
    openpmd
    e3sm
    macsio
+   dlio
 
 .. toctree::
    :maxdepth: 2

From 92c3c57fbfbf2e77a2890953139930b7097fcf98 Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Wed, 10 Jul 2024 21:38:21 +0200
Subject: [PATCH 22/37] Update the way of counting the number of batches per
 rank

---
 dlio/h5bench_dlio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dlio/h5bench_dlio.c b/dlio/h5bench_dlio.c
index 279316e8..152f09c7 100644
--- a/dlio/h5bench_dlio.c
+++ b/dlio/h5bench_dlio.c
@@ -581,9 +581,9 @@ init_global_variables()
     GENERATION_SIZE      = data_length > GENERATION_BUFFER_SIZE ? GENERATION_BUFFER_SIZE : data_length;
 
     config.NUM_TRAIN_BATCHES_PER_RANK =
-        config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / NUM_RANKS / config.BATCH_SIZE;
+        config.NUM_FILES_TRAIN * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE / NUM_RANKS;
     config.NUM_EVAL_BATCHES_PER_RANK =
-        config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / NUM_RANKS / config.BATCH_SIZE_EVAL;
+        config.NUM_FILES_EVAL * config.NUM_SAMPLES_PER_FILE / config.BATCH_SIZE_EVAL / NUM_RANKS;
 
     config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN = config.READ_THREADS > config.NUM_TRAIN_BATCHES_PER_RANK
                                                       ? config.NUM_TRAIN_BATCHES_PER_RANK

From 96facfca2b83aa8cb2296f2a3e5f8161ed83a734 Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Thu, 18 Jul 2024 20:42:32 +0200
Subject: [PATCH 23/37] Add the ability to output information about all ranks

---
 dlio/README.md         |  78 +++----
 dlio/h5bench_dlio.c    |  11 +-
 dlio/stats.c           | 453 +++++++++++++++++++++++++++++++++++++++--
 dlio/stats.h           |   4 +-
 dlio/utils.c           |  17 +-
 dlio/utils.h           |   4 +-
 samples/sync-dlio.json |  10 +-
 src/h5bench.py         |   7 +-
 8 files changed, 518 insertions(+), 66 deletions(-)

diff --git a/dlio/README.md b/dlio/README.md
index 3b08d3ab..9e09f73b 100644
--- a/dlio/README.md
+++ b/dlio/README.md
@@ -9,44 +9,45 @@ frameworks, while gathering valuable information about system performance.
 As in the case with other extensions, the following parameters should be specified in the configuration section of the 
 json file to configure the benchmark:
 
-| Parameter              | Description                                                         | Type   | Default    |
-|------------------------|---------------------------------------------------------------------|--------|------------|
-| generate-data          | Enable generation of benchmarking data                              | bool   | false      |
-| train                  | Enable model training simulation                                    | bool   | false      |
-| evaluation             | Enable model evaluation simulation                                  | bool   | false      |
-| record-length          | Record size of a single sample in bytes                             | int    | 67108864   |
-| num-files-train        | The number of files used to train the model                         | int    | 64         |
-| num-files-eval         | The number of files used to evaluate the model                      | int    | 8          |
-| num-samples-per-file   | The number of samples in each file                                  | int    | 4          |
-| data-folder            | Name of the directory storing the benchmark data                    | string | ./data     |
-| file-prefix            | Prefix in the name of files containing training and evaluation data | string | img        |
-| chunking               | Enable chunking                                                     | bool   | false      |
-| chunk-size             | Chunk size                                                          | int    | 1024       |
-| keep-files             | Does not delete data after the benchmark is finished                | bool   | false      |
-| compression            | Enable compression                                                  | bool   | false      |
-| compression-level      | Compression level from 1 to 9                                       | int    | 4          |
-| batch-size             | Training batch size                                                 | int    | 7          |
-| batch-size-eval        | Evaluation batch size                                               | int    | 2          |
-| shuffle                | Enable samples shuffle                                              | bool   | false      |
-| preprocess-time        | Preprocessing time after reading each sample in seconds             | float  | 0.0        |
-| preprocess-time-stdev  | Standard deviation in preprocessing time in seconds                 | float  | 0.0        |
-| epochs                 | The number of epochs                                                | int    | 5          |
-| computation-time       | Computation time after reading each batch in seconds                | float  | 0.323      |
-| computation-time-stdev | Standard deviation in computation time in seconds                   | float  | 0.0        |
-| random-seed            | Random seed to be used                                              | int    | 42         |
-| eval-time              | Evaluation time after reading each batch in seconds                 | float  | 0.323      |
-| eval-time-stdev        | Standard deviation in evaluation time in seconds                    | float  | 0.0        |
-| epochs-between-evals   | The number of epochs between evaluations                            | int    | 1          |
-| train-data-folder      | Name of the directory containing the training data                  | string | train      |
-| valid-data-folder      | Name of the directory containing the validation data                | string | valid      |
-| records-dataset-name   | Name of the dataset with records                                    | string | records    |
-| labels-dataset-name    | Name of the dataset with labels                                     | string | labels     |
-| seed-change-epoch      | Enable seed changes every epoch                                     | bool   | false      |
-| read-threads           | The number of workers used to read the data                         | int    | 4          |
-| collective-meta        | Enable collective HDF5 metadata operations                          | bool   | false      |
-| collective-data        | Enable collective HDF5 data operations                              | bool   | false      |
-| subfiling              | Enable HDF5 Subfiling Virtual File Driver                           | bool   | false      |
-| csv-file               | Name of the output csv file                                         | string | output.csv |
+| Parameter              | Description                                                         | Type   | Default  |
+|------------------------|---------------------------------------------------------------------|--------|----------|
+| generate-data          | Enable generation of benchmarking data                              | bool   | false    |
+| train                  | Enable model training simulation                                    | bool   | false    |
+| evaluation             | Enable model evaluation simulation                                  | bool   | false    |
+| record-length          | Record size of a single sample in bytes                             | int    | 67108864 |
+| num-files-train        | The number of files used to train the model                         | int    | 32       |
+| num-files-eval         | The number of files used to evaluate the model                      | int    | 8        |
+| num-samples-per-file   | The number of samples in each file                                  | int    | 4        |
+| data-folder            | Name of the directory storing the benchmark data                    | string | ./data   |
+| file-prefix            | Prefix in the name of files containing training and evaluation data | string | img      |
+| chunking               | Enable chunking                                                     | bool   | false    |
+| chunk-size             | Chunk size                                                          | int    | 1024     |
+| keep-files             | Does not delete data after the benchmark is finished                | bool   | false    |
+| compression            | Enable compression                                                  | bool   | false    |
+| compression-level      | Compression level from 1 to 9                                       | int    | 4        |
+| batch-size             | Training batch size                                                 | int    | 7        |
+| batch-size-eval        | Evaluation batch size                                               | int    | 2        |
+| shuffle                | Enable samples shuffle                                              | bool   | false    |
+| preprocess-time        | Preprocessing time after reading each sample in seconds             | float  | 0.0      |
+| preprocess-time-stdev  | Standard deviation in preprocessing time in seconds                 | float  | 0.0      |
+| epochs                 | The number of epochs                                                | int    | 5        |
+| computation-time       | Computation time after reading each batch in seconds                | float  | 0.323    |
+| computation-time-stdev | Standard deviation in computation time in seconds                   | float  | 0.0      |
+| random-seed            | Random seed to be used                                              | int    | 42       |
+| eval-time              | Evaluation time after reading each batch in seconds                 | float  | 0.323    |
+| eval-time-stdev        | Standard deviation in evaluation time in seconds                    | float  | 0.0      |
+| epochs-between-evals   | The number of epochs between evaluations                            | int    | 1        |
+| train-data-folder      | Name of the directory containing the training data                  | string | train    |
+| valid-data-folder      | Name of the directory containing the validation data                | string | valid    |
+| records-dataset-name   | Name of the dataset with records                                    | string | records  |
+| labels-dataset-name    | Name of the dataset with labels                                     | string | labels   |
+| seed-change-epoch      | Enable seed changes every epoch                                     | bool   | false    |
+| read-threads           | The number of workers used to read the data                         | int    | 4        |
+| collective-meta        | Enable collective HDF5 metadata operations                          | bool   | false    |
+| collective-data        | Enable collective HDF5 data operations                              | bool   | false    |
+| subfiling              | Enable HDF5 Subfiling Virtual File Driver                           | bool   | false    |
+| output-csv-name        | Name of the output csv file                                         | string | output   |
+| output-ranks-data      | Enable statistics output for each rank                              | bool   | false    |
 
 It should be noted that for each parameter there is a default value that applies if the parameter has not been specified 
 in the configuration file. Thus, by default the benchmark will not run because the generate-data, train and evaluation 
@@ -113,3 +114,4 @@ without changing the order of the files for training.
 - Add support for drop_last customization. Currently, by default, all batches left after MPI ranks distribution are not processed.
 - Replace the use of `fork()` with `MPI_Comm_spawn()` when creating new processes, as using `fork()` with MPI may be unsafe
 - Test support for the Cache VOL connector.
+- Add support for checkpointing by saving the model to a hdf5 file.
diff --git a/dlio/h5bench_dlio.c b/dlio/h5bench_dlio.c
index 152f09c7..d3ddb4d6 100644
--- a/dlio/h5bench_dlio.c
+++ b/dlio/h5bench_dlio.c
@@ -712,6 +712,11 @@ main(int argc, char *argv[])
         snprintf(dir_name, sizeof(dir_name), "%s/%s", config.DATA_FOLDER, config.VALID_DATA_FOLDER);
         create_directory(dir_name);
         printf("OK\n");
+
+        printf("Create directory \"%s\"... ", config.OUTPUT_DATA_FOLDER);
+        snprintf(dir_name, sizeof(dir_name), "%s", config.OUTPUT_DATA_FOLDER);
+        create_directory(dir_name);
+        printf("OK\n");
     }
     init_global_variables();
     MPI_Barrier(MPI_COMM_WORLD);
@@ -728,8 +733,12 @@ main(int argc, char *argv[])
         prepare_data();
         MPI_Barrier(MPI_COMM_WORLD);
 
+        if (config.OUTPUT_RANKS_DATA) {
+            print_rank_data();
+        }
+
         if (MY_RANK == 0) {
-            print_data();
+            print_average_data();
         }
 
         stats_finalize();
diff --git a/dlio/stats.c b/dlio/stats.c
index da54553d..8aae2c45 100644
--- a/dlio/stats.c
+++ b/dlio/stats.c
@@ -216,7 +216,7 @@ prepare_data()
 }
 
 void
-print_data()
+print_average_data()
 {
     // Train
     // TODO: drop_last = false
@@ -238,21 +238,21 @@ print_data()
 
     uint64_t train_total_metadata_time = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        train_total_metadata_time += stats[i].metadata_time.train;
+        train_total_metadata_time += global_stats[i].metadata_time.train;
     }
 
     uint64_t train_total_read_time = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        train_total_read_time += stats[i].raw_read_time.train;
+        train_total_read_time += global_stats[i].raw_read_time.train;
     }
 
     double train_total_avg_read_rate = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        if (stats[i].raw_read_time.train == 0) {
+        if (global_stats[i].raw_read_time.train == 0) {
             continue;
         }
         train_total_avg_read_rate +=
-            (double)train_size_bytes_per_rank / stats[i].raw_read_time.train * 1000000.0;
+            (double)train_size_bytes_per_rank / global_stats[i].raw_read_time.train * 1000000.0;
     }
     train_total_avg_read_rate /= config.EPOCHS;
 
@@ -319,21 +319,21 @@ print_data()
 
     uint64_t eval_total_metadata_time = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        eval_total_metadata_time += stats[i].metadata_time.eval;
+        eval_total_metadata_time += global_stats[i].metadata_time.eval;
     }
 
     uint64_t eval_total_read_time = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        eval_total_read_time += stats[i].raw_read_time.eval;
+        eval_total_read_time += global_stats[i].raw_read_time.eval;
     }
 
     double eval_total_avg_read_rate = 0;
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        if (stats[i].raw_read_time.eval == 0) {
+        if (global_stats[i].raw_read_time.eval == 0) {
             continue;
         }
         eval_total_avg_read_rate +=
-            (double)eval_size_bytes_per_rank / stats[i].raw_read_time.eval * 1000000.0;
+            (double)eval_size_bytes_per_rank / global_stats[i].raw_read_time.eval * 1000000.0;
     }
     eval_total_avg_read_rate /= config.EPOCHS;
 
@@ -426,7 +426,430 @@ print_data()
 
     printf("===========================================================\n");
 
-    FILE *csv_file = fopen(config.CSV_FILE, "w+");
+    char file_name[256];
+    snprintf(file_name, sizeof(file_name), "%s/%s.csv", config.OUTPUT_DATA_FOLDER, config.OUTPUT_CSV_NAME);
+
+    FILE *csv_file = fopen(file_name, "w+");
+
+    char *units = (char *)malloc(config.EPOCHS * sizeof(char));
+
+    fprintf(csv_file, "metric, value, unit\n");
+    fprintf(csv_file, "operation, dlio,\n");
+    fprintf(csv_file, "ranks, %d,\n", NUM_RANKS);
+    fprintf(csv_file, "read threads, %d,\n", config.READ_THREADS);
+    fprintf(csv_file, "subfiling, %s,\n", config.SUBFILING ? "YES" : "NO");
+    fprintf(csv_file, "chunking, %s,\n", config.DO_CHUNKING ? "YES" : "NO");
+    fprintf(csv_file, "collective meta, %s,\n", config.COLLECTIVE_META ? "YES" : "NO");
+    fprintf(csv_file, "collective data, %s,\n", config.COLLECTIVE_DATA ? "YES" : "NO");
+
+    value = format_human_readable(train_total_size_bytes);
+    fprintf(csv_file, "train total size, %.3lf, %cB\n", value.value, value.unit);
+    value = format_human_readable(train_size_bytes_per_rank);
+    fprintf(csv_file, "train size per rank, %.3lf, %cB\n", value.value, value.unit);
+    fprintf(csv_file, "train emulated compute time per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", train_compute_time_per_epoch[i] / 1000000.0);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\ntrain emulated compute time, %.3lf, s\n", train_total_compute_time / 1000000.0);
+    fprintf(csv_file, "train metadata time per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", global_stats[i].metadata_time.train / 1000000.0);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\ntrain metadata time, %.3lf, s\n", train_total_metadata_time / 1000000.0);
+    fprintf(csv_file, "train raw read time per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", global_stats[i].raw_read_time.train / 1000000.0);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\ntrain total raw read time, %.3lf, s\n", train_total_read_time / 1000000.0);
+    fprintf(csv_file, "train raw read rate per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        if (global_stats[i].raw_read_time.train == 0) {
+            units[i] = ' ';
+            fprintf(csv_file, "NaN");
+        }
+        else {
+            value = format_human_readable((double)train_size_bytes_per_rank / global_stats[i].raw_read_time.train *
+                                          1000000.0);
+            units[i] = value.unit;
+            fprintf(csv_file, "%.3lf", value.value);
+        }
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%cB/s", units[i]);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    value = format_human_readable(train_total_avg_read_rate);
+    fprintf(csv_file, "\"\ntrain avg raw read rate, %.3lf, %cB/s\n", value.value, value.unit);
+    fprintf(csv_file, "train observed time per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", global_stats[i].observed_time.train / 1000000.0);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\ntrain observed time, %.3lf, s\n", train_total_observed_time / 1000000.0);
+    fprintf(csv_file, "train observed rate per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        value    = format_human_readable(train_avg_observed_rate_per_epoch[i]);
+        units[i] = value.unit;
+        fprintf(csv_file, "%.3lf", value.value);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%cB/s", units[i]);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    value = format_human_readable(train_total_avg_observed_rate);
+    fprintf(csv_file, "\"\ntrain avg observed rate, %.3lf, %cB/s\n", value.value, value.unit);
+    fprintf(csv_file, "train throughput samples per second per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", global_stats[i].throughput.train);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "samples/s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\ntrain throughput avg samples per second, %.3lf, samples/s\n",
+            train_throughput_mean_samples_per_second);
+    fprintf(csv_file, "train throughput stdev samples per second, %.3lf, samples/s\n",
+            train_throughput_stdev_samples_per_second);
+    value = format_human_readable(train_io_mean);
+    fprintf(csv_file, "train io avg, %.3lf, %cB/s\n", value.value, value.unit);
+    value = format_human_readable(train_io_stdev);
+    fprintf(csv_file, "train io stdev, %.3lf, %cB/s\n", value.value, value.unit);
+
+    value = format_human_readable(eval_total_size_bytes);
+    fprintf(csv_file, "eval total size, %.3lf, %cB\n", value.value, value.unit);
+    value = format_human_readable(eval_size_bytes_per_rank);
+    fprintf(csv_file, "eval size per rank, %.3lf, %cB\n", value.value, value.unit);
+    fprintf(csv_file, "eval emulated compute time per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", eval_compute_time_per_epoch[i] / 1000000.0);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\neval emulated compute time, %.3lf, s\n", eval_total_compute_time / 1000000.0);
+    fprintf(csv_file, "eval metadata time per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", global_stats[i].metadata_time.eval / 1000000.0);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\neval metadata time, %.3lf, s\n", eval_total_metadata_time / 1000000.0);
+    fprintf(csv_file, "eval raw read time per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", global_stats[i].raw_read_time.eval / 1000000.0);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+
+    fprintf(csv_file, "\"\neval total raw read time, %.3lf, s\n", eval_total_read_time / 1000000.0);
+    fprintf(csv_file, "eval raw read rate per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        if (global_stats[i].raw_read_time.eval == 0) {
+            units[i] = ' ';
+            fprintf(csv_file, "NaN");
+        }
+        else {
+            value = format_human_readable(eval_size_bytes_per_rank / global_stats[i].raw_read_time.eval * 1000000.0);
+            units[i] = value.unit;
+            fprintf(csv_file, "%.3lf", value.value);
+        }
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%cB/s", units[i]);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    value = format_human_readable(eval_total_avg_read_rate);
+    fprintf(csv_file, "\"\neval avg raw read rate, %.3lf, %cB/s\n", value.value, value.unit);
+    fprintf(csv_file, "eval observed time per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", global_stats[i].observed_time.eval / 1000000.0);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\neval observed time, %.3lf, s\n", eval_total_observed_time / 1000000.0);
+    fprintf(csv_file, "eval observed rate per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        value    = format_human_readable(eval_avg_observed_rate_per_epoch[i]);
+        units[i] = value.unit;
+        fprintf(csv_file, "%.3lf", value.value);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%cB/s", units[i]);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    value = format_human_readable(eval_total_avg_observed_rate);
+    fprintf(csv_file, "\"\neval avg observed rate, %.3lf, %cB/s\n", value.value, value.unit);
+    fprintf(csv_file, "eval throughput samples per second per epoch, \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "%.3lf", global_stats[i].throughput.eval);
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\", \"");
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        fprintf(csv_file, "samples/s");
+        if (i != config.EPOCHS - 1)
+            fprintf(csv_file, ", ");
+    }
+    fprintf(csv_file, "\"\neval throughput avg samples per second, %.3lf, samples/s\n",
+            eval_throughput_mean_samples_per_second);
+    fprintf(csv_file, "eval throughput stdev samples per second, %.3lf, samples/s\n",
+            eval_throughput_stdev_samples_per_second);
+    value = format_human_readable(eval_io_mean);
+    fprintf(csv_file, "eval io avg, %.3lf, %cB/s\n", value.value, value.unit);
+    value = format_human_readable(eval_io_stdev);
+    fprintf(csv_file, "eval io stdev, %.3lf, %cB/s\n", value.value, value.unit);
+
+    fclose(csv_file);
+    free(units);
+    free(train_compute_time_per_epoch);
+    free(eval_compute_time_per_epoch);
+    free(train_avg_observed_rate_per_epoch);
+    free(eval_avg_observed_rate_per_epoch);
+}
+
+void
+print_rank_data()
+{
+    // Train
+    // TODO: drop_last = false
+    uint64_t train_total_size_bytes =
+        (uint64_t)config.NUM_TRAIN_BATCHES_PER_RANK * NUM_RANKS * config.BATCH_SIZE * config.RECORD_LENGTH;
+    uint64_t train_size_bytes_per_rank =
+        (uint64_t)config.NUM_TRAIN_BATCHES_PER_RANK * config.BATCH_SIZE * config.RECORD_LENGTH;
+
+    uint64_t  train_total_compute_time     = 0;
+    uint64_t *train_compute_time_per_epoch = (uint64_t *)malloc(config.EPOCHS * sizeof(uint64_t));
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        unsigned long int compute_time = 0;
+        for (uint32_t j = 0; j < config.NUM_TRAIN_BATCHES_PER_RANK; j++) {
+            compute_time += stats[i].compute.train[j];
+        }
+        train_total_compute_time += compute_time;
+        train_compute_time_per_epoch[i] = compute_time;
+    }
+
+    uint64_t train_total_metadata_time = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        train_total_metadata_time += stats[i].metadata_time.train;
+    }
+
+    uint64_t train_total_read_time = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        train_total_read_time += stats[i].raw_read_time.train;
+    }
+
+    double train_total_avg_read_rate = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        if (stats[i].raw_read_time.train == 0) {
+            continue;
+        }
+        train_total_avg_read_rate +=
+            (double)train_size_bytes_per_rank / stats[i].raw_read_time.train * 1000000.0;
+    }
+    train_total_avg_read_rate /= config.EPOCHS;
+
+    uint64_t train_total_observed_time = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        train_total_observed_time += stats[i].observed_time.train;
+    }
+
+    double  train_total_avg_observed_rate     = 0.0;
+    double *train_avg_observed_rate_per_epoch = (double *)malloc(config.EPOCHS * sizeof(double));
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        unsigned long int compute_time = 0;
+        for (uint32_t j = 0; j < config.NUM_TRAIN_BATCHES_PER_RANK; j++) {
+            compute_time += stats[i].compute.train[j];
+        }
+        if ((stats[i].observed_time.train - compute_time) == 0) {
+            train_avg_observed_rate_per_epoch[i] = NAN;
+            continue;
+        }
+        train_avg_observed_rate_per_epoch[i] = (double)train_size_bytes_per_rank /
+                                               (stats[i].observed_time.train - compute_time) *
+                                               1000000.0;
+        train_total_avg_observed_rate += train_avg_observed_rate_per_epoch[i];
+    }
+    train_total_avg_observed_rate /= config.EPOCHS;
+
+    double train_throughput_mean_samples_per_second = 0.0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        train_throughput_mean_samples_per_second += stats[i].throughput.train;
+    }
+    train_throughput_mean_samples_per_second =
+        train_throughput_mean_samples_per_second / (double)config.EPOCHS;
+
+    double train_throughput_stdev_samples_per_second = 0.0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        train_throughput_stdev_samples_per_second +=
+            (stats[i].throughput.train - train_throughput_mean_samples_per_second) *
+            (stats[i].throughput.train - train_throughput_mean_samples_per_second);
+    }
+    train_throughput_stdev_samples_per_second =
+        sqrt(train_throughput_stdev_samples_per_second / (double)config.EPOCHS);
+
+    double train_io_mean = train_throughput_mean_samples_per_second * config.RECORD_LENGTH;
+
+    double train_io_stdev = train_throughput_stdev_samples_per_second * config.RECORD_LENGTH;
+
+    // Evaluation
+    // TODO: drop_last = False
+    uint64_t eval_total_size_bytes = (uint64_t)config.NUM_EVAL_BATCHES_PER_RANK * NUM_RANKS *
+                                     config.BATCH_SIZE_EVAL * config.RECORD_LENGTH;
+    uint64_t eval_size_bytes_per_rank =
+        (uint64_t)config.NUM_EVAL_BATCHES_PER_RANK * config.BATCH_SIZE_EVAL * config.RECORD_LENGTH;
+
+    uint64_t  eval_total_compute_time     = 0;
+    uint64_t *eval_compute_time_per_epoch = (uint64_t *)malloc(config.EPOCHS * sizeof(uint64_t));
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        unsigned long int compute_time = 0;
+        for (uint32_t j = 0; j < config.NUM_EVAL_BATCHES_PER_RANK; j++) {
+            compute_time += stats[i].compute.eval[j];
+        }
+        eval_compute_time_per_epoch[i] = compute_time;
+        eval_total_compute_time += compute_time;
+    }
+
+    uint64_t eval_total_metadata_time = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        eval_total_metadata_time += stats[i].metadata_time.eval;
+    }
+
+    uint64_t eval_total_read_time = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        eval_total_read_time += stats[i].raw_read_time.eval;
+    }
+
+    double eval_total_avg_read_rate = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        if (stats[i].raw_read_time.eval == 0) {
+            continue;
+        }
+        eval_total_avg_read_rate +=
+            (double)eval_size_bytes_per_rank / stats[i].raw_read_time.eval * 1000000.0;
+    }
+    eval_total_avg_read_rate /= config.EPOCHS;
+
+    uint64_t eval_total_observed_time = 0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        eval_total_observed_time += stats[i].observed_time.eval;
+    }
+
+    double  eval_total_avg_observed_rate     = 0.0;
+    double *eval_avg_observed_rate_per_epoch = (double *)malloc(config.EPOCHS * sizeof(double));
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        unsigned long compute_time = 0;
+        for (uint32_t j = 0; j < config.NUM_EVAL_BATCHES_PER_RANK; j++) {
+            compute_time += stats[i].compute.eval[j];
+        }
+        if ((stats[i].observed_time.eval - compute_time) == 0) {
+            eval_avg_observed_rate_per_epoch[i] = NAN;
+            continue;
+        }
+        eval_avg_observed_rate_per_epoch[i] = (double)eval_size_bytes_per_rank /
+                                              (stats[i].observed_time.eval - compute_time) * 1000000.0;
+        eval_total_avg_observed_rate += eval_avg_observed_rate_per_epoch[i];
+    }
+    eval_total_avg_observed_rate /= config.EPOCHS;
+
+    double eval_throughput_mean_samples_per_second = 0.0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        eval_throughput_mean_samples_per_second += stats[i].throughput.eval;
+    }
+    eval_throughput_mean_samples_per_second = eval_throughput_mean_samples_per_second / (double)config.EPOCHS;
+
+    double eval_throughput_stdev_samples_per_second = 0.0;
+    for (uint32_t i = 0; i < config.EPOCHS; i++) {
+        eval_throughput_stdev_samples_per_second +=
+            (stats[i].throughput.eval - eval_throughput_mean_samples_per_second) *
+            (stats[i].throughput.eval - eval_throughput_mean_samples_per_second);
+    }
+    eval_throughput_stdev_samples_per_second =
+        sqrt(eval_throughput_stdev_samples_per_second / (double)config.EPOCHS);
+
+    double eval_io_mean = eval_throughput_mean_samples_per_second * config.RECORD_LENGTH;
+
+    double eval_io_stdev = eval_throughput_stdev_samples_per_second * config.RECORD_LENGTH;
+
+    human_readable value;
+
+    char filename[256];
+    snprintf(filename, sizeof(filename), "%s/%d_%s.csv", config.OUTPUT_DATA_FOLDER, MY_RANK, config.OUTPUT_CSV_NAME);
+    FILE *csv_file = fopen(filename, "w+");
 
     char *units = (char *)malloc(config.EPOCHS * sizeof(char));
 
@@ -490,7 +913,7 @@ print_data()
         }
         else {
             value = format_human_readable((double)train_size_bytes_per_rank / stats[i].raw_read_time.train *
-                                          1000000.0);
+                                             1000000.0);
             units[i] = value.unit;
             fprintf(csv_file, "%.3lf", value.value);
         }
@@ -507,7 +930,7 @@ print_data()
     fprintf(csv_file, "\"\ntrain avg raw read rate, %.3lf, %cB/s\n", value.value, value.unit);
     fprintf(csv_file, "train observed time per epoch, \"");
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        fprintf(csv_file, "%.3lf", global_stats[i].observed_time.train / 1000000.0);
+        fprintf(csv_file, "%.3lf", stats[i].observed_time.train / 1000000.0);
         if (i != config.EPOCHS - 1)
             fprintf(csv_file, ", ");
     }
@@ -536,7 +959,7 @@ print_data()
     fprintf(csv_file, "\"\ntrain avg observed rate, %.3lf, %cB/s\n", value.value, value.unit);
     fprintf(csv_file, "train throughput samples per second per epoch, \"");
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        fprintf(csv_file, "%.3lf", global_stats[i].throughput.train);
+        fprintf(csv_file, "%.3lf", stats[i].throughput.train);
         if (i != config.EPOCHS - 1)
             fprintf(csv_file, ", ");
     }
@@ -623,7 +1046,7 @@ print_data()
     fprintf(csv_file, "\"\neval avg raw read rate, %.3lf, %cB/s\n", value.value, value.unit);
     fprintf(csv_file, "eval observed time per epoch, \"");
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        fprintf(csv_file, "%.3lf", global_stats[i].observed_time.eval / 1000000.0);
+        fprintf(csv_file, "%.3lf", stats[i].observed_time.eval / 1000000.0);
         if (i != config.EPOCHS - 1)
             fprintf(csv_file, ", ");
     }
@@ -652,7 +1075,7 @@ print_data()
     fprintf(csv_file, "\"\neval avg observed rate, %.3lf, %cB/s\n", value.value, value.unit);
     fprintf(csv_file, "eval throughput samples per second per epoch, \"");
     for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        fprintf(csv_file, "%.3lf", global_stats[i].throughput.eval);
+        fprintf(csv_file, "%.3lf", stats[i].throughput.eval);
         if (i != config.EPOCHS - 1)
             fprintf(csv_file, ", ");
     }
diff --git a/dlio/stats.h b/dlio/stats.h
index f82fbaeb..ad55b92c 100644
--- a/dlio/stats.h
+++ b/dlio/stats.h
@@ -61,7 +61,9 @@ void stats_finalize();
 
 void prepare_data();
 
-void print_data();
+void print_average_data();
+
+void print_rank_data();
 
 void batch_loaded_train(uint32_t epoch, uint64_t start_time);
 
diff --git a/dlio/utils.c b/dlio/utils.c
index dfe185df..4acfd255 100644
--- a/dlio/utils.c
+++ b/dlio/utils.c
@@ -29,7 +29,7 @@ config_datatype_t config = {
     .RECORD_LENGTH = 67108864, // should be a square number
                                //   .RECORD_LENGTH_STDEV = 0.0f,
                                //   .RECORD_LENGTH_RESIZE = 0.0f,
-    .NUM_FILES_TRAIN      = 64,
+    .NUM_FILES_TRAIN      = 32,
     .NUM_FILES_EVAL       = 8,
     .NUM_SAMPLES_PER_FILE = 4,
     .DATA_FOLDER          = "./data",
@@ -76,7 +76,9 @@ config_datatype_t config = {
     .VALID_DATA_FOLDER    = "valid",
     .RECORDS_DATASET_NAME = "records",
     .LABELS_DATASET_NAME  = "labels",
-    .CSV_FILE             = "output.csv",
+    .OUTPUT_DATA_FOLDER   = "results",
+    .OUTPUT_CSV_NAME      = "output",
+    .OUTPUT_RANKS_DATA    = false,
 
     // Internal
     .NUM_TRAIN_BATCHES_PER_RANK           = 0,
@@ -294,9 +296,16 @@ parse_args(int argc, char *argv[])
         else if (strcmp(argv[i], "--drop-last") == 0) {
             config.DROP_LAST = true;
         }
-        else if (strcmp(argv[i], "--csv-file") == 0) {
+        else if (strcmp(argv[i], "--output-data-folder") == 0) {
             i++;
-            config.CSV_FILE = argv[i];
+            config.OUTPUT_DATA_FOLDER = argv[i];
+        }
+        else if (strcmp(argv[i], "--output-csv-name") == 0) {
+            i++;
+            config.OUTPUT_CSV_NAME = argv[i];
+        }
+        else if (strcmp(argv[i], "--output-ranks-data") == 0) {
+            config.OUTPUT_RANKS_DATA = true;
         }
         else {
             printf("WARNING: %s not found\n", argv[i]);
diff --git a/dlio/utils.h b/dlio/utils.h
index 1aab60df..835dbd2f 100644
--- a/dlio/utils.h
+++ b/dlio/utils.h
@@ -64,7 +64,9 @@ typedef struct config_datatype {
     char *VALID_DATA_FOLDER;
     char *RECORDS_DATASET_NAME;
     char *LABELS_DATASET_NAME;
-    char *CSV_FILE;
+    char *OUTPUT_DATA_FOLDER;
+    char *OUTPUT_CSV_NAME;
+    bool OUTPUT_RANKS_DATA;
 
     // Internal
     uint32_t NUM_TRAIN_BATCHES_PER_RANK;
diff --git a/samples/sync-dlio.json b/samples/sync-dlio.json
index 894eefd1..577d9fd0 100644
--- a/samples/sync-dlio.json
+++ b/samples/sync-dlio.json
@@ -30,7 +30,9 @@
         "train-data-folder": "train",
         "valid-data-folder": "valid",
         "records-dataset-name": "records",
-        "labels-dataset-name": "labels"
+        "labels-dataset-name": "labels",
+        "output-csv-name": "output",
+        "output-ranks-data": "true"
       }
     },
     {
@@ -57,7 +59,7 @@
         "computation-time-stdev": "0.0",
         "random-seed": "42",
         "eval-time": "0.323",
-        "eval-time-stdev": "0.323",
+        "eval-time-stdev": "0.0",
         "epochs-between-evals": "1",
         "train-data-folder": "train",
         "valid-data-folder": "valid",
@@ -65,7 +67,9 @@
         "labels-dataset-name": "labels",
         "collective-meta": "true",
         "collective-data": "true",
-        "subfiling": "false"
+        "subfiling": "false",
+        "output-csv-name": "output",
+        "output-ranks-data": "true"
       }
     }
   ]
diff --git a/src/h5bench.py b/src/h5bench.py
index 69c8f255..66a65278 100755
--- a/src/h5bench.py
+++ b/src/h5bench.py
@@ -981,6 +981,7 @@ def run_dlio(self, id, setup):
                 'collective-meta',
                 'collective-data',
                 'subfiling',
+                'output-ranks-data',
             ]
 
             # Create the configuration parameter list
@@ -989,14 +990,14 @@ def run_dlio(self, id, setup):
                     if configuration[key].lower() == 'true':
                         parameters.append('--{} '.format(key))
 
-                # Make sure the CSV file and datasets are generated in the temporary path
-                elif key == 'csv-file':
-                    parameters.append('--{} {} '.format(key, '{}/{}/{}'.format(self.directory, id, configuration[key])))
+                # Make sure datasets are generated in the temporary path
                 elif key == 'data-folder':
                     parameters.append('--{} {} '.format(key, '{}/{}'.format(self.directory, configuration[key])))
                 else:
                     parameters.append('--{} {} '.format(key, configuration[key]))
 
+            parameters.append('--output-data-folder {} '.format('{}/{}'.format(self.directory, id)))
+
             if self.prefix:
                 benchmark_path = self.prefix + '/' + self.H5BENCH_DLIO
             else:

From 61f9a808b636f02917117b75322a94b3ce93da7d Mon Sep 17 00:00:00 2001
From: github-actions <github-actions[bot]@users.noreply.github.com>
Date: Thu, 18 Jul 2024 18:43:02 +0000
Subject: [PATCH 24/37] Committing clang-format changes

---
 dlio/stats.c | 21 +++++++++++----------
 dlio/utils.h |  2 +-
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/dlio/stats.c b/dlio/stats.c
index 8aae2c45..529c23dd 100644
--- a/dlio/stats.c
+++ b/dlio/stats.c
@@ -492,8 +492,8 @@ print_average_data()
             fprintf(csv_file, "NaN");
         }
         else {
-            value = format_human_readable((double)train_size_bytes_per_rank / global_stats[i].raw_read_time.train *
-                                          1000000.0);
+            value    = format_human_readable((double)train_size_bytes_per_rank /
+                                          global_stats[i].raw_read_time.train * 1000000.0);
             units[i] = value.unit;
             fprintf(csv_file, "%.3lf", value.value);
         }
@@ -609,7 +609,8 @@ print_average_data()
             fprintf(csv_file, "NaN");
         }
         else {
-            value = format_human_readable(eval_size_bytes_per_rank / global_stats[i].raw_read_time.eval * 1000000.0);
+            value    = format_human_readable(eval_size_bytes_per_rank / global_stats[i].raw_read_time.eval *
+                                          1000000.0);
             units[i] = value.unit;
             fprintf(csv_file, "%.3lf", value.value);
         }
@@ -739,9 +740,8 @@ print_rank_data()
             train_avg_observed_rate_per_epoch[i] = NAN;
             continue;
         }
-        train_avg_observed_rate_per_epoch[i] = (double)train_size_bytes_per_rank /
-                                               (stats[i].observed_time.train - compute_time) *
-                                               1000000.0;
+        train_avg_observed_rate_per_epoch[i] =
+            (double)train_size_bytes_per_rank / (stats[i].observed_time.train - compute_time) * 1000000.0;
         train_total_avg_observed_rate += train_avg_observed_rate_per_epoch[i];
     }
     train_total_avg_observed_rate /= config.EPOCHS;
@@ -820,8 +820,8 @@ print_rank_data()
             eval_avg_observed_rate_per_epoch[i] = NAN;
             continue;
         }
-        eval_avg_observed_rate_per_epoch[i] = (double)eval_size_bytes_per_rank /
-                                              (stats[i].observed_time.eval - compute_time) * 1000000.0;
+        eval_avg_observed_rate_per_epoch[i] =
+            (double)eval_size_bytes_per_rank / (stats[i].observed_time.eval - compute_time) * 1000000.0;
         eval_total_avg_observed_rate += eval_avg_observed_rate_per_epoch[i];
     }
     eval_total_avg_observed_rate /= config.EPOCHS;
@@ -848,7 +848,8 @@ print_rank_data()
     human_readable value;
 
     char filename[256];
-    snprintf(filename, sizeof(filename), "%s/%d_%s.csv", config.OUTPUT_DATA_FOLDER, MY_RANK, config.OUTPUT_CSV_NAME);
+    snprintf(filename, sizeof(filename), "%s/%d_%s.csv", config.OUTPUT_DATA_FOLDER, MY_RANK,
+             config.OUTPUT_CSV_NAME);
     FILE *csv_file = fopen(filename, "w+");
 
     char *units = (char *)malloc(config.EPOCHS * sizeof(char));
@@ -913,7 +914,7 @@ print_rank_data()
         }
         else {
             value = format_human_readable((double)train_size_bytes_per_rank / stats[i].raw_read_time.train *
-                                             1000000.0);
+                                          1000000.0);
             units[i] = value.unit;
             fprintf(csv_file, "%.3lf", value.value);
         }
diff --git a/dlio/utils.h b/dlio/utils.h
index 835dbd2f..6221bbfa 100644
--- a/dlio/utils.h
+++ b/dlio/utils.h
@@ -66,7 +66,7 @@ typedef struct config_datatype {
     char *LABELS_DATASET_NAME;
     char *OUTPUT_DATA_FOLDER;
     char *OUTPUT_CSV_NAME;
-    bool OUTPUT_RANKS_DATA;
+    bool  OUTPUT_RANKS_DATA;
 
     // Internal
     uint32_t NUM_TRAIN_BATCHES_PER_RANK;

From 6d48da13f8d9b27f4e90351f05e3955c4266c1e2 Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Thu, 18 Jul 2024 20:50:43 +0200
Subject: [PATCH 25/37] Update the documentation

---
 docs/source/dlio.rst | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/source/dlio.rst b/docs/source/dlio.rst
index d80190be..29fe1875 100644
--- a/docs/source/dlio.rst
+++ b/docs/source/dlio.rst
@@ -18,7 +18,7 @@ As in the case with other extensions, the following parameters should be specifi
 ``train``                  Enable model training simulation                                      bool     false
 ``evaluation``             Enable model evaluation simulation                                    bool     false
 ``record-length``          Record size of a single sample in bytes                               int      67108864
-``num-files-train``        The number of files used to train the model                           int      64
+``num-files-train``        The number of files used to train the model                           int      32
 ``num-files-eval``         The number of files used to evaluate the model                        int      8
 ``num-samples-per-file``   The number of samples in each file                                    int      4
 ``data-folder``            Name of the directory storing the benchmark data                      string   ./data
@@ -49,7 +49,8 @@ As in the case with other extensions, the following parameters should be specifi
 ``collective-meta``        Enable collective HDF5 metadata operations                            bool     false
 ``collective-data``        Enable collective HDF5 data operations                                bool     false
 ``subfiling``              Enable HDF5 Subfiling Virtual File Driver                             bool     false
-``csv-file``               Name of the output csv file                                           string   output.csv
+``output-csv-name``        Name of the output csv file                                           string   output
+``output-ranks-data``      Enable statistics output for each rank                                bool     false
 ========================== ===================================================================== ======== ==============
 
 It should be noted that for each parameter there is a default value that applies if the parameter has not been specified
@@ -122,3 +123,4 @@ without changing the order of the files for training.
 * Add support for drop_last customization. Currently, by default, all batches left after MPI ranks distribution are not processed.
 * Replace the use of ``fork()`` with ``MPI_Comm_spawn()`` when creating new processes, as using ``fork()`` with MPI may be unsafe
 * Test support for the Cache VOL connector.
+* Add support for checkpointing by saving the model to an HDF5 file.

From 63af2cd75c11391143316af97721faafbc7ec746 Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Tue, 6 Aug 2024 19:30:44 +0200
Subject: [PATCH 26/37] Add total-training-steps parameter

---
 dlio/README.md      |   1 +
 dlio/h5bench_dlio.c |  44 +++++++-
 dlio/stats.c        | 269 ++++++++++++++++++++++----------------------
 dlio/utils.c        |   9 +-
 dlio/utils.h        |   3 +-
 5 files changed, 185 insertions(+), 141 deletions(-)

diff --git a/dlio/README.md b/dlio/README.md
index 9e09f73b..abcf29ad 100644
--- a/dlio/README.md
+++ b/dlio/README.md
@@ -31,6 +31,7 @@ json file to configure the benchmark:
 | preprocess-time        | Preprocessing time after reading each sample in seconds             | float  | 0.0      |
 | preprocess-time-stdev  | Standard deviation in preprocessing time in seconds                 | float  | 0.0      |
 | epochs                 | The number of epochs                                                | int    | 5        |
+| total-training-steps   | Maximum number of training steps per epoch (-1 = no limit)          | int    | -1       |
 | computation-time       | Computation time after reading each batch in seconds                | float  | 0.323    |
 | computation-time-stdev | Standard deviation in computation time in seconds                   | float  | 0.0      |
 | random-seed            | Random seed to be used                                              | int    | 42       |
diff --git a/dlio/h5bench_dlio.c b/dlio/h5bench_dlio.c
index d3ddb4d6..a18085a4 100644
--- a/dlio/h5bench_dlio.c
+++ b/dlio/h5bench_dlio.c
@@ -26,6 +26,7 @@ int      NUM_RANKS, MY_RANK;
 uint32_t GENERATION_SIZE;
 uint32_t DIM;
 hid_t    DCPL, FAPL, DAPL, DXPL;
+MPI_Comm rest_training_steps_comm = MPI_COMM_WORLD;
 
 void
 generate_labels_dataset(hid_t file_id, hid_t filespace, hid_t memspace)
@@ -373,6 +374,9 @@ train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadat
 
     uint64_t t0 = get_time_usec_return_uint64();
     for (uint32_t i = 0; i < config.NUM_TRAIN_BATCHES_PER_RANK; i++) {
+        if (i == config.TOTAL_TRAINING_STEPS_PER_RANK) {
+            break;
+        }
         for (uint32_t j = 0; j < config.BATCH_SIZE; j++) {
             uint32_t file_num = indices[offset + i * config.BATCH_SIZE + j] / config.NUM_SAMPLES_PER_FILE + 1;
             uint32_t sample_num = indices[offset + i * config.BATCH_SIZE + j] % config.NUM_SAMPLES_PER_FILE;
@@ -391,7 +395,11 @@ train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadat
 
         uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
         batch_processed_train(epoch, t, t0);
-        MPI_Barrier(MPI_COMM_WORLD);
+        if ((MY_RANK < config.TOTAL_TRAINING_STEPS % NUM_RANKS) && (i + 1 == config.TOTAL_TRAINING_STEPS_PER_RANK)) {
+            MPI_Barrier(rest_training_steps_comm);
+        } else {
+            MPI_Barrier(MPI_COMM_WORLD);
+        }
 
         t0 = get_time_usec_return_uint64();
     }
@@ -443,7 +451,10 @@ train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t
         write(get_train_write_fd(), &batch, sizeof(batch));
     }
 
-    for (uint32_t i = config.READ_THREADS; i < config.NUM_TRAIN_BATCHES_PER_RANK; i++) {
+    for (uint32_t i = config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN; i < config.NUM_TRAIN_BATCHES_PER_RANK; i++) {
+        if (i == config.TOTAL_TRAINING_STEPS_PER_RANK) {
+            break;
+        }
         execution_time_t data_from_child_process;
         uint64_t         t0 = get_time_usec_return_uint64();
         read(get_train_read_fd(), &data_from_child_process, sizeof(data_from_child_process));
@@ -458,7 +469,12 @@ train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t
 
         uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
         batch_processed_train(epoch, t, t0);
-        MPI_Barrier(MPI_COMM_WORLD);
+
+        if ((MY_RANK < config.TOTAL_TRAINING_STEPS % NUM_RANKS) && (i + 1 == config.TOTAL_TRAINING_STEPS_PER_RANK)) {
+            MPI_Barrier(rest_training_steps_comm);
+        } else {
+            MPI_Barrier(MPI_COMM_WORLD);
+        }
     }
 
     for (uint32_t i = 0; i < config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN; i++) {
@@ -473,7 +489,11 @@ train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t
 
         uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
         batch_processed_train(epoch, t, t0);
-        MPI_Barrier(MPI_COMM_WORLD);
+        if ((MY_RANK < config.TOTAL_TRAINING_STEPS % NUM_RANKS) && (i + 1 == config.TOTAL_TRAINING_STEPS_PER_RANK)) {
+            MPI_Barrier(rest_training_steps_comm);
+        } else {
+            MPI_Barrier(MPI_COMM_WORLD);
+        }
     }
 }
 
@@ -592,6 +612,22 @@ init_global_variables()
                                                      ? config.NUM_EVAL_BATCHES_PER_RANK
                                                      : config.READ_THREADS;
 
+    if (config.TOTAL_TRAINING_STEPS != -1 && config.TOTAL_TRAINING_STEPS < config.NUM_TRAIN_BATCHES_PER_RANK * NUM_RANKS) {
+        config.TOTAL_TRAINING_STEPS_PER_RANK = config.TOTAL_TRAINING_STEPS / NUM_RANKS;
+        if (MY_RANK < config.TOTAL_TRAINING_STEPS % NUM_RANKS) {
+            config.TOTAL_TRAINING_STEPS_PER_RANK++;
+            MPI_Comm_split(MPI_COMM_WORLD, 0, MY_RANK, &rest_training_steps_comm);
+        } else {
+            MPI_Comm_split(MPI_COMM_WORLD, MPI_UNDEFINED, MY_RANK, &rest_training_steps_comm);
+        }
+
+        config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN = config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN > config.TOTAL_TRAINING_STEPS_PER_RANK
+                                                          ? config.TOTAL_TRAINING_STEPS_PER_RANK
+                                                          : config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN;
+    } else {
+        config.TOTAL_TRAINING_STEPS = -1;
+    }
+
     srand(config.RANDOM_SEED);
 
     // drop last warning
diff --git a/dlio/stats.c b/dlio/stats.c
index 529c23dd..eb7ff596 100644
--- a/dlio/stats.c
+++ b/dlio/stats.c
@@ -221,9 +221,8 @@ print_average_data()
     // Train
     // TODO: drop_last = false
     uint64_t train_total_size_bytes =
-        (uint64_t)config.NUM_TRAIN_BATCHES_PER_RANK * NUM_RANKS * config.BATCH_SIZE * config.RECORD_LENGTH;
-    uint64_t train_size_bytes_per_rank =
-        (uint64_t)config.NUM_TRAIN_BATCHES_PER_RANK * config.BATCH_SIZE * config.RECORD_LENGTH;
+        (uint64_t)config.BATCH_SIZE * (config.TOTAL_TRAINING_STEPS == -1? config.NUM_TRAIN_BATCHES_PER_RANK * NUM_RANKS: config.TOTAL_TRAINING_STEPS) * config.RECORD_LENGTH;
+    uint64_t train_size_bytes_per_rank = train_total_size_bytes / NUM_RANKS;
 
     uint64_t  train_total_compute_time     = 0;
     uint64_t *train_compute_time_per_epoch = (uint64_t *)malloc(config.EPOCHS * sizeof(uint64_t));
@@ -404,25 +403,27 @@ print_average_data()
     value = format_human_readable(train_io_stdev);
     printf("Training IO standard deviation: %.3f %cB/s\n", value.value, value.unit);
 
-    value = format_human_readable(eval_total_size_bytes);
-    printf("Total evaluation set size: %.3lf %cB\n", value.value, value.unit);
-    value = format_human_readable(eval_size_bytes_per_rank);
-    printf("Evaluation set size per rank: %.3lf %cB\n", value.value, value.unit);
-    printf("Total evaluation emulated compute time: %.3lf s\n", eval_total_compute_time / 1000000.0);
-    printf("Evaluation metadata time: %.3lf s\n", eval_total_metadata_time / 1000000.0);
-    printf("Evaluation raw read time: %.3lf s\n", eval_total_read_time / 1000000.0);
-    value = format_human_readable(eval_total_avg_read_rate);
-    printf("Evaluation average raw read rate: %.3lf %cB/s\n", value.value, value.unit);
-    printf("Observed evaluation completion time: %.3lf s\n", eval_total_observed_time / 1000000.0);
-    value = format_human_readable(eval_total_avg_observed_rate);
-    printf("Observed average evaluation rate: %.3lf %cB/s\n", value.value, value.unit);
-    printf("Evaluation average throughput avg: %.3lf samples/s\n", eval_throughput_mean_samples_per_second);
-    printf("Evaluation throughput standard deviation: %.3lf samples/s\n",
-           eval_throughput_stdev_samples_per_second);
-    value = format_human_readable(eval_io_mean);
-    printf("Evaluation average IO: %.3lf %cB/s\n", value.value, value.unit);
-    value = format_human_readable(eval_io_stdev);
-    printf("Evaluation IO standard deviation: %.3lf %cB/s\n", value.value, value.unit);
+    if (config.DO_EVALUATION) {
+        value = format_human_readable(eval_total_size_bytes);
+        printf("Total evaluation set size: %.3lf %cB\n", value.value, value.unit);
+        value = format_human_readable(eval_size_bytes_per_rank);
+        printf("Evaluation set size per rank: %.3lf %cB\n", value.value, value.unit);
+        printf("Total evaluation emulated compute time: %.3lf s\n", eval_total_compute_time / 1000000.0);
+        printf("Evaluation metadata time: %.3lf s\n", eval_total_metadata_time / 1000000.0);
+        printf("Evaluation raw read time: %.3lf s\n", eval_total_read_time / 1000000.0);
+        value = format_human_readable(eval_total_avg_read_rate);
+        printf("Evaluation average raw read rate: %.3lf %cB/s\n", value.value, value.unit);
+        printf("Observed evaluation completion time: %.3lf s\n", eval_total_observed_time / 1000000.0);
+        value = format_human_readable(eval_total_avg_observed_rate);
+        printf("Observed average evaluation rate: %.3lf %cB/s\n", value.value, value.unit);
+        printf("Evaluation average throughput avg: %.3lf samples/s\n", eval_throughput_mean_samples_per_second);
+        printf("Evaluation throughput standard deviation: %.3lf samples/s\n",
+               eval_throughput_stdev_samples_per_second);
+        value = format_human_readable(eval_io_mean);
+        printf("Evaluation average IO: %.3lf %cB/s\n", value.value, value.unit);
+        value = format_human_readable(eval_io_stdev);
+        printf("Evaluation IO standard deviation: %.3lf %cB/s\n", value.value, value.unit);
+    }
 
     printf("===========================================================\n");
 
@@ -558,122 +559,124 @@ print_average_data()
     value = format_human_readable(train_io_stdev);
     fprintf(csv_file, "train io stdev, %.3lf, %cB/s\n", value.value, value.unit);
 
-    value = format_human_readable(eval_total_size_bytes);
-    fprintf(csv_file, "eval total size, %.3lf, %cB\n", value.value, value.unit);
-    value = format_human_readable(eval_size_bytes_per_rank);
-    fprintf(csv_file, "eval size per rank, %.3lf, %cB\n", value.value, value.unit);
-    fprintf(csv_file, "eval emulated compute time per epoch, \"");
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        fprintf(csv_file, "%.3lf", eval_compute_time_per_epoch[i] / 1000000.0);
-        if (i != config.EPOCHS - 1)
-            fprintf(csv_file, ", ");
-    }
-    fprintf(csv_file, "\", \"");
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        fprintf(csv_file, "s");
-        if (i != config.EPOCHS - 1)
-            fprintf(csv_file, ", ");
-    }
-    fprintf(csv_file, "\"\neval emulated compute time, %.3lf, s\n", eval_total_compute_time / 1000000.0);
-    fprintf(csv_file, "eval metadata time per epoch, \"");
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        fprintf(csv_file, "%.3lf", global_stats[i].metadata_time.eval / 1000000.0);
-        if (i != config.EPOCHS - 1)
-            fprintf(csv_file, ", ");
-    }
-    fprintf(csv_file, "\", \"");
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        fprintf(csv_file, "s");
-        if (i != config.EPOCHS - 1)
-            fprintf(csv_file, ", ");
-    }
-    fprintf(csv_file, "\"\neval metadata time, %.3lf, s\n", eval_total_metadata_time / 1000000.0);
-    fprintf(csv_file, "eval raw read time per epoch, \"");
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        fprintf(csv_file, "%.3lf", global_stats[i].raw_read_time.eval / 1000000.0);
-        if (i != config.EPOCHS - 1)
-            fprintf(csv_file, ", ");
-    }
-    fprintf(csv_file, "\", \"");
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        fprintf(csv_file, "s");
-        if (i != config.EPOCHS - 1)
-            fprintf(csv_file, ", ");
-    }
+    if (config.DO_EVALUATION) {
+        value = format_human_readable(eval_total_size_bytes);
+        fprintf(csv_file, "eval total size, %.3lf, %cB\n", value.value, value.unit);
+        value = format_human_readable(eval_size_bytes_per_rank);
+        fprintf(csv_file, "eval size per rank, %.3lf, %cB\n", value.value, value.unit);
+        fprintf(csv_file, "eval emulated compute time per epoch, \"");
+        for (uint32_t i = 0; i < config.EPOCHS; i++) {
+            fprintf(csv_file, "%.3lf", eval_compute_time_per_epoch[i] / 1000000.0);
+            if (i != config.EPOCHS - 1)
+                fprintf(csv_file, ", ");
+        }
+        fprintf(csv_file, "\", \"");
+        for (uint32_t i = 0; i < config.EPOCHS; i++) {
+            fprintf(csv_file, "s");
+            if (i != config.EPOCHS - 1)
+                fprintf(csv_file, ", ");
+        }
+        fprintf(csv_file, "\"\neval emulated compute time, %.3lf, s\n", eval_total_compute_time / 1000000.0);
+        fprintf(csv_file, "eval metadata time per epoch, \"");
+        for (uint32_t i = 0; i < config.EPOCHS; i++) {
+            fprintf(csv_file, "%.3lf", global_stats[i].metadata_time.eval / 1000000.0);
+            if (i != config.EPOCHS - 1)
+                fprintf(csv_file, ", ");
+        }
+        fprintf(csv_file, "\", \"");
+        for (uint32_t i = 0; i < config.EPOCHS; i++) {
+            fprintf(csv_file, "s");
+            if (i != config.EPOCHS - 1)
+                fprintf(csv_file, ", ");
+        }
+        fprintf(csv_file, "\"\neval metadata time, %.3lf, s\n", eval_total_metadata_time / 1000000.0);
+        fprintf(csv_file, "eval raw read time per epoch, \"");
+        for (uint32_t i = 0; i < config.EPOCHS; i++) {
+            fprintf(csv_file, "%.3lf", global_stats[i].raw_read_time.eval / 1000000.0);
+            if (i != config.EPOCHS - 1)
+                fprintf(csv_file, ", ");
+        }
+        fprintf(csv_file, "\", \"");
+        for (uint32_t i = 0; i < config.EPOCHS; i++) {
+            fprintf(csv_file, "s");
+            if (i != config.EPOCHS - 1)
+                fprintf(csv_file, ", ");
+        }
 
-    fprintf(csv_file, "\"\neval total raw read time, %.3lf, s\n", eval_total_read_time / 1000000.0);
-    fprintf(csv_file, "eval raw read rate per epoch, \"");
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        if (global_stats[i].raw_read_time.eval == 0) {
-            units[i] = ' ';
-            fprintf(csv_file, "NaN");
+        fprintf(csv_file, "\"\neval total raw read time, %.3lf, s\n", eval_total_read_time / 1000000.0);
+        fprintf(csv_file, "eval raw read rate per epoch, \"");
+        for (uint32_t i = 0; i < config.EPOCHS; i++) {
+            if (global_stats[i].raw_read_time.eval == 0) {
+                units[i] = ' ';
+                fprintf(csv_file, "NaN");
+            }
+            else {
+                value = format_human_readable(eval_size_bytes_per_rank / global_stats[i].raw_read_time.eval *
+                                              1000000.0);
+                units[i] = value.unit;
+                fprintf(csv_file, "%.3lf", value.value);
+            }
+            if (i != config.EPOCHS - 1)
+                fprintf(csv_file, ", ");
         }
-        else {
-            value    = format_human_readable(eval_size_bytes_per_rank / global_stats[i].raw_read_time.eval *
-                                          1000000.0);
+        fprintf(csv_file, "\", \"");
+        for (uint32_t i = 0; i < config.EPOCHS; i++) {
+            fprintf(csv_file, "%cB/s", units[i]);
+            if (i != config.EPOCHS - 1)
+                fprintf(csv_file, ", ");
+        }
+        value = format_human_readable(eval_total_avg_read_rate);
+        fprintf(csv_file, "\"\neval avg raw read rate, %.3lf, %cB/s\n", value.value, value.unit);
+        fprintf(csv_file, "eval observed time per epoch, \"");
+        for (uint32_t i = 0; i < config.EPOCHS; i++) {
+            fprintf(csv_file, "%.3lf", global_stats[i].observed_time.eval / 1000000.0);
+            if (i != config.EPOCHS - 1)
+                fprintf(csv_file, ", ");
+        }
+        fprintf(csv_file, "\", \"");
+        for (uint32_t i = 0; i < config.EPOCHS; i++) {
+            fprintf(csv_file, "s");
+            if (i != config.EPOCHS - 1)
+                fprintf(csv_file, ", ");
+        }
+        fprintf(csv_file, "\"\neval observed time, %.3lf, s\n", eval_total_observed_time / 1000000.0);
+        fprintf(csv_file, "eval observed rate per epoch, \"");
+        for (uint32_t i = 0; i < config.EPOCHS; i++) {
+            value    = format_human_readable(eval_avg_observed_rate_per_epoch[i]);
             units[i] = value.unit;
             fprintf(csv_file, "%.3lf", value.value);
+            if (i != config.EPOCHS - 1)
+                fprintf(csv_file, ", ");
         }
-        if (i != config.EPOCHS - 1)
-            fprintf(csv_file, ", ");
-    }
-    fprintf(csv_file, "\", \"");
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        fprintf(csv_file, "%cB/s", units[i]);
-        if (i != config.EPOCHS - 1)
-            fprintf(csv_file, ", ");
-    }
-    value = format_human_readable(eval_total_avg_read_rate);
-    fprintf(csv_file, "\"\neval avg raw read rate, %.3lf, %cB/s\n", value.value, value.unit);
-    fprintf(csv_file, "eval observed time per epoch, \"");
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        fprintf(csv_file, "%.3lf", global_stats[i].observed_time.eval / 1000000.0);
-        if (i != config.EPOCHS - 1)
-            fprintf(csv_file, ", ");
-    }
-    fprintf(csv_file, "\", \"");
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        fprintf(csv_file, "s");
-        if (i != config.EPOCHS - 1)
-            fprintf(csv_file, ", ");
-    }
-    fprintf(csv_file, "\"\neval observed time, %.3lf, s\n", eval_total_observed_time / 1000000.0);
-    fprintf(csv_file, "eval observed rate per epoch, \"");
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        value    = format_human_readable(eval_avg_observed_rate_per_epoch[i]);
-        units[i] = value.unit;
-        fprintf(csv_file, "%.3lf", value.value);
-        if (i != config.EPOCHS - 1)
-            fprintf(csv_file, ", ");
-    }
-    fprintf(csv_file, "\", \"");
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        fprintf(csv_file, "%cB/s", units[i]);
-        if (i != config.EPOCHS - 1)
-            fprintf(csv_file, ", ");
-    }
-    value = format_human_readable(eval_total_avg_observed_rate);
-    fprintf(csv_file, "\"\neval avg observed rate, %.3lf, %cB/s\n", value.value, value.unit);
-    fprintf(csv_file, "eval throughput samples per second per epoch, \"");
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        fprintf(csv_file, "%.3lf", global_stats[i].throughput.eval);
-        if (i != config.EPOCHS - 1)
-            fprintf(csv_file, ", ");
-    }
-    fprintf(csv_file, "\", \"");
-    for (uint32_t i = 0; i < config.EPOCHS; i++) {
-        fprintf(csv_file, "samples/s");
-        if (i != config.EPOCHS - 1)
-            fprintf(csv_file, ", ");
+        fprintf(csv_file, "\", \"");
+        for (uint32_t i = 0; i < config.EPOCHS; i++) {
+            fprintf(csv_file, "%cB/s", units[i]);
+            if (i != config.EPOCHS - 1)
+                fprintf(csv_file, ", ");
+        }
+        value = format_human_readable(eval_total_avg_observed_rate);
+        fprintf(csv_file, "\"\neval avg observed rate, %.3lf, %cB/s\n", value.value, value.unit);
+        fprintf(csv_file, "eval throughput samples per second per epoch, \"");
+        for (uint32_t i = 0; i < config.EPOCHS; i++) {
+            fprintf(csv_file, "%.3lf", global_stats[i].throughput.eval);
+            if (i != config.EPOCHS - 1)
+                fprintf(csv_file, ", ");
+        }
+        fprintf(csv_file, "\", \"");
+        for (uint32_t i = 0; i < config.EPOCHS; i++) {
+            fprintf(csv_file, "samples/s");
+            if (i != config.EPOCHS - 1)
+                fprintf(csv_file, ", ");
+        }
+        fprintf(csv_file, "\"\neval throughput avg samples per second, %.3lf, samples/s\n",
+                eval_throughput_mean_samples_per_second);
+        fprintf(csv_file, "eval throughput stdev samples per second, %.3lf, samples/s\n",
+                eval_throughput_stdev_samples_per_second);
+        value = format_human_readable(eval_io_mean);
+        fprintf(csv_file, "eval io avg, %.3lf, %cB/s\n", value.value, value.unit);
+        value = format_human_readable(eval_io_stdev);
+        fprintf(csv_file, "eval io stdev, %.3lf, %cB/s\n", value.value, value.unit);
     }
-    fprintf(csv_file, "\"\neval throughput avg samples per second, %.3lf, samples/s\n",
-            eval_throughput_mean_samples_per_second);
-    fprintf(csv_file, "eval throughput stdev samples per second, %.3lf, samples/s\n",
-            eval_throughput_stdev_samples_per_second);
-    value = format_human_readable(eval_io_mean);
-    fprintf(csv_file, "eval io avg, %.3lf, %cB/s\n", value.value, value.unit);
-    value = format_human_readable(eval_io_stdev);
-    fprintf(csv_file, "eval io stdev, %.3lf, %cB/s\n", value.value, value.unit);
 
     fclose(csv_file);
     free(units);
@@ -1144,7 +1147,7 @@ end_train(uint32_t epoch, uint64_t metadata_time, uint64_t read_time)
         stats[epoch].throughput.train = NAN;
     }
     else {
-        stats[epoch].throughput.train = (double)config.NUM_TRAIN_BATCHES_PER_RANK * config.BATCH_SIZE *
+        stats[epoch].throughput.train = (double)config.BATCH_SIZE * (config.TOTAL_TRAINING_STEPS_PER_RANK == -1? config.NUM_TRAIN_BATCHES_PER_RANK: config.TOTAL_TRAINING_STEPS_PER_RANK) *
                                         1000000.0 / (end_time - stats[epoch].start_time.train);
     }
     stats[epoch].metadata_time.train = metadata_time;
diff --git a/dlio/utils.c b/dlio/utils.c
index 4acfd255..842d9e68 100644
--- a/dlio/utils.c
+++ b/dlio/utils.c
@@ -46,11 +46,9 @@ config_datatype_t config = {
     .SUBFILING         = false,
 
     // Reader
-    //    .DATA_LOADER = PYTORCH,
     .BATCH_SIZE      = 7,
     .BATCH_SIZE_EVAL = 2,
     .READ_THREADS    = 4,
-    //    .COMPUTATION_THREADS = 1,
     //    .PREFETCH_SIZE = 0,
     .DO_SHUFFLE = false, // sample shuffle vs file_shuffle
                          //    .TRANSFER_SIZE = 262144,
@@ -62,7 +60,8 @@ config_datatype_t config = {
     .EPOCHS                 = 5,
     .COMPUTATION_TIME       = 0.323f,
     .COMPUTATION_TIME_STDEV = 0.000f,
-    //    .TOTAL_TRAINING_STEPS = -1
+    .TOTAL_TRAINING_STEPS = -1,
+    .TOTAL_TRAINING_STEPS_PER_RANK = -1,
     .SEED_CHANGE_EPOCH = false,
     .RANDOM_SEED       = 42,
 
@@ -307,6 +306,10 @@ parse_args(int argc, char *argv[])
         else if (strcmp(argv[i], "--output-ranks-data") == 0) {
             config.OUTPUT_RANKS_DATA = true;
         }
+        else if (strcmp(argv[i], "--total-training-steps") == 0) {
+            i++;
+            config.TOTAL_TRAINING_STEPS = atoi(argv[i]);
+        }
         else {
             printf("WARNING: %s not found\n", argv[i]);
         }
diff --git a/dlio/utils.h b/dlio/utils.h
index 6221bbfa..a00b3d39 100644
--- a/dlio/utils.h
+++ b/dlio/utils.h
@@ -50,7 +50,8 @@ typedef struct config_datatype {
     uint32_t EPOCHS;
     float    COMPUTATION_TIME;
     float    COMPUTATION_TIME_STDEV;
-    //    long int TOTAL_TRAINING_STEPS = -1
+    uint32_t TOTAL_TRAINING_STEPS;
+    uint32_t TOTAL_TRAINING_STEPS_PER_RANK;
     bool SEED_CHANGE_EPOCH;
     int  RANDOM_SEED;
 

From ac6fd8f3424343dba7ff57a2a2ea5d7d5b4c5478 Mon Sep 17 00:00:00 2001
From: github-actions <github-actions[bot]@users.noreply.github.com>
Date: Tue, 6 Aug 2024 17:31:16 +0000
Subject: [PATCH 27/37] Committing clang-format changes

---
 dlio/h5bench_dlio.c | 37 ++++++++++++++++++++++++-------------
 dlio/stats.c        | 15 +++++++++++----
 dlio/utils.c        | 12 ++++++------
 dlio/utils.h        |  4 ++--
 4 files changed, 43 insertions(+), 25 deletions(-)

diff --git a/dlio/h5bench_dlio.c b/dlio/h5bench_dlio.c
index a18085a4..64e2c94f 100644
--- a/dlio/h5bench_dlio.c
+++ b/dlio/h5bench_dlio.c
@@ -395,9 +395,11 @@ train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadat
 
         uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
         batch_processed_train(epoch, t, t0);
-        if ((MY_RANK < config.TOTAL_TRAINING_STEPS % NUM_RANKS) && (i + 1 == config.TOTAL_TRAINING_STEPS_PER_RANK)) {
+        if ((MY_RANK < config.TOTAL_TRAINING_STEPS % NUM_RANKS) &&
+            (i + 1 == config.TOTAL_TRAINING_STEPS_PER_RANK)) {
             MPI_Barrier(rest_training_steps_comm);
-        } else {
+        }
+        else {
             MPI_Barrier(MPI_COMM_WORLD);
         }
 
@@ -451,7 +453,8 @@ train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t
         write(get_train_write_fd(), &batch, sizeof(batch));
     }
 
-    for (uint32_t i = config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN; i < config.NUM_TRAIN_BATCHES_PER_RANK; i++) {
+    for (uint32_t i = config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN; i < config.NUM_TRAIN_BATCHES_PER_RANK;
+         i++) {
         if (i == config.TOTAL_TRAINING_STEPS_PER_RANK) {
             break;
         }
@@ -470,9 +473,11 @@ train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t
         uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
         batch_processed_train(epoch, t, t0);
 
-        if ((MY_RANK < config.TOTAL_TRAINING_STEPS % NUM_RANKS) && (i + 1 == config.TOTAL_TRAINING_STEPS_PER_RANK)) {
+        if ((MY_RANK < config.TOTAL_TRAINING_STEPS % NUM_RANKS) &&
+            (i + 1 == config.TOTAL_TRAINING_STEPS_PER_RANK)) {
             MPI_Barrier(rest_training_steps_comm);
-        } else {
+        }
+        else {
             MPI_Barrier(MPI_COMM_WORLD);
         }
     }
@@ -489,9 +494,11 @@ train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t
 
         uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
         batch_processed_train(epoch, t, t0);
-        if ((MY_RANK < config.TOTAL_TRAINING_STEPS % NUM_RANKS) && (i + 1 == config.TOTAL_TRAINING_STEPS_PER_RANK)) {
+        if ((MY_RANK < config.TOTAL_TRAINING_STEPS % NUM_RANKS) &&
+            (i + 1 == config.TOTAL_TRAINING_STEPS_PER_RANK)) {
             MPI_Barrier(rest_training_steps_comm);
-        } else {
+        }
+        else {
             MPI_Barrier(MPI_COMM_WORLD);
         }
     }
@@ -612,19 +619,23 @@ init_global_variables()
                                                      ? config.NUM_EVAL_BATCHES_PER_RANK
                                                      : config.READ_THREADS;
 
-    if (config.TOTAL_TRAINING_STEPS != -1 && config.TOTAL_TRAINING_STEPS < config.NUM_TRAIN_BATCHES_PER_RANK * NUM_RANKS) {
+    if (config.TOTAL_TRAINING_STEPS != -1 &&
+        config.TOTAL_TRAINING_STEPS < config.NUM_TRAIN_BATCHES_PER_RANK * NUM_RANKS) {
         config.TOTAL_TRAINING_STEPS_PER_RANK = config.TOTAL_TRAINING_STEPS / NUM_RANKS;
         if (MY_RANK < config.TOTAL_TRAINING_STEPS % NUM_RANKS) {
             config.TOTAL_TRAINING_STEPS_PER_RANK++;
             MPI_Comm_split(MPI_COMM_WORLD, 0, MY_RANK, &rest_training_steps_comm);
-        } else {
+        }
+        else {
             MPI_Comm_split(MPI_COMM_WORLD, MPI_UNDEFINED, MY_RANK, &rest_training_steps_comm);
         }
 
-        config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN = config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN > config.TOTAL_TRAINING_STEPS_PER_RANK
-                                                          ? config.TOTAL_TRAINING_STEPS_PER_RANK
-                                                          : config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN;
-    } else {
+        config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN =
+            config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN > config.TOTAL_TRAINING_STEPS_PER_RANK
+                ? config.TOTAL_TRAINING_STEPS_PER_RANK
+                : config.NUM_OF_ACTUALLY_USED_PROCESSES_TRAIN;
+    }
+    else {
         config.TOTAL_TRAINING_STEPS = -1;
     }
 
diff --git a/dlio/stats.c b/dlio/stats.c
index eb7ff596..d1a52f6c 100644
--- a/dlio/stats.c
+++ b/dlio/stats.c
@@ -221,7 +221,10 @@ print_average_data()
     // Train
     // TODO: drop_last = false
     uint64_t train_total_size_bytes =
-        (uint64_t)config.BATCH_SIZE * (config.TOTAL_TRAINING_STEPS == -1? config.NUM_TRAIN_BATCHES_PER_RANK * NUM_RANKS: config.TOTAL_TRAINING_STEPS) * config.RECORD_LENGTH;
+        (uint64_t)config.BATCH_SIZE *
+        (config.TOTAL_TRAINING_STEPS == -1 ? config.NUM_TRAIN_BATCHES_PER_RANK * NUM_RANKS
+                                           : config.TOTAL_TRAINING_STEPS) *
+        config.RECORD_LENGTH;
     uint64_t train_size_bytes_per_rank = train_total_size_bytes / NUM_RANKS;
 
     uint64_t  train_total_compute_time     = 0;
@@ -416,7 +419,8 @@ print_average_data()
         printf("Observed evaluation completion time: %.3lf s\n", eval_total_observed_time / 1000000.0);
         value = format_human_readable(eval_total_avg_observed_rate);
         printf("Observed average evaluation rate: %.3lf %cB/s\n", value.value, value.unit);
-        printf("Evaluation average throughput avg: %.3lf samples/s\n", eval_throughput_mean_samples_per_second);
+        printf("Evaluation average throughput avg: %.3lf samples/s\n",
+               eval_throughput_mean_samples_per_second);
         printf("Evaluation throughput standard deviation: %.3lf samples/s\n",
                eval_throughput_stdev_samples_per_second);
         value = format_human_readable(eval_io_mean);
@@ -1147,8 +1151,11 @@ end_train(uint32_t epoch, uint64_t metadata_time, uint64_t read_time)
         stats[epoch].throughput.train = NAN;
     }
     else {
-        stats[epoch].throughput.train = (double)config.BATCH_SIZE * (config.TOTAL_TRAINING_STEPS_PER_RANK == -1? config.NUM_TRAIN_BATCHES_PER_RANK: config.TOTAL_TRAINING_STEPS_PER_RANK) *
-                                        1000000.0 / (end_time - stats[epoch].start_time.train);
+        stats[epoch].throughput.train =
+            (double)config.BATCH_SIZE *
+            (config.TOTAL_TRAINING_STEPS_PER_RANK == -1 ? config.NUM_TRAIN_BATCHES_PER_RANK
+                                                        : config.TOTAL_TRAINING_STEPS_PER_RANK) *
+            1000000.0 / (end_time - stats[epoch].start_time.train);
     }
     stats[epoch].metadata_time.train = metadata_time;
     stats[epoch].raw_read_time.train = read_time;
diff --git a/dlio/utils.c b/dlio/utils.c
index 842d9e68..2a6423d2 100644
--- a/dlio/utils.c
+++ b/dlio/utils.c
@@ -57,13 +57,13 @@ config_datatype_t config = {
     .DROP_LAST             = true,
 
     // Train
-    .EPOCHS                 = 5,
-    .COMPUTATION_TIME       = 0.323f,
-    .COMPUTATION_TIME_STDEV = 0.000f,
-    .TOTAL_TRAINING_STEPS = -1,
+    .EPOCHS                        = 5,
+    .COMPUTATION_TIME              = 0.323f,
+    .COMPUTATION_TIME_STDEV        = 0.000f,
+    .TOTAL_TRAINING_STEPS          = -1,
     .TOTAL_TRAINING_STEPS_PER_RANK = -1,
-    .SEED_CHANGE_EPOCH = false,
-    .RANDOM_SEED       = 42,
+    .SEED_CHANGE_EPOCH             = false,
+    .RANDOM_SEED                   = 42,
 
     // Evaluation
     .EVAL_TIME            = 0.323f,
diff --git a/dlio/utils.h b/dlio/utils.h
index a00b3d39..56287f9b 100644
--- a/dlio/utils.h
+++ b/dlio/utils.h
@@ -52,8 +52,8 @@ typedef struct config_datatype {
     float    COMPUTATION_TIME_STDEV;
     uint32_t TOTAL_TRAINING_STEPS;
     uint32_t TOTAL_TRAINING_STEPS_PER_RANK;
-    bool SEED_CHANGE_EPOCH;
-    int  RANDOM_SEED;
+    bool     SEED_CHANGE_EPOCH;
+    int      RANDOM_SEED;
 
     // Evaluation
     float    EVAL_TIME;

From c061bcffb69279b26af0832629ad67d2ef9b0381 Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Wed, 7 Aug 2024 19:38:45 +0200
Subject: [PATCH 28/37] Fix multiple issues identified in code review

---
 .github/workflows/h5bench-hdf5-develop.yml |  4 +-
 dlio/example_output.csv                    | 46 ++++++++++++
 dlio/h5bench_dlio.c                        | 86 ++++------------------
 dlio/stats.c                               | 18 +++--
 dlio/utils.c                               |  6 ++
 dlio/workers.c                             | 10 +++
 docs/source/dlio.rst                       |  1 +
 7 files changed, 93 insertions(+), 78 deletions(-)
 create mode 100644 dlio/example_output.csv

diff --git a/.github/workflows/h5bench-hdf5-develop.yml b/.github/workflows/h5bench-hdf5-develop.yml
index 2d6cd0ac..a14bdf53 100644
--- a/.github/workflows/h5bench-hdf5-develop.yml
+++ b/.github/workflows/h5bench-hdf5-develop.yml
@@ -419,10 +419,8 @@ jobs:
       - name: Test h5bench SYNC dlio
         run: |
           export HDF5_HOME=/opt/hdf5
-          export ABT_HOME=/opt/argobots
-          export ASYNC_HOME=/opt/vol-async
 
-          export LD_LIBRARY_PATH=$ASYNC_HOME/lib:$HDF5_HOME/lib:$ABT_HOME/lib:$LD_LIBRARY_PATH
+          export LD_LIBRARY_PATH=$HDF5_HOME/lib:$LD_LIBRARY_PATH
           
           cd build-sync
           ./h5bench --debug --abort-on-failure ../samples/sync-dlio.json
diff --git a/dlio/example_output.csv b/dlio/example_output.csv
new file mode 100644
index 00000000..e56f28b3
--- /dev/null
+++ b/dlio/example_output.csv
@@ -0,0 +1,46 @@
+metric, value, unit
+operation, dlio,
+ranks, 16,
+read threads, 0,
+subfiling, NO,
+chunking, NO,
+collective meta, YES,
+collective data, YES,
+train total size, 63.000, GB
+train size per rank, 3.938, GB
+train emulated compute time per epoch, "2.907, 2.907, 2.907, 2.907, 2.907", "s, s, s, s, s"
+train emulated compute time, 14.535, s
+train metadata time per epoch, "0.559, 0.502, 0.434, 0.406, 0.482", "s, s, s, s, s"
+train metadata time, 2.383, s
+train raw read time per epoch, "4.205, 3.835, 3.767, 3.758, 3.830", "s, s, s, s, s"
+train total raw read time, 19.395, s
+train raw read rate per epoch, "958.749, 1.026, 1.045, 1.047, 1.027", "MB/s, GB/s, GB/s, GB/s, GB/s"
+train avg raw read rate, 1.017, GB/s
+train observed time per epoch, "8.709, 8.125, 7.789, 7.824, 8.775", "s, s, s, s, s"
+train observed time, 41.223, s
+train observed rate per epoch, "694.948, 772.636, 825.826, 820.059, 687.088", "MB/s, MB/s, MB/s, MB/s, MB/s"
+train avg observed rate, 760.111, MB/s
+train throughput samples per second per epoch, "7.234, 7.753, 8.088, 8.052, 7.179", "samples/s, samples/s, samples/s, samples/s, samples/s"
+train throughput avg samples per second, 7.661, samples/s
+train throughput stdev samples per second, 0.389, samples/s
+train io avg, 490.330, MB/s
+train io stdev, 24.925, MB/s
+eval total size, 16.000, GB
+eval size per rank, 1.000, GB
+eval emulated compute time per epoch, "2.584, 2.584, 2.584, 2.584, 2.584", "s, s, s, s, s"
+eval emulated compute time, 12.920, s
+eval metadata time per epoch, "0.214, 0.151, 0.162, 0.141, 0.181", "s, s, s, s, s"
+eval metadata time, 0.848, s
+eval raw read time per epoch, "0.925, 0.913, 0.875, 0.853, 0.824", "s, s, s, s, s"
+eval total raw read time, 4.390, s
+eval raw read rate per epoch, "1.080, 1.095, 1.143, 1.171, 1.213", "GB/s, GB/s, GB/s, GB/s, GB/s"
+eval avg raw read rate, 1.141, GB/s
+eval observed time per epoch, "4.120, 3.904, 3.881, 3.867, 3.940", "s, s, s, s, s"
+eval observed time, 19.712, s
+eval observed rate per epoch, "666.574, 775.895, 789.646, 797.822, 755.279", "MB/s, MB/s, MB/s, MB/s, MB/s"
+eval avg observed rate, 757.043, MB/s
+eval throughput samples per second per epoch, "3.883, 4.099, 4.123, 4.137, 4.061", "samples/s, samples/s, samples/s, samples/s, samples/s"
+eval throughput avg samples per second, 4.061, samples/s
+eval throughput stdev samples per second, 0.092, samples/s
+eval io avg, 259.877, MB/s
+eval io stdev, 5.907, MB/s
diff --git a/dlio/h5bench_dlio.c b/dlio/h5bench_dlio.c
index 64e2c94f..fddab7a0 100644
--- a/dlio/h5bench_dlio.c
+++ b/dlio/h5bench_dlio.c
@@ -19,6 +19,8 @@
 #include "H5FDioc.h"
 #endif
 
+// Maximum size of randomly generated data per file. If the file size is larger than the specified value,
+// randomly generated data will be written to the file several times in a row. Default value is 2 GiB
 #define GENERATION_BUFFER_SIZE 2 * 1073741824lu
 
 // Global variables
@@ -28,6 +30,7 @@ uint32_t DIM;
 hid_t    DCPL, FAPL, DAPL, DXPL;
 MPI_Comm rest_training_steps_comm = MPI_COMM_WORLD;
 
+// Generating a dataset containing training data labels
 void
 generate_labels_dataset(hid_t file_id, hid_t filespace, hid_t memspace)
 {
@@ -53,6 +56,7 @@ generate_labels_dataset(hid_t file_id, hid_t filespace, hid_t memspace)
     H5Dclose(dataset_id);
 }
 
+// Generating a dataset containing random training data
 void
 generate_records_dataset(hid_t file_id, hid_t filespace, hid_t memspace, hid_t extra_memspace)
 {
@@ -93,6 +97,7 @@ generate_records_dataset(hid_t file_id, hid_t filespace, hid_t memspace, hid_t e
     H5Dclose(dataset_id);
 }
 
+// Generating an HDF5 file containing a dataset with random training data and a dataset with labels
 void
 generate_file(const char *file_name, hid_t labels_filespace, hid_t labels_memspace, hid_t records_filespace,
               hid_t records_memspace, hid_t extra_records_memspace)
@@ -106,6 +111,7 @@ generate_file(const char *file_name, hid_t labels_filespace, hid_t labels_memspa
     H5Fclose(file_id);
 }
 
+// Distribution of file generation work among MPI ranks
 void
 generate_data()
 {
@@ -170,6 +176,7 @@ generate_data()
     }
 }
 
+// Read a given sample from a given hdf5 file
 void
 read_sample(const char *file_path, uint32_t sample, uint64_t *metadata_time_out, uint64_t *read_time_out)
 {
@@ -213,6 +220,7 @@ read_sample(const char *file_path, uint32_t sample, uint64_t *metadata_time_out,
     compute(config.PREPROCESS_TIME, config.PREPROCESS_TIME_STDEV);
 }
 
+// Simulation of computations by means of sleep() function
 uint64_t
 compute(float time, float time_stdev)
 {
@@ -224,6 +232,7 @@ compute(float time, float time_stdev)
     return 0;
 }
 
+// Evaluation process simulation without the use of multiprocessing and workers
 void
 eval_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out,
                      uint64_t *local_read_time_out)
@@ -256,43 +265,9 @@ eval_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata
 
         t0 = get_time_usec_return_uint64();
     }
-
-    //    TODO: drop_data = False
-    //    for (uint32_t iteration = MY_RANK; iteration < total_samples - NUM_RANKS * samples_per_rank;
-    //    iteration += NUM_RANKS) {
-    //        uint32_t i = NUM_RANKS * samples_per_rank + iteration;
-    //        uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1;
-    //        uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
-    //        char file_path[256];
-    //        snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER,
-    //        config.VALID_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_EVAL);
-    //
-    //        uint64_t metadata_time = 0, read_time = 0;
-    //        read_sample(file_path, sample_num, &metadata_time, &read_time);
-    //        read_counter++;
-    //
-    //        *local_metadata_time_out += metadata_time;
-    //        *local_read_time_out += read_time;
-    //
-    //        if (read_counter % config.BATCH_SIZE_EVAL == 0){
-    //            batch_loaded_eval(epoch, t0);
-    //
-    //            uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV);
-    //            batch_processed_eval(epoch, t, t0);
-    //            read_counter = 0;
-    //
-    //            t0 = get_time_usec_return_uint64();
-    //        }
-    //    }
-    //
-    //    if (read_counter != 0) {
-    //        batch_loaded_eval(epoch, t0);
-    //
-    //        uint64_t t = compute(config.EVAL_TIME, config.EVAL_TIME_STDEV);
-    //        batch_processed_eval(epoch, t, t0);
-    //    }
 }
 
+// Evaluation process simulation using multiprocessing and workers
 void
 eval_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out)
 {
@@ -345,6 +320,7 @@ eval_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *
     }
 }
 
+// Preparing and selecting a way to simulate the evaluation process
 void
 eval(uint32_t epoch, uint32_t *indices, bool enable_multiprocessing)
 {
@@ -366,6 +342,7 @@ eval(uint32_t epoch, uint32_t *indices, bool enable_multiprocessing)
     end_eval(epoch, eval_metadata_time, eval_read_time);
 }
 
+// Training process simulation without the use of multiprocessing and workers
 void
 train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadata_time_out,
                       uint64_t *local_read_time_out)
@@ -405,43 +382,9 @@ train_without_workers(uint32_t epoch, uint32_t *indices, uint64_t *local_metadat
 
         t0 = get_time_usec_return_uint64();
     }
-
-    //    TODO: drop_data = True
-    //    for (uint32_t iteration = MY_RANK; iteration < total_samples - NUM_RANKS * samples_per_rank;
-    //    iteration += NUM_RANKS) {
-    //        uint32_t i = NUM_RANKS * samples_per_rank + iteration;
-    //        uint32_t file_num = indices[i] / config.NUM_SAMPLES_PER_FILE + 1;
-    //        uint32_t sample_num = indices[i] % config.NUM_SAMPLES_PER_FILE;
-    //        char file_path[256];
-    //        snprintf(file_path, sizeof(file_path), "%s/%s/%s_%u_of_%u.h5", config.DATA_FOLDER,
-    //        config.TRAIN_DATA_FOLDER, config.FILE_PREFIX, file_num, config.NUM_FILES_TRAIN);
-    //
-    //        uint64_t metadata_time = 0, read_time = 0;
-    //        read_sample(file_path, sample_num, &metadata_time, &read_time);
-    //        read_counter++;
-    //
-    //        *local_metadata_time_out += metadata_time;
-    //        *local_read_time_out += read_time;
-    //
-    //        if (read_counter % config.BATCH_SIZE == 0){
-    //            batch_loaded_train(epoch, t0);
-    //
-    //            uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
-    //            batch_processed_train(epoch, t, t0);
-    //
-    //            read_counter = 0;
-    //            t0 = get_time_usec_return_uint64();
-    //        }
-    //    }
-    //
-    //    if (read_counter != 0) {
-    //        batch_loaded_train(epoch, t0);
-    //
-    //        uint64_t t = compute(config.COMPUTATION_TIME, config.COMPUTATION_TIME_STDEV);
-    //        batch_processed_train(epoch, t, t0);
-    //    }
 }
 
+// Training process simulation using multiprocessing and workers
 void
 train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t *local_read_time_out)
 {
@@ -504,6 +447,7 @@ train_using_workers(uint32_t epoch, uint64_t *local_metadata_time_out, uint64_t
     }
 }
 
+// Preparing and selecting a way to simulate the training process
 void
 train(uint32_t epoch, uint32_t *indices, bool enable_multiprocessing)
 {
@@ -525,6 +469,7 @@ train(uint32_t epoch, uint32_t *indices, bool enable_multiprocessing)
     end_train(epoch, train_metadata_time, train_read_time);
 }
 
+// Starting the benchmark and simulation process of training and evaluation
 void
 run()
 {
@@ -579,6 +524,7 @@ run()
     MPI_Barrier(MPI_COMM_WORLD);
 }
 
+// Initialization of some global variables and settings for benchmark operation
 void
 init_global_variables()
 {
diff --git a/dlio/stats.c b/dlio/stats.c
index d1a52f6c..ae9e63e6 100644
--- a/dlio/stats.c
+++ b/dlio/stats.c
@@ -19,10 +19,10 @@ uint32_t *last_proc_eval;
 uint32_t *last_compute_train;
 uint32_t *last_compute_eval;
 
+// Initialization of variables for storing statistics information
 void
 stats_initialize()
 {
-    //    TODO: drop_last = False
     stats = (struct epoch_data *)malloc(config.EPOCHS * sizeof(struct epoch_data));
     if (stats == NULL) {
         exit(1);
@@ -89,6 +89,7 @@ stats_initialize()
     }
 }
 
+// Release of resources initialized for storing statistics information
 void
 stats_finalize()
 {
@@ -119,6 +120,7 @@ stats_finalize()
     free(global_stats);
 }
 
+//
 void
 prepare_data()
 {
@@ -215,11 +217,11 @@ prepare_data()
     }
 }
 
+// Preparing data obtained during benchmark execution for output
 void
 print_average_data()
 {
     // Train
-    // TODO: drop_last = false
     uint64_t train_total_size_bytes =
         (uint64_t)config.BATCH_SIZE *
         (config.TOTAL_TRAINING_STEPS == -1 ? config.NUM_TRAIN_BATCHES_PER_RANK * NUM_RANKS
@@ -302,7 +304,6 @@ print_average_data()
     double train_io_stdev = train_throughput_stdev_samples_per_second * config.RECORD_LENGTH;
 
     // Evaluation
-    // TODO: drop_last = False
     uint64_t eval_total_size_bytes = (uint64_t)config.NUM_EVAL_BATCHES_PER_RANK * NUM_RANKS *
                                      config.BATCH_SIZE_EVAL * config.RECORD_LENGTH;
     uint64_t eval_size_bytes_per_rank =
@@ -690,11 +691,11 @@ print_average_data()
     free(eval_avg_observed_rate_per_epoch);
 }
 
+// Output collected statistics on the current MPI rank
 void
 print_rank_data()
 {
     // Train
-    // TODO: drop_last = false
     uint64_t train_total_size_bytes =
         (uint64_t)config.NUM_TRAIN_BATCHES_PER_RANK * NUM_RANKS * config.BATCH_SIZE * config.RECORD_LENGTH;
     uint64_t train_size_bytes_per_rank =
@@ -774,7 +775,6 @@ print_rank_data()
     double train_io_stdev = train_throughput_stdev_samples_per_second * config.RECORD_LENGTH;
 
     // Evaluation
-    // TODO: drop_last = False
     uint64_t eval_total_size_bytes = (uint64_t)config.NUM_EVAL_BATCHES_PER_RANK * NUM_RANKS *
                                      config.BATCH_SIZE_EVAL * config.RECORD_LENGTH;
     uint64_t eval_size_bytes_per_rank =
@@ -1110,12 +1110,14 @@ print_rank_data()
     free(eval_avg_observed_rate_per_epoch);
 }
 
+// Saving the time spent on loading a batch during the training process
 void
 batch_loaded_train(uint32_t epoch, uint64_t t0)
 {
     stats[epoch].load.train[last_load_train[epoch]++] = (get_time_usec_return_uint64() - t0);
 }
 
+// Saving the time spent on processing a batch during the training process
 void
 batch_processed_train(uint32_t epoch, uint64_t computation_time, uint64_t t0)
 {
@@ -1123,12 +1125,14 @@ batch_processed_train(uint32_t epoch, uint64_t computation_time, uint64_t t0)
     stats[epoch].compute.train[last_compute_train[epoch]++] = computation_time;
 }
 
+// Saving the time spent on loading a batch during the evaluation process
 void
 batch_loaded_eval(uint32_t epoch, uint64_t t0)
 {
     stats[epoch].load.eval[last_load_eval[epoch]++] = (get_time_usec_return_uint64() - t0);
 }
 
+// Saving the time spent on processing a batch during the evaluation process
 void
 batch_processed_eval(uint32_t epoch, uint64_t computation_time, uint64_t t0)
 {
@@ -1136,12 +1140,14 @@ batch_processed_eval(uint32_t epoch, uint64_t computation_time, uint64_t t0)
     stats[epoch].compute.eval[last_compute_eval[epoch]++] = computation_time;
 }
 
+// Saving the start time of the training process
 void
 start_train(uint32_t epoch)
 {
     stats[epoch].start_time.train = get_time_usec_return_uint64();
 }
 
+// Saving data on the training process
 void
 end_train(uint32_t epoch, uint64_t metadata_time, uint64_t read_time)
 {
@@ -1161,12 +1167,14 @@ end_train(uint32_t epoch, uint64_t metadata_time, uint64_t read_time)
     stats[epoch].raw_read_time.train = read_time;
 }
 
+// Saving the start time of the evaluation process
 void
 start_eval(uint32_t epoch)
 {
     stats[epoch].start_time.eval = get_time_usec_return_uint64();
 }
 
+// Saving data on the evaluation process
 void
 end_eval(uint32_t epoch, uint64_t metadata_time, uint64_t read_time)
 {
diff --git a/dlio/utils.c b/dlio/utils.c
index 2a6423d2..e911f042 100644
--- a/dlio/utils.c
+++ b/dlio/utils.c
@@ -11,6 +11,7 @@
 
 #include "utils.h"
 
+// Returns the current time in microseconds
 uint64_t
 get_time_usec_return_uint64()
 {
@@ -86,6 +87,7 @@ config_datatype_t config = {
     .NUM_OF_ACTUALLY_USED_PROCESSES_EVAL  = 0,
 };
 
+// Creating a directory with a specified name
 void
 create_directory(const char *folder)
 {
@@ -98,6 +100,7 @@ create_directory(const char *folder)
     }
 }
 
+// Deleting a directory with a specified name
 void
 delete_directory(const char *dir_path)
 {
@@ -137,6 +140,7 @@ delete_directory(const char *dir_path)
     }
 }
 
+// Shuffle the values in the specified array
 void
 shuffle(uint32_t *array, size_t n)
 {
@@ -150,6 +154,7 @@ shuffle(uint32_t *array, size_t n)
     }
 }
 
+// Generation of normally distributed random number
 double
 generate_normal_random(float mean, float stdev)
 {
@@ -159,6 +164,7 @@ generate_normal_random(float mean, float stdev)
     return z0 * stdev + mean;
 }
 
+// Parsing of arguments that the program receives as input
 void
 parse_args(int argc, char *argv[])
 {
diff --git a/dlio/workers.c b/dlio/workers.c
index 7e9f544d..5b6ba6a9 100644
--- a/dlio/workers.c
+++ b/dlio/workers.c
@@ -13,6 +13,7 @@
 int pipe_train_task_fd[2], pipe_train_result_fd[2], pipe_eval_task_fd[2], pipe_eval_result_fd[2];
 int pipe_train_system_fd[2], pipe_eval_system_fd[2];
 
+// Initialization of processes that will be used later on in the simulation of data processing
 void
 init_workers(uint32_t *indices_train, uint32_t *indices_eval)
 {
@@ -79,42 +80,49 @@ init_workers(uint32_t *indices_train, uint32_t *indices_eval)
     close(pipe_train_system_fd[0]);
 }
 
+// Returns the file descriptor opened for reading and used to communicate with training workers
 int
 get_train_read_fd()
 {
     return pipe_train_result_fd[0];
 }
 
+// Returns the file descriptor opened for reading and used to communicate with evaluation workers
 int
 get_eval_read_fd()
 {
     return pipe_eval_result_fd[0];
 }
 
+// Returns the file descriptor opened for writing and used to communicate with training workers
 int
 get_train_write_fd()
 {
     return pipe_train_task_fd[1];
 }
 
+// Returns the file descriptor opened for writing and used to communicate with evaluation workers
 int
 get_eval_write_fd()
 {
     return pipe_eval_task_fd[1];
 }
 
+// Returns the file descriptor opened for writing and used to manage the training workers
 int
 get_train_system_fd()
 {
     return pipe_train_system_fd[1];
 }
 
+// Returns the file descriptor opened for writing and used to manage the evaluation workers
 int
 get_eval_system_fd()
 {
     return pipe_eval_system_fd[1];
 }
 
+// Release all resources used by processes and the processes themselves
 void
 fin_workers()
 {
@@ -139,6 +147,7 @@ fin_workers()
     }
 }
 
+// Command all workers to shuffle data files
 void
 force_workers_to_shuffle(int read_fd, int write_fd, int system_fd)
 {
@@ -156,6 +165,7 @@ force_workers_to_shuffle(int read_fd, int write_fd, int system_fd)
     }
 }
 
+// Starting a worker waiting for commands to read data batches
 void
 run_worker(uint32_t *indices, int pipe_task_fd[2], int pipe_result_fd[2], int pipe_system_fd[2],
            bool is_train_worker)
diff --git a/docs/source/dlio.rst b/docs/source/dlio.rst
index 29fe1875..b8fac7f2 100644
--- a/docs/source/dlio.rst
+++ b/docs/source/dlio.rst
@@ -34,6 +34,7 @@ As in the case with other extensions, the following parameters should be specifi
 ``preprocess-time``        Preprocessing time after reading each sample in seconds               float    0.0
 ``preprocess-time-stdev``  Standard deviation in preprocessing time in seconds                   float    0.0
 ``epochs``                 The number of epochs                                                  int      5
+``total-training-steps``   Maximum number of training steps per epoch (-1: no limit)             int      -1
 ``computation-time``       Computation time after reading each batch in seconds                  float    0.323
 ``computation-time-stdev`` Standard deviation in computation time in seconds                     float    0.0
 ``random-seed``            Random seed to be used                                                int      42

From c9fbed92348d7e4639c90b342fc0c8535a60fb85 Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Wed, 7 Aug 2024 21:14:15 +0200
Subject: [PATCH 29/37] Update the sample configuration file to reduce the
 runtime of the github workflow job

---
 samples/sync-dlio.json | 37 +++++++------------------------------
 1 file changed, 7 insertions(+), 30 deletions(-)

diff --git a/samples/sync-dlio.json b/samples/sync-dlio.json
index 577d9fd0..77add7cf 100644
--- a/samples/sync-dlio.json
+++ b/samples/sync-dlio.json
@@ -12,29 +12,6 @@
   },
   "directory": "storage",
   "benchmarks": [
-    {
-      "benchmark": "dlio",
-      "configuration": {
-        "generate-data": "true",
-        "chunking": "false",
-        "keep-files": "true",
-        "compression": "true",
-        "record-length": "67108864",
-        "num-files-train": "8",
-        "num-files-eval": "2",
-        "num-samples-per-file": "4",
-        "data-folder": "data",
-        "file-prefix": "img",
-        "compression-level": "4",
-        "random-seed": "42",
-        "train-data-folder": "train",
-        "valid-data-folder": "valid",
-        "records-dataset-name": "records",
-        "labels-dataset-name": "labels",
-        "output-csv-name": "output",
-        "output-ranks-data": "true"
-      }
-    },
     {
       "benchmark": "dlio",
       "configuration": {
@@ -43,22 +20,22 @@
         "keep-files": "true",
         "shuffle": "true",
         "seed-change-epoch": "true",
-        "record-length": "67108864",
+        "record-length": "1048576",
         "num-files-train": "8",
         "num-files-eval": "2",
         "num-samples-per-file": "4",
         "data-folder": "./data",
         "file-prefix": "img",
-        "batch-size": "7",
-        "batch-size-eval": "2",
-        "read-threads": "4",
+        "batch-size": "2",
+        "batch-size-eval": "1",
+        "read-threads": "1",
         "preprocess-time": "0.0",
         "preprocess-time-stdev": "0.0",
-        "epochs": "5",
-        "computation-time": "0.323",
+        "epochs": "1",
+        "computation-time": "0.123",
         "computation-time-stdev": "0.0",
         "random-seed": "42",
-        "eval-time": "0.323",
+        "eval-time": "0.123",
         "eval-time-stdev": "0.0",
         "epochs-between-evals": "1",
         "train-data-folder": "train",

From 0b1b0ccb481253f218a6d9676aaf575c8aba77e6 Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Wed, 7 Aug 2024 21:24:25 +0200
Subject: [PATCH 30/37] Update the sample configuration by adding a section
 with data generation

---
 samples/sync-dlio.json | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/samples/sync-dlio.json b/samples/sync-dlio.json
index 77add7cf..ec25d3b7 100644
--- a/samples/sync-dlio.json
+++ b/samples/sync-dlio.json
@@ -12,6 +12,28 @@
   },
   "directory": "storage",
   "benchmarks": [
+    {
+      "benchmark": "dlio",
+      "configuration": {
+        "generate-data": "true",
+        "chunking": "false",
+        "keep-files": "true",
+        "compression": "false",
+        "record-length": "1048576",
+        "num-files-train": "8",
+        "num-files-eval": "2",
+        "num-samples-per-file": "4",
+        "data-folder": "data",
+        "file-prefix": "img",
+        "random-seed": "42",
+        "train-data-folder": "train",
+        "valid-data-folder": "valid",
+        "records-dataset-name": "records",
+        "labels-dataset-name": "labels",
+        "output-csv-name": "output",
+        "output-ranks-data": "true"
+      }
+    },
     {
       "benchmark": "dlio",
       "configuration": {
@@ -24,7 +46,7 @@
         "num-files-train": "8",
         "num-files-eval": "2",
         "num-samples-per-file": "4",
-        "data-folder": "./data",
+        "data-folder": "data",
         "file-prefix": "img",
         "batch-size": "2",
         "batch-size-eval": "1",

From 2cf3cd3752ae06aee401266cc731fbeaa4432620 Mon Sep 17 00:00:00 2001
From: Jean Luca Bez <jeanlucabez@gmail.com>
Date: Wed, 7 Aug 2024 12:38:37 -0700
Subject: [PATCH 31/37] Update h5bench-hdf5-1.10.8.yml

---
 .github/workflows/h5bench-hdf5-1.10.8.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/h5bench-hdf5-1.10.8.yml b/.github/workflows/h5bench-hdf5-1.10.8.yml
index 4df604ec..d4f431c1 100644
--- a/.github/workflows/h5bench-hdf5-1.10.8.yml
+++ b/.github/workflows/h5bench-hdf5-1.10.8.yml
@@ -230,7 +230,7 @@ jobs:
 
       - name: Upload artifact
         if: always()
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v3
         with:
           name: test
           path: build/storage/**/std*

From c4b95342f7598deb969ea824827d3bdf76acf151 Mon Sep 17 00:00:00 2001
From: Jean Luca Bez <jeanlucabez@gmail.com>
Date: Wed, 7 Aug 2024 14:13:14 -0700
Subject: [PATCH 32/37] Update h5bench-hdf5-1.10.4.yml

---
 .github/workflows/h5bench-hdf5-1.10.4.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/h5bench-hdf5-1.10.4.yml b/.github/workflows/h5bench-hdf5-1.10.4.yml
index 25ae373e..2f7d0090 100644
--- a/.github/workflows/h5bench-hdf5-1.10.4.yml
+++ b/.github/workflows/h5bench-hdf5-1.10.4.yml
@@ -1,5 +1,8 @@
 name: h5bench (HDF5 1.10.4)
 
+env:
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
+
 on:
   pull_request:
 

From 4e7c11afa35705367a4474b10c27b87ca897182d Mon Sep 17 00:00:00 2001
From: Jean Luca Bez <jeanlucabez@gmail.com>
Date: Wed, 7 Aug 2024 14:28:54 -0700
Subject: [PATCH 33/37] Update h5bench-hdf5-1.10.7.yml

---
 .github/workflows/h5bench-hdf5-1.10.7.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/h5bench-hdf5-1.10.7.yml b/.github/workflows/h5bench-hdf5-1.10.7.yml
index 68bee711..f0631541 100644
--- a/.github/workflows/h5bench-hdf5-1.10.7.yml
+++ b/.github/workflows/h5bench-hdf5-1.10.7.yml
@@ -1,5 +1,8 @@
 name: h5bench (HDF5 1.10.7)
 
+env:
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
+
 on:
   pull_request:
 

From 9fd3ae5f795ad6745f1d5d2b1625362b347381dc Mon Sep 17 00:00:00 2001
From: Jean Luca Bez <jeanlucabez@gmail.com>
Date: Wed, 7 Aug 2024 14:29:04 -0700
Subject: [PATCH 34/37] Update h5bench-hdf5-1.10.8.yml

---
 .github/workflows/h5bench-hdf5-1.10.8.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/h5bench-hdf5-1.10.8.yml b/.github/workflows/h5bench-hdf5-1.10.8.yml
index d4f431c1..d31e6666 100644
--- a/.github/workflows/h5bench-hdf5-1.10.8.yml
+++ b/.github/workflows/h5bench-hdf5-1.10.8.yml
@@ -1,5 +1,8 @@
 name: h5bench (HDF5 1.10.8)
 
+env:
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
+
 on:
   pull_request:
 

From 56c6d0fb2bef5582b88de5b4eafd1c499d29751a Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Sat, 17 Aug 2024 13:17:50 +0200
Subject: [PATCH 35/37] Update the readthedocs page

---
 docs/source/dlio.rst | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/docs/source/dlio.rst b/docs/source/dlio.rst
index b8fac7f2..acd36728 100644
--- a/docs/source/dlio.rst
+++ b/docs/source/dlio.rst
@@ -3,8 +3,17 @@ DLIO
 
 The benchmark is designed to measure the performance of training and evaluation of deep learning models on data stored
 as HDF5 files. Based on collected and analysed I/O patterns from `DLIO Benchmark <https://github.com/argonne-lcf/dlio_benchmark>`_,
-this benchmark simulates the learning process and evaluation of deep learning models using PyTorch and Tensorflow
-frameworks, while gathering valuable information about system performance.
+this benchmark simulates the learning process and evaluation of deep learning models that use PyTorch and TensorFlow
+frameworks, while gathering valuable information about system performance. Most importantly, this extension allows users
+to test AI workloads without the need to install machine learning libraries, reducing complexity and enhancing the
+usability of the benchmark. Another advantage is that, in our experiments, our extension ran faster than DLIO Benchmark,
+which we suspect was due to the difference in the overhead introduced by the C application in our extension and the
+Python application in the original benchmark. While the quicker runtime could be beneficial for faster testing, it also
+suggests that the benchmark might not fully capture the complexity of real AI workloads, such as high metadata
+operations introduced by the use of Python-based libraries. The I/O pattern produced by this extension is based on the
+implementation of `DLIO benchmark version 1.1 <https://github.com/argonne-lcf/dlio_benchmark/releases/tag/v1.1>`_.
+Changes in the main DLIO Benchmark configurations after version 1.1 will not be reflected in this h5bench pattern. To
+reproduce them, DLIO Benchmark behavior can be studied using various I/O analysis tools. We recommend using Log VFD.
 
 Configuration
 -------------

From 33281ad628a257e33afea356a95e6110906e5881 Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Sat, 17 Aug 2024 13:21:27 +0200
Subject: [PATCH 36/37] Update the README file

---
 dlio/README.md | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/dlio/README.md b/dlio/README.md
index abcf29ad..cc00b018 100644
--- a/dlio/README.md
+++ b/dlio/README.md
@@ -1,9 +1,19 @@
 # DLIO benchmark
 
-The benchmark is designed to measure the performance of training and evaluation of deep learning models on data stored 
-as HDF5 files. Based on collected and analysed I/O patterns from [DLIO Benchmark](https://github.com/argonne-lcf/dlio_benchmark), 
-this benchmark simulates the learning process and evaluation of deep learning models using PyTorch and Tensorflow 
-frameworks, while gathering valuable information about system performance.
+The benchmark is designed to measure the performance of training and evaluation of deep learning models on data stored
+as HDF5 files. Based on collected and analysed I/O patterns from [DLIO Benchmark](https://github.com/argonne-lcf/dlio_benchmark),
+this benchmark simulates the learning process and evaluation of deep learning models that use PyTorch and TensorFlow
+frameworks, while gathering valuable information about system performance. Most importantly, this extension allows users
+to test AI workloads without the need to install machine learning libraries, reducing complexity and enhancing the
+usability of the benchmark. Another advantage is that, in our experiments, our extension ran faster than DLIO Benchmark,
+which we suspect is due to the difference in overhead between the C application in our extension and the
+Python application in the original benchmark. While the quicker runtime could be beneficial for faster testing, it also
+suggests that the benchmark might not fully capture the complexity of real AI workloads, such as high metadata
+operations introduced by the use of Python-based libraries. I/O pattern produced by this extension is based on the
+implementation of [DLIO benchmark version 1.1](https://github.com/argonne-lcf/dlio_benchmark/releases/tag/v1.1).
+Changes in the main DLIO Benchmark configurations after version 1.1 will not be reflected in this h5bench pattern. To
+reproduce them, DLIO Benchmark behavior can be studied using various I/O analysis tools. We recommend using Log VFD.
+
 
 ## Configuration
 As in the case with other extensions, the following parameters should be specified in the configuration section of the 

From a69307c675cc6b1b41fbcbb3a7d41c6446dcac59 Mon Sep 17 00:00:00 2001
From: Dlyaver Djebarov <dlyaver.djebarov@rwth-aachen.de>
Date: Wed, 28 Aug 2024 14:52:35 +0200
Subject: [PATCH 37/37] Update the documentation

---
 dlio/README.md       | 3 ++-
 docs/source/dlio.rst | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/dlio/README.md b/dlio/README.md
index cc00b018..1f8f8043 100644
--- a/dlio/README.md
+++ b/dlio/README.md
@@ -12,7 +12,8 @@ suggests that the benchmark might not fully capture the complexity of real AI wo
 operations introduced by the use of Python-based libraries. I/O pattern produced by this extension is based on the
 implementation of [DLIO benchmark version 1.1](https://github.com/argonne-lcf/dlio_benchmark/releases/tag/v1.1).
 Changes in the main DLIO Benchmark configurations after version 1.1 will not be reflected in this h5bench pattern. To
-reproduce them, DLIO Benchmark behavior can be studied using various I/O analysis tools. We recommend using Log VFD.
+reproduce them, users can study DLIO Benchmark behavior using various I/O analysis tools. We recommend using
+[Log VFD](https://docs.hdfgroup.org/hdf5/v1_14/group___f_a_p_l.html#ga4e03be2fe83ed02b32266a6c81427beb).
 
 
 ## Configuration
diff --git a/docs/source/dlio.rst b/docs/source/dlio.rst
index acd36728..afff6a22 100644
--- a/docs/source/dlio.rst
+++ b/docs/source/dlio.rst
@@ -13,7 +13,8 @@ suggests that the benchmark might not fully capture the complexity of real AI wo
 operations introduced by the use of Python-based libraries. I/O pattern produced by this extension is based on the
 implementation of `DLIO benchmark version 1.1 <https://github.com/argonne-lcf/dlio_benchmark/releases/tag/v1.1>`_.
 Changes in the main DLIO Benchmark configurations after version 1.1 will not be reflected in this h5bench pattern. To
-reproduce them, DLIO Benchmark behavior can be studied using various I/O analysis tools. We recommend using Log VFD.
+reproduce them, users can study DLIO Benchmark behavior using various I/O analysis tools. We recommend using
+`Log VFD <https://docs.hdfgroup.org/hdf5/v1_14/group___f_a_p_l.html#ga4e03be2fe83ed02b32266a6c81427beb>`_.
 
 Configuration
 -------------