From 66bc6a48d0d77e41beeab79055fc78ace368350f Mon Sep 17 00:00:00 2001
From: Selam Waktola <selamw@google.com>
Date: Wed, 4 Dec 2024 15:47:35 -0800
Subject: [PATCH] Adding tutorial for data loaders on gpu with jax (#109)

---
 docs/source/conf.py                           |    2 +
 .../source/data_loaders_on_cpu_with_jax.ipynb |   10 +-
 docs/source/data_loaders_on_cpu_with_jax.md   |   10 +-
 .../source/data_loaders_on_gpu_with_jax.ipynb | 1176 +++++++++++++++++
 docs/source/data_loaders_on_gpu_with_jax.md   |  650 +++++++++
 docs/source/tutorials.md                      |    1 +
 6 files changed, 1845 insertions(+), 4 deletions(-)
 create mode 100644 docs/source/data_loaders_on_gpu_with_jax.ipynb
 create mode 100644 docs/source/data_loaders_on_gpu_with_jax.md

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 1f9ab6a..ab865dc 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -67,6 +67,7 @@
     'JAX_time_series_classification.md',
     'JAX_transformer_text_classification.md',
     'data_loaders_on_cpu_with_jax.md',
+    'data_loaders_on_gpu_with_jax.md',
 ]
 
 suppress_warnings = [
@@ -104,4 +105,5 @@
     'JAX_time_series_classification.ipynb',
     'JAX_transformer_text_classification.ipynb',
     'data_loaders_on_cpu_with_jax.ipynb',
+    'data_loaders_on_gpu_with_jax.ipynb',
 ]
diff --git a/docs/source/data_loaders_on_cpu_with_jax.ipynb b/docs/source/data_loaders_on_cpu_with_jax.ipynb
index 21bd599..34a8445 100644
--- a/docs/source/data_loaders_on_cpu_with_jax.ipynb
+++ b/docs/source/data_loaders_on_cpu_with_jax.ipynb
@@ -24,7 +24,13 @@
     "- [**Grain**](https://github.com/google/grain)\n",
     "- [**Hugging Face**](https://huggingface.co/docs/datasets/en/use_with_jax#data-loading)\n",
     "\n",
-    "You'll see how to use each of these libraries to efficiently load data for a simple image classification task using the MNIST dataset."
+    "In this tutorial, you'll learn how to efficiently load data using these libraries for a simple image classification task based on the MNIST dataset.\n",
+    "\n",
+    "Compared to GPU or multi-device setups, CPU-based data loading is straightforward as it avoids challenges like GPU memory management and data synchronization across devices. This makes it ideal for smaller-scale tasks or scenarios where data resides exclusively on the CPU.\n",
+    "\n",
+    "If you're looking for GPU-specific data loading advice, see [Data Loaders on GPU](https://jax-ai-stack.readthedocs.io/en/latest/data_loaders_on_gpu_with_jax.html).\n",
+    "\n",
+    "If you're looking for a multi-device data loading strategy, see [Data Loaders on Multi-Device Setups](https://jax-ai-stack.readthedocs.io/en/latest/data_loaders_for_multi_device_setups_with_jax.html)."
    ]
   },
   {
@@ -1489,7 +1495,7 @@
    "source": [
     "## Summary\n",
     "\n",
-    "This notebook has guided you through efficient methods for loading data on a CPU when using JAX. You’ve learned how to leverage popular libraries such as PyTorch DataLoader, TensorFlow Datasets, Grain, and Hugging Face Datasets to streamline the data loading process for your machine learning tasks. Each of these methods offers unique advantages and considerations, allowing you to choose the best approach based on the specific needs of your project."
+    "This notebook has introduced efficient strategies for data loading on a CPU with JAX, demonstrating how to integrate popular libraries like PyTorch DataLoader, TensorFlow Datasets, Grain, and Hugging Face Datasets. Each library offers distinct advantages, enabling you to streamline the data loading process for machine learning tasks. By understanding the strengths of these methods, you can select the approach that best suits your project's specific requirements."
    ]
   }
  ],
diff --git a/docs/source/data_loaders_on_cpu_with_jax.md b/docs/source/data_loaders_on_cpu_with_jax.md
index f565d1d..d26c687 100644
--- a/docs/source/data_loaders_on_cpu_with_jax.md
+++ b/docs/source/data_loaders_on_cpu_with_jax.md
@@ -26,7 +26,13 @@ This tutorial explores different data loading strategies for using **JAX** on a
 - [**Grain**](https://github.com/google/grain)
 - [**Hugging Face**](https://huggingface.co/docs/datasets/en/use_with_jax#data-loading)
 
-You'll see how to use each of these libraries to efficiently load data for a simple image classification task using the MNIST dataset.
+In this tutorial, you'll learn how to efficiently load data using these libraries for a simple image classification task based on the MNIST dataset.
+
+Compared to GPU or multi-device setups, CPU-based data loading is straightforward as it avoids challenges like GPU memory management and data synchronization across devices. This makes it ideal for smaller-scale tasks or scenarios where data resides exclusively on the CPU.
+
+If you're looking for GPU-specific data loading advice, see [Data Loaders on GPU](https://jax-ai-stack.readthedocs.io/en/latest/data_loaders_on_gpu_with_jax.html).
+
+If you're looking for a multi-device data loading strategy, see [Data Loaders on Multi-Device Setups](https://jax-ai-stack.readthedocs.io/en/latest/data_loaders_for_multi_device_setups_with_jax.html).
 
 +++ {"id": "pEsb135zE-Jo"}
 
@@ -682,4 +688,4 @@ train_model(num_epochs, params, hf_training_generator)
 
 ## Summary
 
-This notebook has guided you through efficient methods for loading data on a CPU when using JAX. You’ve learned how to leverage popular libraries such as PyTorch DataLoader, TensorFlow Datasets, Grain, and Hugging Face Datasets to streamline the data loading process for your machine learning tasks. Each of these methods offers unique advantages and considerations, allowing you to choose the best approach based on the specific needs of your project.
+This notebook has introduced efficient strategies for data loading on a CPU with JAX, demonstrating how to integrate popular libraries like PyTorch DataLoader, TensorFlow Datasets, Grain, and Hugging Face Datasets. Each library offers distinct advantages, enabling you to streamline the data loading process for machine learning tasks. By understanding the strengths of these methods, you can select the approach that best suits your project's specific requirements.
diff --git a/docs/source/data_loaders_on_gpu_with_jax.ipynb b/docs/source/data_loaders_on_gpu_with_jax.ipynb
new file mode 100644
index 0000000..40c8ddc
--- /dev/null
+++ b/docs/source/data_loaders_on_gpu_with_jax.ipynb
@@ -0,0 +1,1176 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "PUFGZggH49zp"
+   },
+   "source": [
+    "# Introduction to Data Loaders on GPU with JAX"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "3ia4PKEV5Dr8"
+   },
+   "source": [
+    "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jax-ml/jax-ai-stack/blob/main/docs/source/data_loaders_on_gpu_with_jax.ipynb)\n",
+    "\n",
+    "This tutorial explores different data loading strategies for using **JAX** on a single [**GPU**](https://jax.readthedocs.io/en/latest/glossary.html#term-GPU). While JAX doesn't include a built-in data loader, it seamlessly integrates with popular data loading libraries, including:\n",
+    "*   [**PyTorch DataLoader**](https://github.com/pytorch/data)\n",
+    "*   [**TensorFlow Datasets (TFDS)**](https://github.com/tensorflow/datasets)\n",
+    "*   [**Grain**](https://github.com/google/grain)\n",
+    "*   [**Hugging Face**](https://huggingface.co/docs/datasets/en/use_with_jax#data-loading)\n",
+    "\n",
+    "You'll see how to use each of these libraries to efficiently load data for a simple image classification task using the MNIST dataset.\n",
+    "\n",
+    "Compared to [CPU-based loading](https://jax-ai-stack.readthedocs.io/en/latest/data_loaders_on_cpu_with_jax.html), working with a GPU introduces specific considerations like transferring data to the GPU using `device_put`, managing larger batch sizes for faster processing, and efficiently utilizing GPU memory. Unlike multi-device setups, this guide focuses on optimizing data handling for a single GPU.\n",
+    "\n",
+    "\n",
+    "If you're looking for CPU-specific data loading advice, see [Data Loaders on CPU](https://jax-ai-stack.readthedocs.io/en/latest/data_loaders_on_cpu_with_jax.html).\n",
+    "\n",
+    "If you're looking for a multi-device data loading strategy, see [Data Loaders on Multi-Device Setups](https://jax-ai-stack.readthedocs.io/en/latest/data_loaders_for_multi_device_setups_with_jax.html)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "-rsMgVtO6asW"
+   },
+   "source": [
+    "## Import JAX API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "tDJNQ6V-Dg5g"
+   },
+   "outputs": [],
+   "source": [
+    "import jax\n",
+    "import jax.numpy as jnp\n",
+    "from jax import grad, jit, vmap, random, device_put"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "TsFdlkSZKp9S"
+   },
+   "source": [
+    "## Checking GPU Availability for JAX"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "N3sqvaF3KJw1",
+    "outputId": "ab40f542-b8c0-422c-ca68-4ce292817889"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[CudaDevice(id=0)]"
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "jax.devices()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "qyJ_WTghDnIc"
+   },
+   "source": [
+    "## Setting Hyperparameters and Initializing Parameters\n",
+    "\n",
+    "You'll define hyperparameters for your model and data loading, including layer sizes, learning rate, batch size, and the data directory. You'll also initialize the weights and biases for a fully-connected neural network."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "qLNOSloFDka_"
+   },
+   "outputs": [],
+   "source": [
+    "# A helper function to randomly initialize weights and biases\n",
+    "# for a dense neural network layer\n",
+    "def random_layer_params(m, n, key, scale=1e-2):\n",
+    "  w_key, b_key = random.split(key)\n",
+    "  return scale * random.normal(w_key, (n, m)), scale * random.normal(b_key, (n,))\n",
+    "\n",
+    "# Function to initialize network parameters for all layers based on defined sizes\n",
+    "def init_network_params(sizes, key):\n",
+    "  keys = random.split(key, len(sizes))\n",
+    "  return [random_layer_params(m, n, k) for m, n, k in zip(sizes[:-1], sizes[1:], keys)]\n",
+    "\n",
+    "layer_sizes = [784, 512, 512, 10]  # Layers of the network\n",
+    "step_size = 0.01                   # Learning rate\n",
+    "num_epochs = 8                     # Number of training epochs\n",
+    "batch_size = 128                   # Batch size for training\n",
+    "n_targets = 10                     # Number of classes (digits 0-9)\n",
+    "num_pixels = 28 * 28               # Each MNIST image is 28x28 pixels\n",
+    "data_dir = '/tmp/mnist_dataset'    # Directory for storing the dataset\n",
+    "\n",
+    "# Initialize network parameters using the defined layer sizes and a random seed\n",
+    "params = init_network_params(layer_sizes, random.PRNGKey(0))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "rHLdqeI7D2WZ"
+   },
+   "source": [
+    "## Model Prediction with Auto-Batching\n",
+    "\n",
+    "In this section, you'll define the `predict` function for your neural network. This function computes the output of the network for a single input image.\n",
+    "\n",
+    "To efficiently process multiple images simultaneously, you'll use [`vmap`](https://jax.readthedocs.io/en/latest/_autosummary/jax.vmap.html#jax.vmap), which allows you to vectorize the `predict` function and apply it across a batch of inputs. This technique, called auto-batching, improves computational efficiency by leveraging hardware acceleration."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "bKIYPSkvD1QV"
+   },
+   "outputs": [],
+   "source": [
+    "from jax.scipy.special import logsumexp\n",
+    "\n",
+    "def relu(x):\n",
+    "  return jnp.maximum(0, x)\n",
+    "\n",
+    "def predict(params, image):\n",
+    "  # per-example predictions\n",
+    "  activations = image\n",
+    "  for w, b in params[:-1]:\n",
+    "    outputs = jnp.dot(w, activations) + b\n",
+    "    activations = relu(outputs)\n",
+    "\n",
+    "  final_w, final_b = params[-1]\n",
+    "  logits = jnp.dot(final_w, activations) + final_b\n",
+    "  return logits - logsumexp(logits)\n",
+    "\n",
+    "# Make a batched version of the `predict` function\n",
+    "batched_predict = vmap(predict, in_axes=(None, 0))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "rLqfeORsERek"
+   },
+   "source": [
+    "## Utility and Loss Functions\n",
+    "\n",
+    "You'll now define utility functions for:\n",
+    "- One-hot encoding: Converts class indices to binary vectors.\n",
+    "- Accuracy calculation: Measures the performance of the model on the dataset.\n",
+    "- Loss computation: Calculates the difference between predictions and targets.\n",
+    "\n",
+    "To optimize performance:\n",
+    "- [`grad`](https://jax.readthedocs.io/en/latest/_autosummary/jax.grad.html#jax.grad) is used to compute gradients of the loss function with respect to network parameters.\n",
+    "- [`jit`](https://jax.readthedocs.io/en/latest/_autosummary/jax.jit.html#jax.jit) compiles the update function, enabling faster execution by leveraging JAX's [XLA](https://openxla.org/xla) compilation.\n",
+    "\n",
+    "- [`device_put`](https://jax.readthedocs.io/en/latest/_autosummary/jax.device_put.html) is used to transfer each batch of data to the GPU."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "sA0a06raEQfS"
+   },
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "\n",
+    "def one_hot(x, k, dtype=jnp.float32):\n",
+    "  \"\"\"Create a one-hot encoding of x of size k.\"\"\"\n",
+    "  return jnp.array(x[:, None] == jnp.arange(k), dtype)\n",
+    "\n",
+    "def accuracy(params, images, targets):\n",
+    "  \"\"\"Calculate the accuracy of predictions.\"\"\"\n",
+    "  target_class = jnp.argmax(targets, axis=1)\n",
+    "  predicted_class = jnp.argmax(batched_predict(params, images), axis=1)\n",
+    "  return jnp.mean(predicted_class == target_class)\n",
+    "\n",
+    "def loss(params, images, targets):\n",
+    "  \"\"\"Calculate the loss between predictions and targets.\"\"\"\n",
+    "  preds = batched_predict(params, images)\n",
+    "  return -jnp.mean(preds * targets)\n",
+    "\n",
+    "@jit\n",
+    "def update(params, x, y):\n",
+    "  \"\"\"Update the network parameters using gradient descent.\"\"\"\n",
+    "  grads = grad(loss)(params, x, y)\n",
+    "  return [(w - step_size * dw, b - step_size * db)\n",
+    "          for (w, b), (dw, db) in zip(params, grads)]\n",
+    "\n",
+    "def reshape_and_one_hot(x, y):\n",
+    "    \"\"\"Reshape and one-hot encode the inputs.\"\"\"\n",
+    "    x = jnp.reshape(x, (len(x), num_pixels))\n",
+    "    y = one_hot(y, n_targets)\n",
+    "    return x, y\n",
+    "\n",
+    "def train_model(num_epochs, params, training_generator, data_loader_type='streamed'):\n",
+    "    \"\"\"Train the model for a given number of epochs, using device_put to transfer each batch to the GPU.\"\"\"\n",
+    "    for epoch in range(num_epochs):\n",
+    "        start_time = time.time()\n",
+    "        for x, y in training_generator() if data_loader_type == 'streamed' else training_generator:\n",
+    "            x, y = reshape_and_one_hot(x, y)\n",
+    "            x, y = device_put(x), device_put(y)\n",
+    "            params = update(params, x, y)\n",
+    "\n",
+    "        print(f\"Epoch {epoch + 1} in {time.time() - start_time:.2f} sec: \"\n",
+    "              f\"Train Accuracy: {accuracy(params, train_images, train_labels):.4f}, \"\n",
+    "              f\"Test Accuracy: {accuracy(params, test_images, test_labels):.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Hsionp5IYsQ9"
+   },
+   "source": [
+    "## Loading Data with PyTorch DataLoader\n",
+    "\n",
+    "This section shows how to load the MNIST dataset using PyTorch's DataLoader, convert the data to NumPy arrays, and apply transformations to flatten and cast images."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "uA7XY0OezHse",
+    "outputId": "4c86f455-ff1d-474e-f8e3-7111d9b56996"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.5.1+cu121)\n",
+      "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (0.20.1+cu121)\n",
+      "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.16.1)\n",
+      "Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch) (4.12.2)\n",
+      "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.4.2)\n",
+      "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.4)\n",
+      "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch) (2024.9.0)\n",
+      "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch) (1.13.1)\n",
+      "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch) (1.3.0)\n",
+      "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from torchvision) (1.26.4)\n",
+      "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision) (11.0.0)\n",
+      "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (3.0.2)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install torch torchvision"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "kO5_WzwY59gE"
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from jax.tree_util import tree_map\n",
+    "from torch.utils import data\n",
+    "from torchvision.datasets import MNIST"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "6f6qU8PCc143"
+   },
+   "outputs": [],
+   "source": [
+    "def numpy_collate(batch):\n",
+    "  \"\"\"Collate function to convert a batch of PyTorch data into NumPy arrays.\"\"\"\n",
+    "  return tree_map(np.asarray, data.default_collate(batch))\n",
+    "\n",
+    "class NumpyLoader(data.DataLoader):\n",
+    "    \"\"\"Custom DataLoader to return NumPy arrays from a PyTorch Dataset.\"\"\"\n",
+    "    def __init__(self, dataset, batch_size=1,\n",
+    "                  shuffle=False, sampler=None,\n",
+    "                  batch_sampler=None, num_workers=0,\n",
+    "                  pin_memory=False, drop_last=False,\n",
+    "                  timeout=0, worker_init_fn=None):\n",
+    "      super(self.__class__, self).__init__(dataset,\n",
+    "          batch_size=batch_size,\n",
+    "          shuffle=shuffle,\n",
+    "          sampler=sampler,\n",
+    "          batch_sampler=batch_sampler,\n",
+    "          num_workers=num_workers,\n",
+    "          collate_fn=numpy_collate,\n",
+    "          pin_memory=pin_memory,\n",
+    "          drop_last=drop_last,\n",
+    "          timeout=timeout,\n",
+    "          worker_init_fn=worker_init_fn)\n",
+    "class FlattenAndCast(object):\n",
+    "  \"\"\"Transform class to flatten and cast images to float32.\"\"\"\n",
+    "  def __call__(self, pic):\n",
+    "    return np.ravel(np.array(pic, dtype=jnp.float32))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "mfSnfJND6I8G"
+   },
+   "source": [
+    "### Load Dataset with Transformations\n",
+    "\n",
+    "Standardize the data by flattening the images, casting them to `float32`, and ensuring consistent data types."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Kxbl6bcx6crv"
+   },
+   "outputs": [],
+   "source": [
+    "mnist_dataset = MNIST(data_dir, download=True, transform=FlattenAndCast())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "kbdsqvPZGrsa"
+   },
+   "source": [
+    "### Full Training Dataset for Accuracy Checks\n",
+    "\n",
+    "Convert the entire training dataset into a flattened image array and one-hot encoded labels."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "c9ZCJq_rzPck"
+   },
+   "outputs": [],
+   "source": [
+    "train_images = np.array(mnist_dataset.data).reshape(len(mnist_dataset.data), -1)\n",
+    "train_labels = one_hot(np.array(mnist_dataset.targets), n_targets)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "WXUh0BwvG8Ko"
+   },
+   "source": [
+    "### Get Full Test Dataset\n",
+    "\n",
+    "Load and process the full test dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "brlLG4SqGphm"
+   },
+   "outputs": [],
+   "source": [
+    "mnist_dataset_test = MNIST(data_dir, download=True, train=False)\n",
+    "test_images = jnp.array(mnist_dataset_test.data.numpy().reshape(len(mnist_dataset_test.data), -1), dtype=jnp.float32)\n",
+    "test_labels = one_hot(np.array(mnist_dataset_test.targets), n_targets)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "Oz-UVnCxG5E8",
+    "outputId": "53f3fb32-5096-4862-e022-3c3a1d82137a"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train: (60000, 784) (60000, 10)\n",
+      "Test: (10000, 784) (10000, 10)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('Train:', train_images.shape, train_labels.shape)\n",
+    "print('Test:', test_images.shape, test_labels.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "mNjn9dMPitKL"
+   },
+   "source": [
+    "### Training Data Generator\n",
+    "\n",
+    "Define a generator function using PyTorch's DataLoader for batch training.\n",
+    "Setting `num_workers > 0` enables multi-process data loading, which can accelerate data loading for larger datasets or intensive preprocessing tasks. Experiment with different values to find the optimal setting for your hardware and workload.\n",
+    "\n",
+    "Note: When setting `num_workers > 0`, you may see the following `RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.`\n",
+    "This warning can be safely ignored since data loaders do not use JAX within the forked processes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "0LdT8P8aisWF"
+   },
+   "outputs": [],
+   "source": [
+    "def pytorch_training_generator(mnist_dataset):\n",
+    "    return NumpyLoader(mnist_dataset, batch_size=batch_size, num_workers=0)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Xzt2x9S1HC3T"
+   },
+   "source": [
+    "### Training Loop (PyTorch DataLoader)\n",
+    "\n",
+    "The training loop uses the PyTorch DataLoader to iterate through batches and update model parameters."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "SqweRz_98sN8",
+    "outputId": "bdd45256-3f5a-48f7-e45c-378078ac4279"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 1 in 20.23 sec: Train Accuracy: 0.9158, Test Accuracy: 0.9195\n",
+      "Epoch 2 in 14.64 sec: Train Accuracy: 0.9372, Test Accuracy: 0.9385\n",
+      "Epoch 3 in 3.91 sec: Train Accuracy: 0.9492, Test Accuracy: 0.9467\n",
+      "Epoch 4 in 3.85 sec: Train Accuracy: 0.9569, Test Accuracy: 0.9532\n",
+      "Epoch 5 in 4.48 sec: Train Accuracy: 0.9631, Test Accuracy: 0.9577\n",
+      "Epoch 6 in 4.03 sec: Train Accuracy: 0.9675, Test Accuracy: 0.9617\n",
+      "Epoch 7 in 3.86 sec: Train Accuracy: 0.9708, Test Accuracy: 0.9652\n",
+      "Epoch 8 in 4.57 sec: Train Accuracy: 0.9736, Test Accuracy: 0.9671\n"
+     ]
+    }
+   ],
+   "source": [
+    "train_model(num_epochs, params, pytorch_training_generator(mnist_dataset), data_loader_type='iterable')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Nm45ZTo6yrf5"
+   },
+   "source": [
+    "## Loading Data with TensorFlow Datasets (TFDS)\n",
+    "\n",
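+    "This section demonstrates how to load the MNIST dataset using TFDS, fetch the full dataset for evaluation, and define a training generator for batch processing. Because TensorFlow can reserve GPU memory that JAX needs, consider hiding the GPU from TensorFlow with `tf.config.set_visible_devices([], device_type='GPU')` before loading the data."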
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "sGaQAk1DHMUx"
+   },
+   "outputs": [],
+   "source": [
+    "import tensorflow_datasets as tfds"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ZSc5K0Eiwm4L"
+   },
+   "source": [
+    "### Fetch Full Dataset for Evaluation\n",
+    "\n",
+    "Load the dataset with `tfds.load`, convert it to NumPy arrays, and process it for evaluation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "1hOamw_7C8Pb"
+   },
+   "outputs": [],
+   "source": [
+    "# tfds.load returns tf.Tensors (or tf.data.Datasets if batch_size != -1)\n",
+    "mnist_data, info = tfds.load(name=\"mnist\", batch_size=-1, data_dir=data_dir, with_info=True)\n",
+    "mnist_data = tfds.as_numpy(mnist_data)\n",
+    "train_data, test_data = mnist_data['train'], mnist_data['test']\n",
+    "\n",
+    "# Full train set\n",
+    "train_images, train_labels = train_data['image'], train_data['label']\n",
+    "train_images = jnp.reshape(train_images, (len(train_images), num_pixels))\n",
+    "train_labels = one_hot(train_labels, n_targets)\n",
+    "\n",
+    "# Full test set\n",
+    "test_images, test_labels = test_data['image'], test_data['label']\n",
+    "test_images = jnp.reshape(test_images, (len(test_images), num_pixels))\n",
+    "test_labels = one_hot(test_labels, n_targets)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "Td3PiLdmEf7z",
+    "outputId": "b8c9a32a-9cf0-4dc3-cb51-db21d32c6545"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train: (60000, 784) (60000, 10)\n",
+      "Test: (10000, 784) (10000, 10)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('Train:', train_images.shape, train_labels.shape)\n",
+    "print('Test:', test_images.shape, test_labels.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "dXMvgk6sdq4j"
+   },
+   "source": [
+    "### Define the Training Generator\n",
+    "\n",
+    "Create a generator function to yield batches of data for training."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "vX59u8CqEf4J"
+   },
+   "outputs": [],
+   "source": [
+    "def training_generator():\n",
+    "  # as_supervised=True gives us the (image, label) as a tuple instead of a dict\n",
+    "  ds = tfds.load(name='mnist', split='train', as_supervised=True, data_dir=data_dir)\n",
+    "  # You can build up an arbitrary tf.data input pipeline\n",
+    "  ds = ds.batch(batch_size).prefetch(1)\n",
+    "  # tfds.as_numpy converts the tf.data.Dataset into an iterable of NumPy arrays\n",
+    "  return tfds.as_numpy(ds)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "EAWeUdnuFNBY"
+   },
+   "source": [
+    "### Training Loop (TFDS)\n",
+    "\n",
+    "Use the training generator in a custom training loop."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "h2sO13XDGvq1",
+    "outputId": "f30805bb-e725-46ee-e053-6e97f2af81c5"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 1 in 20.86 sec: Train Accuracy: 0.9253, Test Accuracy: 0.9268\n",
+      "Epoch 2 in 8.56 sec: Train Accuracy: 0.9428, Test Accuracy: 0.9413\n",
+      "Epoch 3 in 5.40 sec: Train Accuracy: 0.9532, Test Accuracy: 0.9511\n",
+      "Epoch 4 in 3.86 sec: Train Accuracy: 0.9598, Test Accuracy: 0.9555\n",
+      "Epoch 5 in 3.88 sec: Train Accuracy: 0.9652, Test Accuracy: 0.9601\n",
+      "Epoch 6 in 10.35 sec: Train Accuracy: 0.9692, Test Accuracy: 0.9631\n",
+      "Epoch 7 in 4.39 sec: Train Accuracy: 0.9726, Test Accuracy: 0.9650\n",
+      "Epoch 8 in 4.77 sec: Train Accuracy: 0.9753, Test Accuracy: 0.9669\n"
+     ]
+    }
+   ],
+   "source": [
+    "train_model(num_epochs, params, training_generator)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "-ryVkrAITS9Z"
+   },
+   "source": [
+    "## Loading Data with Grain\n",
+    "\n",
+    "This section demonstrates how to load MNIST data using Grain, a data-loading library. You'll define a custom dataset class for Grain and set up a Grain DataLoader for efficient training."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "waYhUMUGmhH-"
+   },
+   "source": [
+    "Install Grain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "L78o7eeyGvn5",
+    "outputId": "cb0ce6cf-243b-4183-8f63-646e00232caa"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: grain in /usr/local/lib/python3.10/dist-packages (0.2.2)\n",
+      "Requirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from grain) (1.4.0)\n",
+      "Requirement already satisfied: array-record in /usr/local/lib/python3.10/dist-packages (from grain) (0.5.1)\n",
+      "Requirement already satisfied: cloudpickle in /usr/local/lib/python3.10/dist-packages (from grain) (3.1.0)\n",
+      "Requirement already satisfied: dm-tree in /usr/local/lib/python3.10/dist-packages (from grain) (0.1.8)\n",
+      "Requirement already satisfied: etils[epath,epy] in /usr/local/lib/python3.10/dist-packages (from grain) (1.10.0)\n",
+      "Requirement already satisfied: jaxtyping in /usr/local/lib/python3.10/dist-packages (from grain) (0.2.36)\n",
+      "Requirement already satisfied: more-itertools>=9.1.0 in /usr/local/lib/python3.10/dist-packages (from grain) (10.5.0)\n",
+      "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from grain) (1.26.4)\n",
+      "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from etils[epath,epy]->grain) (2024.9.0)\n",
+      "Requirement already satisfied: importlib_resources in /usr/local/lib/python3.10/dist-packages (from etils[epath,epy]->grain) (6.4.5)\n",
+      "Requirement already satisfied: typing_extensions in /usr/local/lib/python3.10/dist-packages (from etils[epath,epy]->grain) (4.12.2)\n",
+      "Requirement already satisfied: zipp in /usr/local/lib/python3.10/dist-packages (from etils[epath,epy]->grain) (3.21.0)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install grain"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "66bH3ZDJ7Iat"
+   },
+   "source": [
+    "Import Required Libraries (import MNIST dataset from torchvision)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "mS62eVL9Ifmz"
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import grain.python as pygrain\n",
+    "from torchvision.datasets import MNIST"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "0h6mwVrspPA-"
+   },
+   "source": [
+    "### Define Dataset Class\n",
+    "\n",
+    "Create a custom dataset class to load MNIST data for Grain."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "bnrhac5Hh7y1"
+   },
+   "outputs": [],
+   "source": [
+    "class Dataset:\n",
+    "    def __init__(self, data_dir, train=True):\n",
+    "        self.data_dir = data_dir\n",
+    "        self.train = train\n",
+    "        self.load_data()\n",
+    "\n",
+    "    def load_data(self):\n",
+    "        self.dataset = MNIST(self.data_dir, download=True, train=self.train)\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return len(self.dataset)\n",
+    "\n",
+    "    def __getitem__(self, index):\n",
+    "        img, label = self.dataset[index]\n",
+    "        return np.ravel(np.array(img, dtype=np.float32)), label"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "53mf8bWEsyTr"
+   },
+   "source": [
+    "### Initialize the Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "pN3oF7-ostGE"
+   },
+   "outputs": [],
+   "source": [
+    "mnist_dataset = Dataset(data_dir)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "GqD-ycgBuwv9"
+   },
+   "source": [
+    "### Get the full train and test dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "f1VnTuX3u_kL"
+   },
+   "outputs": [],
+   "source": [
+    "# Convert training data to JAX arrays and encode labels as one-hot vectors\n",
+    "train_images = jnp.array(mnist_dataset.dataset.data.numpy().reshape(len(mnist_dataset), -1), dtype=jnp.float32)\n",
+    "train_labels = one_hot(np.array(mnist_dataset.dataset.targets), n_targets)\n",
+    "\n",
+    "# Load test dataset and process it\n",
+    "mnist_dataset_test = MNIST(data_dir, download=True, train=False)\n",
+    "test_images = jnp.array(mnist_dataset_test.data.numpy().reshape(len(mnist_dataset_test), -1), dtype=jnp.float32)\n",
+    "test_labels = one_hot(np.array(mnist_dataset_test.targets), n_targets)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "a2NHlp9klrQL",
+    "outputId": "c9422190-55e9-400b-bd4e-0e7bf23dc6a1"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train: (60000, 784) (60000, 10)\n",
+      "Test: (10000, 784) (10000, 10)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Train:\", train_images.shape, train_labels.shape)\n",
+    "print(\"Test:\", test_images.shape, test_labels.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "1QPbXt7O0JN-"
+   },
+   "source": [
+    "### Initialize PyGrain DataLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "2jqd1jJt25Bj"
+   },
+   "outputs": [],
+   "source": [
+    "sampler = pygrain.SequentialSampler(\n",
+    "    num_records=len(mnist_dataset),\n",
+    "    shard_options=pygrain.NoSharding()) # Single-device, no sharding\n",
+    "\n",
+    "def pygrain_training_generator():\n",
+    "    return pygrain.DataLoader(\n",
+    "        data_source=mnist_dataset,\n",
+    "        sampler=sampler,\n",
+    "        operations=[pygrain.Batch(batch_size)],\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "mV5z4GLCGKlx"
+   },
+   "source": [
+    "### Training Loop (Grain)\n",
+    "\n",
+    "Run the training loop using the Grain DataLoader."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "9-iANQ-9CcW_",
+    "outputId": "b0e19da2-9e34-4183-c5d8-af66de5efa5c"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 1 in 15.65 sec: Train Accuracy: 0.9158, Test Accuracy: 0.9195\n",
+      "Epoch 2 in 15.03 sec: Train Accuracy: 0.9372, Test Accuracy: 0.9385\n",
+      "Epoch 3 in 14.93 sec: Train Accuracy: 0.9492, Test Accuracy: 0.9467\n",
+      "Epoch 4 in 11.56 sec: Train Accuracy: 0.9569, Test Accuracy: 0.9532\n",
+      "Epoch 5 in 9.33 sec: Train Accuracy: 0.9631, Test Accuracy: 0.9577\n",
+      "Epoch 6 in 9.31 sec: Train Accuracy: 0.9675, Test Accuracy: 0.9617\n",
+      "Epoch 7 in 9.78 sec: Train Accuracy: 0.9708, Test Accuracy: 0.9652\n",
+      "Epoch 8 in 9.80 sec: Train Accuracy: 0.9736, Test Accuracy: 0.9671\n"
+     ]
+    }
+   ],
+   "source": [
+    "train_model(num_epochs, params, pygrain_training_generator)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "o51P6lr86wz-"
+   },
+   "source": [
+    "## Loading Data with Hugging Face\n",
+    "\n",
+    "This section demonstrates loading MNIST data using the Hugging Face `datasets` library. You'll format the dataset for JAX compatibility, prepare flattened images and one-hot-encoded labels, and define a training generator."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "69vrihaOi4Oz"
+   },
+   "source": [
+    "Install the Hugging Face `datasets` library."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "19ipxPhI6oSN",
+    "outputId": "b80b80cd-fc14-4a43-f8a8-2802de4faade"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (3.1.0)\n",
+      "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.16.1)\n",
+      "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.26.4)\n",
+      "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (17.0.0)\n",
+      "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n",
+      "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n",
+      "Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.32.3)\n",
+      "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.6)\n",
+      "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.5.0)\n",
+      "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n",
+      "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2024.9.0)\n",
+      "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.11.2)\n",
+      "Requirement already satisfied: huggingface-hub>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.26.2)\n",
+      "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.2)\n",
+      "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.2)\n",
+      "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.3)\n",
+      "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
+      "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n",
+      "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.5.0)\n",
+      "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n",
+      "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (0.2.0)\n",
+      "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.17.2)\n",
+      "Requirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
+      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.23.0->datasets) (4.12.2)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.4.0)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.10)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2.2.3)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2024.8.30)\n",
+      "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
+      "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
+      "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
+      "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install datasets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "8v1N59p76zn0"
+   },
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "8Gaj11tO7C86"
+   },
+   "source": [
+    "Load the MNIST dataset from Hugging Face and format it as `numpy` arrays for quick access (use the `jax` format instead to get JAX arrays)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "a22kTvgk6_fJ"
+   },
+   "outputs": [],
+   "source": [
+    "mnist_dataset = load_dataset(\"mnist\", cache_dir=data_dir).with_format(\"numpy\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "tgI7dIaX7JzM"
+   },
+   "source": [
+    "### Extract images and labels\n",
+    "\n",
+    "Get image shape and flatten for model input."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "NHrKatD_7HbH"
+   },
+   "outputs": [],
+   "source": [
+    "train_images = mnist_dataset[\"train\"][\"image\"]\n",
+    "train_labels = mnist_dataset[\"train\"][\"label\"]\n",
+    "test_images = mnist_dataset[\"test\"][\"image\"]\n",
+    "test_labels = mnist_dataset[\"test\"][\"label\"]\n",
+    "\n",
+    "# Extract image shape\n",
+    "image_shape = train_images.shape[1:]\n",
+    "num_features = image_shape[0] * image_shape[1]\n",
+    "\n",
+    "# Flatten the images\n",
+    "train_images = train_images.reshape(-1, num_features)\n",
+    "test_images = test_images.reshape(-1, num_features)\n",
+    "\n",
+    "# One-hot encode the labels\n",
+    "train_labels = one_hot(train_labels, n_targets)\n",
+    "test_labels = one_hot(test_labels, n_targets)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "dITh435Z7Nwb",
+    "outputId": "cc89c1ec-6987-4f1c-90a4-c3b355ea7225"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train: (60000, 784) (60000, 10)\n",
+      "Test: (10000, 784) (10000, 10)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('Train:', train_images.shape, train_labels.shape)\n",
+    "print('Test:', test_images.shape, test_labels.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "kk_4zJlz7T1E"
+   },
+   "source": [
+    "### Define Training Generator\n",
+    "\n",
+    "Set up a generator to yield batches of images and labels for training."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "-zLJhogj7RL-"
+   },
+   "outputs": [],
+   "source": [
+    "def hf_training_generator():\n",
+    "    \"\"\"Yield batches for training.\"\"\"\n",
+    "    for batch in mnist_dataset[\"train\"].iter(batch_size):\n",
+    "        x, y = batch[\"image\"], batch[\"label\"]\n",
+    "        yield x, y"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "HIsGfkLI7dvZ"
+   },
+   "source": [
+    "### Training Loop (Hugging Face Datasets)\n",
+    "\n",
+    "Run the training loop using the Hugging Face training generator."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "Ui6aLiZP7aLe",
+    "outputId": "c51529e0-563f-4af0-9793-76b5e6f323db"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 1 in 19.06 sec: Train Accuracy: 0.9158, Test Accuracy: 0.9195\n",
+      "Epoch 2 in 8.94 sec: Train Accuracy: 0.9372, Test Accuracy: 0.9385\n",
+      "Epoch 3 in 5.43 sec: Train Accuracy: 0.9492, Test Accuracy: 0.9467\n",
+      "Epoch 4 in 6.41 sec: Train Accuracy: 0.9569, Test Accuracy: 0.9532\n",
+      "Epoch 5 in 5.80 sec: Train Accuracy: 0.9631, Test Accuracy: 0.9577\n",
+      "Epoch 6 in 6.61 sec: Train Accuracy: 0.9675, Test Accuracy: 0.9617\n",
+      "Epoch 7 in 5.49 sec: Train Accuracy: 0.9708, Test Accuracy: 0.9652\n",
+      "Epoch 8 in 6.64 sec: Train Accuracy: 0.9736, Test Accuracy: 0.9671\n"
+     ]
+    }
+   ],
+   "source": [
+    "train_model(num_epochs, params, hf_training_generator)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "rCJq2rvKlKWX"
+   },
+   "source": [
+    "## Summary\n",
+    "\n",
+    "This notebook explored efficient methods for loading data on a GPU with JAX, using libraries such as PyTorch DataLoader, TensorFlow Datasets, Grain, and Hugging Face Datasets. You also learned GPU-specific optimizations, including using `device_put` for data transfer and managing GPU memory, to enhance training efficiency. Each method offers unique benefits, allowing you to choose the best approach based on your project requirements."
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "T4",
+   "provenance": []
+  },
+  "jupytext": {
+   "formats": "ipynb,md:myst"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/docs/source/data_loaders_on_gpu_with_jax.md b/docs/source/data_loaders_on_gpu_with_jax.md
new file mode 100644
index 0000000..a83ec4c
--- /dev/null
+++ b/docs/source/data_loaders_on_gpu_with_jax.md
@@ -0,0 +1,650 @@
+---
+jupytext:
+  formats: ipynb,md:myst
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.15.2
+kernelspec:
+  display_name: Python 3
+  name: python3
+---
+
++++ {"id": "PUFGZggH49zp"}
+
+# Introduction to Data Loaders on GPU with JAX
+
++++ {"id": "3ia4PKEV5Dr8"}
+
+[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jax-ml/jax-ai-stack/blob/main/docs/source/data_loaders_on_gpu_with_jax.ipynb)
+
+This tutorial explores different data loading strategies for using **JAX** on a single [**GPU**](https://jax.readthedocs.io/en/latest/glossary.html#term-GPU). While JAX doesn't include a built-in data loader, it seamlessly integrates with popular data loading libraries, including:
+*   [**PyTorch DataLoader**](https://github.com/pytorch/data)
+*   [**TensorFlow Datasets (TFDS)**](https://github.com/tensorflow/datasets)
+*   [**Grain**](https://github.com/google/grain)
+*   [**Hugging Face**](https://huggingface.co/docs/datasets/en/use_with_jax#data-loading)
+
+You'll see how to use each of these libraries to efficiently load data for a simple image classification task using the MNIST dataset.
+
+Compared to [CPU-based loading](https://jax-ai-stack.readthedocs.io/en/latest/data_loaders_on_cpu_with_jax.html), working with a GPU introduces specific considerations like transferring data to the GPU using `device_put`, managing larger batch sizes for faster processing, and efficiently utilizing GPU memory. Unlike multi-device setups, this guide focuses on optimizing data handling for a single GPU.
+
+
+If you're looking for CPU-specific data loading advice, see [Data Loaders on CPU](https://jax-ai-stack.readthedocs.io/en/latest/data_loaders_on_cpu_with_jax.html).
+
+If you're looking for a multi-device data loading strategy, see [Data Loaders on Multi-Device Setups](https://jax-ai-stack.readthedocs.io/en/latest/data_loaders_for_multi_device_setups_with_jax.html).
+
++++ {"id": "-rsMgVtO6asW"}
+
+## Import JAX API
+
+```{code-cell}
+:id: tDJNQ6V-Dg5g
+
+import jax
+import jax.numpy as jnp
+from jax import grad, jit, vmap, random, device_put
+```
+
++++ {"id": "TsFdlkSZKp9S"}
+
+## Checking GPU Availability for JAX
+
+```{code-cell}
+---
+colab:
+  base_uri: https://localhost:8080/
+id: N3sqvaF3KJw1
+outputId: ab40f542-b8c0-422c-ca68-4ce292817889
+---
+jax.devices()
+```
+
++++ {"id": "qyJ_WTghDnIc"}
+
+## Setting Hyperparameters and Initializing Parameters
+
+You'll define hyperparameters for your model and data loading, including layer sizes, learning rate, batch size, and the data directory. You'll also initialize the weights and biases for a fully-connected neural network.
+
+```{code-cell}
+:id: qLNOSloFDka_
+
+# A helper function to randomly initialize weights and biases
+# for a dense neural network layer
+def random_layer_params(m, n, key, scale=1e-2):
+  w_key, b_key = random.split(key)
+  return scale * random.normal(w_key, (n, m)), scale * random.normal(b_key, (n,))
+
+# Function to initialize network parameters for all layers based on defined sizes
+def init_network_params(sizes, key):
+  keys = random.split(key, len(sizes))
+  return [random_layer_params(m, n, k) for m, n, k in zip(sizes[:-1], sizes[1:], keys)]
+
+layer_sizes = [784, 512, 512, 10]  # Layers of the network
+step_size = 0.01                   # Learning rate
+num_epochs = 8                     # Number of training epochs
+batch_size = 128                   # Batch size for training
+n_targets = 10                     # Number of classes (digits 0-9)
+num_pixels = 28 * 28               # Each MNIST image is 28x28 pixels
+data_dir = '/tmp/mnist_dataset'    # Directory for storing the dataset
+
+# Initialize network parameters using the defined layer sizes and a random seed
+params = init_network_params(layer_sizes, random.PRNGKey(0))
+```
+
++++ {"id": "rHLdqeI7D2WZ"}
+
+## Model Prediction with Auto-Batching
+
+In this section, you'll define the `predict` function for your neural network. This function computes the output of the network for a single input image.
+
+To efficiently process multiple images simultaneously, you'll use [`vmap`](https://jax.readthedocs.io/en/latest/_autosummary/jax.vmap.html#jax.vmap), which allows you to vectorize the `predict` function and apply it across a batch of inputs. This technique, called auto-batching, improves computational efficiency by leveraging hardware acceleration.
+
+```{code-cell}
+:id: bKIYPSkvD1QV
+
+from jax.scipy.special import logsumexp
+
+def relu(x):  # Rectified linear unit: elementwise max(0, x)
+  return jnp.maximum(0, x)
+
+def predict(params, image):
+  # per-example predictions
+  activations = image
+  for w, b in params[:-1]:
+    outputs = jnp.dot(w, activations) + b
+    activations = relu(outputs)
+
+  final_w, final_b = params[-1]
+  logits = jnp.dot(final_w, activations) + final_b
+  return logits - logsumexp(logits)
+
+# Make a batched version of the `predict` function
+batched_predict = vmap(predict, in_axes=(None, 0))
+```
+
++++ {"id": "rLqfeORsERek"}
+
+## Utility and Loss Functions
+
+You'll now define utility functions for:
+- One-hot encoding: Converts class indices to binary vectors.
+- Accuracy calculation: Measures the performance of the model on the dataset.
+- Loss computation: Calculates the difference between predictions and targets.
+
+To optimize performance:
+- [`grad`](https://jax.readthedocs.io/en/latest/_autosummary/jax.grad.html#jax.grad) is used to compute gradients of the loss function with respect to network parameters.
+- [`jit`](https://jax.readthedocs.io/en/latest/_autosummary/jax.jit.html#jax.jit) compiles the update function, enabling faster execution by leveraging JAX's [XLA](https://openxla.org/xla) compilation.
+
+- [`device_put`](https://jax.readthedocs.io/en/latest/_autosummary/jax.device_put.html) is used to transfer the dataset to the GPU.
+
+```{code-cell}
+:id: sA0a06raEQfS
+
+import time
+
+def one_hot(x, k, dtype=jnp.float32):
+  """Return a one-hot encoding of the class indices `x` with `k` columns, cast to `dtype`."""
+  return jnp.array(x[:, None] == jnp.arange(k), dtype)
+
+def accuracy(params, images, targets):
+  """Calculate the accuracy of predictions."""
+  target_class = jnp.argmax(targets, axis=1)
+  predicted_class = jnp.argmax(batched_predict(params, images), axis=1)
+  return jnp.mean(predicted_class == target_class)
+
+def loss(params, images, targets):
+  """Calculate the loss between predictions and targets."""
+  preds = batched_predict(params, images)
+  return -jnp.mean(preds * targets)
+
+@jit
+def update(params, x, y):
+  """Update the network parameters using gradient descent."""
+  grads = grad(loss)(params, x, y)
+  return [(w - step_size * dw, b - step_size * db)
+          for (w, b), (dw, db) in zip(params, grads)]
+
+def reshape_and_one_hot(x, y):
+    """Reshape and one-hot encode the inputs."""
+    x = jnp.reshape(x, (len(x), num_pixels))
+    y = one_hot(y, n_targets)
+    return x, y
+
+def train_model(num_epochs, params, training_generator, data_loader_type='streamed'):
+    """Train the model for a given number of epochs and device_put for GPU transfer."""
+    for epoch in range(num_epochs):
+        start_time = time.time()
+        for x, y in training_generator() if data_loader_type == 'streamed' else training_generator:
+            x, y = reshape_and_one_hot(x, y)
+            x, y = device_put(x), device_put(y)
+            params = update(params, x, y)
+
+        print(f"Epoch {epoch + 1} in {time.time() - start_time:.2f} sec: "
+              f"Train Accuracy: {accuracy(params, train_images, train_labels):.4f}, "
+              f"Test Accuracy: {accuracy(params, test_images, test_labels):.4f}")
+```
+
++++ {"id": "Hsionp5IYsQ9"}
+
+## Loading Data with PyTorch DataLoader
+
+This section shows how to load the MNIST dataset using PyTorch's DataLoader, convert the data to NumPy arrays, and apply transformations to flatten and cast images.
+
+```{code-cell}
+---
+colab:
+  base_uri: https://localhost:8080/
+id: uA7XY0OezHse
+outputId: 4c86f455-ff1d-474e-f8e3-7111d9b56996
+---
+!pip install torch torchvision
+```
+
+```{code-cell}
+:id: kO5_WzwY59gE
+
+import numpy as np
+from jax.tree_util import tree_map
+from torch.utils import data
+from torchvision.datasets import MNIST
+```
+
+```{code-cell}
+:id: 6f6qU8PCc143
+
+def numpy_collate(batch):
+  """Collate `batch` with PyTorch's default collation, then convert every leaf to a NumPy array."""
+  return tree_map(np.asarray, data.default_collate(batch))
+
+class NumpyLoader(data.DataLoader):
+    """Custom DataLoader to return NumPy arrays from a PyTorch Dataset."""
+    def __init__(self, dataset, batch_size=1,
+                  shuffle=False, sampler=None,
+                  batch_sampler=None, num_workers=0,
+                  pin_memory=False, drop_last=False,
+                  timeout=0, worker_init_fn=None):
+      super(self.__class__, self).__init__(dataset,
+          batch_size=batch_size,
+          shuffle=shuffle,
+          sampler=sampler,
+          batch_sampler=batch_sampler,
+          num_workers=num_workers,
+          collate_fn=numpy_collate,
+          pin_memory=pin_memory,
+          drop_last=drop_last,
+          timeout=timeout,
+          worker_init_fn=worker_init_fn)
+class FlattenAndCast(object):
+  """Transform class to flatten and cast images to float32."""
+  def __call__(self, pic):
+    return np.ravel(np.array(pic, dtype=jnp.float32))
+```
+
++++ {"id": "mfSnfJND6I8G"}
+
+### Load Dataset with Transformations
+
+Standardize the data by flattening the images, casting them to `float32`, and ensuring consistent data types.
+
+```{code-cell}
+:id: Kxbl6bcx6crv
+
+mnist_dataset = MNIST(data_dir, download=True, transform=FlattenAndCast())
+```
+
++++ {"id": "kbdsqvPZGrsa"}
+
+### Full Training Dataset for Accuracy Checks
+
+Convert the entire training dataset to JAX arrays.
+
+```{code-cell}
+:id: c9ZCJq_rzPck
+
+train_images = np.array(mnist_dataset.data).reshape(len(mnist_dataset.data), -1)
+train_labels = one_hot(np.array(mnist_dataset.targets), n_targets)
+```
+
++++ {"id": "WXUh0BwvG8Ko"}
+
+### Get Full Test Dataset
+
+Load and process the full test dataset.
+
+```{code-cell}
+:id: brlLG4SqGphm
+
+mnist_dataset_test = MNIST(data_dir, download=True, train=False)
+test_images = jnp.array(mnist_dataset_test.data.numpy().reshape(len(mnist_dataset_test.data), -1), dtype=jnp.float32)
+test_labels = one_hot(np.array(mnist_dataset_test.targets), n_targets)
+```
+
+```{code-cell}
+---
+colab:
+  base_uri: https://localhost:8080/
+id: Oz-UVnCxG5E8
+outputId: 53f3fb32-5096-4862-e022-3c3a1d82137a
+---
+print('Train:', train_images.shape, train_labels.shape)
+print('Test:', test_images.shape, test_labels.shape)
+```
+
++++ {"id": "mNjn9dMPitKL"}
+
+### Training Data Generator
+
+Define a generator function using PyTorch's DataLoader for batch training.
+Setting `num_workers > 0` enables multi-process data loading, which can accelerate data loading for larger datasets or intensive preprocessing tasks. Experiment with different values to find the optimal setting for your hardware and workload.
+
+Note: When setting `num_workers > 0`, you may see the following `RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.`
+This warning can be safely ignored since data loaders do not use JAX within the forked processes.
+
+```{code-cell}
+:id: 0LdT8P8aisWF
+
+def pytorch_training_generator(mnist_dataset):  # returns a NumpyLoader that train_model iterates each epoch
+    return NumpyLoader(mnist_dataset, batch_size=batch_size, num_workers=0)
+```
+
++++ {"id": "Xzt2x9S1HC3T"}
+
+### Training Loop (PyTorch DataLoader)
+
+The training loop uses the PyTorch DataLoader to iterate through batches and update model parameters.
+
+```{code-cell}
+---
+colab:
+  base_uri: https://localhost:8080/
+id: SqweRz_98sN8
+outputId: bdd45256-3f5a-48f7-e45c-378078ac4279
+---
+train_model(num_epochs, params, pytorch_training_generator(mnist_dataset), data_loader_type='iterable')
+```
+
++++ {"id": "Nm45ZTo6yrf5"}
+
+## Loading Data with TensorFlow Datasets (TFDS)
+
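+This section demonstrates how to load the MNIST dataset using TFDS, fetch the full dataset for evaluation, and define a training generator for batch processing. TensorFlow is used here only for data loading; the code below does not itself disable TensorFlow's GPU usage, so if TensorFlow competes with JAX for GPU memory, hide the GPU from it with `tf.config.set_visible_devices([], device_type='GPU')` (after `import tensorflow as tf`).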
+
+```{code-cell}
+:id: sGaQAk1DHMUx
+
+import tensorflow_datasets as tfds
+```
+
++++ {"id": "ZSc5K0Eiwm4L"}
+
+### Fetch Full Dataset for Evaluation
+
+Load the dataset with `tfds.load`, convert it to NumPy arrays, and process it for evaluation.
+
+```{code-cell}
+:id: 1hOamw_7C8Pb
+
+# tfds.load returns tf.Tensors (or tf.data.Datasets if batch_size != -1)
+mnist_data, info = tfds.load(name="mnist", batch_size=-1, data_dir=data_dir, with_info=True)
+mnist_data = tfds.as_numpy(mnist_data)
+train_data, test_data = mnist_data['train'], mnist_data['test']
+
+# Full train set
+train_images, train_labels = train_data['image'], train_data['label']
+train_images = jnp.reshape(train_images, (len(train_images), num_pixels))
+train_labels = one_hot(train_labels, n_targets)
+
+# Full test set
+test_images, test_labels = test_data['image'], test_data['label']
+test_images = jnp.reshape(test_images, (len(test_images), num_pixels))
+test_labels = one_hot(test_labels, n_targets)
+```
+
+```{code-cell}
+---
+colab:
+  base_uri: https://localhost:8080/
+id: Td3PiLdmEf7z
+outputId: b8c9a32a-9cf0-4dc3-cb51-db21d32c6545
+---
+print('Train:', train_images.shape, train_labels.shape)
+print('Test:', test_images.shape, test_labels.shape)
+```
+
++++ {"id": "dXMvgk6sdq4j"}
+
+### Define the Training Generator
+
+Create a generator function to yield batches of data for training.
+
+```{code-cell}
+:id: vX59u8CqEf4J
+
+def training_generator():
+  # as_supervised=True gives us the (image, label) as a tuple instead of a dict
+  ds = tfds.load(name='mnist', split='train', as_supervised=True, data_dir=data_dir)
+  # You can build up an arbitrary tf.data input pipeline
+  ds = ds.batch(batch_size).prefetch(1)
+  # tfds.as_numpy converts the tf.data.Dataset into an iterable of NumPy arrays
+  return tfds.as_numpy(ds)
+```
+
++++ {"id": "EAWeUdnuFNBY"}
+
+### Training Loop (TFDS)
+
+Use the training generator in a custom training loop.
+
+```{code-cell}
+---
+colab:
+  base_uri: https://localhost:8080/
+id: h2sO13XDGvq1
+outputId: f30805bb-e725-46ee-e053-6e97f2af81c5
+---
+train_model(num_epochs, params, training_generator)
+```
+
++++ {"id": "-ryVkrAITS9Z"}
+
+## Loading Data with Grain
+
+This section demonstrates how to load MNIST data using Grain, a data-loading library. You'll define a custom dataset class for Grain and set up a Grain DataLoader for efficient training.
+
++++ {"id": "waYhUMUGmhH-"}
+
+Install Grain
+
+```{code-cell}
+---
+colab:
+  base_uri: https://localhost:8080/
+id: L78o7eeyGvn5
+outputId: cb0ce6cf-243b-4183-8f63-646e00232caa
+---
+!pip install grain
+```
+
++++ {"id": "66bH3ZDJ7Iat"}
+
+Import Required Libraries (import MNIST dataset from torchvision)
+
+```{code-cell}
+:id: mS62eVL9Ifmz
+
+import numpy as np
+import grain.python as pygrain
+from torchvision.datasets import MNIST
+```
+
++++ {"id": "0h6mwVrspPA-"}
+
+### Define Dataset Class
+
+Create a custom dataset class to load MNIST data for Grain.
+
+```{code-cell}
+:id: bnrhac5Hh7y1
+
+class Dataset:  # Index-addressable wrapper around torchvision's MNIST (provides __len__/__getitem__)
+    def __init__(self, data_dir, train=True):
+        self.data_dir = data_dir
+        self.train = train  # forwarded to MNIST(train=...) in load_data
+        self.load_data()
+
+    def load_data(self):
+        self.dataset = MNIST(self.data_dir, download=True, train=self.train)
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __getitem__(self, index):
+        img, label = self.dataset[index]
+        return np.ravel(np.array(img, dtype=np.float32)), label  # (flattened float32 image, label)
+```
+
++++ {"id": "53mf8bWEsyTr"}
+
+### Initialize the Dataset
+
+```{code-cell}
+:id: pN3oF7-ostGE
+
+mnist_dataset = Dataset(data_dir)
+```
+
++++ {"id": "GqD-ycgBuwv9"}
+
+### Get the full train and test dataset
+
+```{code-cell}
+:id: f1VnTuX3u_kL
+
+# Convert training data to JAX arrays and one-hot encode the labels (each comprehension is a separate pass over mnist_dataset)
+train_images = jnp.array([mnist_dataset[i][0] for i in range(len(mnist_dataset))], dtype=jnp.float32)
+train_labels = one_hot(np.array([mnist_dataset[i][1] for i in range(len(mnist_dataset))]), n_targets)
+
+# The test split is read directly from torchvision's MNIST, so its images are flattened inline here
+mnist_dataset_test = MNIST(data_dir, download=True, train=False)
+test_images = jnp.array([np.ravel(np.array(mnist_dataset_test[i][0], dtype=np.float32)) for i in range(len(mnist_dataset_test))], dtype=jnp.float32)
+test_labels = one_hot(np.array([mnist_dataset_test[i][1] for i in range(len(mnist_dataset_test))]), n_targets)
+```
+
+```{code-cell}
+---
+colab:
+  base_uri: https://localhost:8080/
+id: a2NHlp9klrQL
+outputId: c9422190-55e9-400b-bd4e-0e7bf23dc6a1
+---
+print("Train:", train_images.shape, train_labels.shape)
+print("Test:", test_images.shape, test_labels.shape)
+```
+
++++ {"id": "1QPbXt7O0JN-"}
+
+### Initialize PyGrain DataLoader
+
+```{code-cell}
+:id: 2jqd1jJt25Bj
+
+sampler = pygrain.SequentialSampler(
+    num_records=len(mnist_dataset),
+    shard_options=pygrain.NoSharding()) # Single-device, no sharding
+
+def pygrain_training_generator():  # Returns a new DataLoader over mnist_dataset on every call
+    return pygrain.DataLoader(
+        data_source=mnist_dataset,
+        sampler=sampler,
+        operations=[pygrain.Batch(batch_size)],  # batch records in groups of batch_size
+    )
+```
+
++++ {"id": "mV5z4GLCGKlx"}
+
+### Training Loop (Grain)
+
+Run the training loop using the Grain DataLoader.
+
+```{code-cell}
+---
+colab:
+  base_uri: https://localhost:8080/
+id: 9-iANQ-9CcW_
+outputId: b0e19da2-9e34-4183-c5d8-af66de5efa5c
+---
+train_model(num_epochs, params, pygrain_training_generator)
+```
+
++++ {"id": "o51P6lr86wz-"}
+
+## Loading Data with Hugging Face
+
+This section demonstrates loading MNIST data using the Hugging Face `datasets` library. You'll format the dataset for JAX compatibility, prepare flattened images and one-hot-encoded labels, and define a training generator.
+
++++ {"id": "69vrihaOi4Oz"}
+
+Install the Hugging Face `datasets` library.
+
+```{code-cell}
+---
+colab:
+  base_uri: https://localhost:8080/
+id: 19ipxPhI6oSN
+outputId: b80b80cd-fc14-4a43-f8a8-2802de4faade
+---
+!pip install datasets
+```
+
+```{code-cell}
+:id: 8v1N59p76zn0
+
+from datasets import load_dataset
+```
+
++++ {"id": "8Gaj11tO7C86"}
+
+Load the MNIST dataset from Hugging Face and format it as `numpy` arrays for quick access; use the `jax` format instead if you want JAX arrays.
+
+```{code-cell}
+:id: a22kTvgk6_fJ
+
+mnist_dataset = load_dataset("mnist", cache_dir=data_dir).with_format("numpy")
+```
+
++++ {"id": "tgI7dIaX7JzM"}
+
+### Extract images and labels
+
+Get the image shape and flatten the images for model input.
+
+```{code-cell}
+:id: NHrKatD_7HbH
+
+train_images = mnist_dataset["train"]["image"]
+train_labels = mnist_dataset["train"]["label"]
+test_images = mnist_dataset["test"]["image"]
+test_labels = mnist_dataset["test"]["label"]
+
+# Per-image shape: everything after the leading axis
+image_shape = train_images.shape[1:]
+num_features = image_shape[0] * image_shape[1]
+
+# Flatten each image into a vector of num_features values
+train_images = train_images.reshape(-1, num_features)
+test_images = test_images.reshape(-1, num_features)
+
+# One-hot encode the labels
+train_labels = one_hot(train_labels, n_targets)
+test_labels = one_hot(test_labels, n_targets)
+```
+
+```{code-cell}
+---
+colab:
+  base_uri: https://localhost:8080/
+id: dITh435Z7Nwb
+outputId: cc89c1ec-6987-4f1c-90a4-c3b355ea7225
+---
+print('Train:', train_images.shape, train_labels.shape)
+print('Test:', test_images.shape, test_labels.shape)
+```
+
++++ {"id": "kk_4zJlz7T1E"}
+
+### Define Training Generator
+
+Set up a generator to yield batches of images and labels for training.
+
+```{code-cell}
+:id: -zLJhogj7RL-
+
+def hf_training_generator():
+    """Yield (image, label) batches from mnist_dataset["train"]."""
+    for batch in mnist_dataset["train"].iter(batch_size):
+        x, y = batch["image"], batch["label"]
+        yield x, y
+```
+
++++ {"id": "HIsGfkLI7dvZ"}
+
+### Training Loop (Hugging Face Datasets)
+
+Run the training loop using the Hugging Face training generator.
+
+```{code-cell}
+---
+colab:
+  base_uri: https://localhost:8080/
+id: Ui6aLiZP7aLe
+outputId: c51529e0-563f-4af0-9793-76b5e6f323db
+---
+train_model(num_epochs, params, hf_training_generator)
+```
+
++++ {"id": "rCJq2rvKlKWX"}
+
+## Summary
+
+This notebook explored efficient methods for loading data on a GPU with JAX, using libraries such as PyTorch DataLoader, TensorFlow Datasets, Grain, and Hugging Face Datasets. You also learned about GPU-specific optimizations, including using `device_put` for data transfer and managing GPU memory, to enhance training efficiency. Each method offers unique benefits, allowing you to choose the best approach based on your project requirements.
diff --git a/docs/source/tutorials.md b/docs/source/tutorials.md
index e08b909..ca32a35 100644
--- a/docs/source/tutorials.md
+++ b/docs/source/tutorials.md
@@ -25,6 +25,7 @@ JAX_image_captioning
 JAX_time_series_classification
 JAX_transformer_text_classification
 data_loaders_on_cpu_with_jax
+data_loaders_on_gpu_with_jax
 ```
 
 Once you've gone through this content, you can refer to package-specific