From be14f0644245bacc697d12c1ce29d17edaa2bd84 Mon Sep 17 00:00:00 2001
From: rasbt
Date: Wed, 17 Jul 2024 07:38:19 -0500
Subject: [PATCH] explain extra padding token

---
 ch07/01_main-chapter-code/ch07.ipynb | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/ch07/01_main-chapter-code/ch07.ipynb b/ch07/01_main-chapter-code/ch07.ipynb
index 33ccf6b3..f57cc879 100644
--- a/ch07/01_main-chapter-code/ch07.ipynb
+++ b/ch07/01_main-chapter-code/ch07.ipynb
@@ -618,6 +618,8 @@
     "    device=\"cpu\"\n",
     "):\n",
     "    # Find the longest sequence in the batch\n",
+    "    # and increase the max length by +1, which will add one extra\n",
+    "    # padding token below\n",
     "    batch_max_length = max(len(item)+1 for item in batch)\n",
     "\n",
     "    # Pad and prepare inputs\n",
@@ -627,13 +629,14 @@
     "        new_item = item.copy()\n",
     "        # Add an <|endoftext|> token\n",
     "        new_item += [pad_token_id]\n",
-    "        # Pad sequences to max_length\n",
-    "        # this always adds at least 1 additional padding tokens\n",
+    "        # Pad sequences to batch_max_length\n",
     "        padded = (\n",
     "            new_item + [pad_token_id] *\n",
     "            (batch_max_length - len(new_item))\n",
     "        )\n",
-    "        # We remove this extra padded token again here\n",
+    "        # Via padded[:-1], we remove the extra padded token\n",
+    "        # that has been added via the +1 setting in batch_max_length\n",
+    "        # (the extra padding token will be relevant in later code)\n",
     "        inputs = torch.tensor(padded[:-1])\n",
     "        inputs_lst.append(inputs)\n",
     "\n",
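
For reference, below is a minimal, self-contained sketch of the collate function that this patch annotates, run on a tiny batch to show the effect of the +1 / padded[:-1] trick: every sequence is padded to the length of the longest sequence in the batch, and the one extra padding token introduced via batch_max_length is stripped again before the tensor is built. Only the lines visible in the hunks above are taken from the patch; the function name (custom_collate_draft_1) and the final stack-and-transfer lines are assumptions based on the surrounding chapter code.

import torch

def custom_collate_draft_1(batch, pad_token_id=50256, device="cpu"):
    # Find the longest sequence in the batch
    # and increase the max length by +1, which will add one extra
    # padding token below
    batch_max_length = max(len(item)+1 for item in batch)

    # Pad and prepare inputs
    inputs_lst = []
    for item in batch:
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to batch_max_length
        padded = (
            new_item + [pad_token_id] *
            (batch_max_length - len(new_item))
        )
        # Via padded[:-1], we remove the extra padded token
        inputs = torch.tensor(padded[:-1])
        inputs_lst.append(inputs)

    # Assumed function tail: stack into a (batch_size, batch_max_length-1)
    # tensor and move it to the target device
    return torch.stack(inputs_lst).to(device)

# Example: three token-ID sequences of unequal length
batch = [
    [0, 1, 2, 3, 4],
    [5, 6],
    [7, 8, 9],
]
print(custom_collate_draft_1(batch))
# tensor([[    0,     1,     2,     3,     4],
#         [    5,     6, 50256, 50256, 50256],
#         [    7,     8,     9, 50256, 50256]])

As the example shows, the +1 looks redundant here because padded[:-1] immediately undoes it; per the new comment, it becomes relevant in later code, presumably so that a subsequent version of the collate function can reuse the extra token when building target token IDs shifted by one position from the same padded sequence.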