Skip to content

Commit

Permalink
add GPT2TokenizerFast to BPE comparison (#498)
Browse files Browse the repository at this point in the history
* added HF BPE Fast

* update benchmarks

* add note about performance

* revert accidental changes

---------

Co-authored-by: rasbt <mail@sebastianraschka.com>
  • Loading branch information
d-kleine and rasbt authored Jan 22, 2025
1 parent 0f35e37 commit dce4603
Showing 1 changed file with 92 additions and 16 deletions.
108 changes: 92 additions & 16 deletions ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Fetching encoder.json: 1.04Mit [00:00, 3.47Mit/s] \n",
"Fetching vocab.bpe: 457kit [00:00, 2.07Mit/s] \n"
"Fetching encoder.json: 1.04Mit [00:00, 4.13Mit/s] \n",
"Fetching vocab.bpe: 457kit [00:00, 2.56Mit/s] \n"
]
}
],
Expand Down Expand Up @@ -306,6 +306,39 @@
"hf_tokenizer(strings)[\"input_ids\"]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "a6233552",
"metadata": {},
"outputs": [],
"source": [
"from transformers import GPT2TokenizerFast\n",
"\n",
"hf_tokenizer_fast = GPT2TokenizerFast.from_pretrained(\"gpt2\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "fa5ca643",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hf_tokenizer_fast(strings)[\"input_ids\"]"
]
},
{
"cell_type": "markdown",
"id": "9d0f2e95-8ae8-4606-a8e0-b0fce91cfac9",
Expand All @@ -319,7 +352,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 17,
"id": "b6e6b1a5-9dc0-4b20-9a8b-c02aa0e3191c",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -365,7 +398,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 18,
"id": "04fbd764-ec98-44f1-9b0a-e9db9a3bb91e",
"metadata": {},
"outputs": [],
Expand All @@ -382,7 +415,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 19,
"id": "5a5def88-1d2c-4550-a5e8-ee82b72b92d7",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -413,7 +446,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 20,
"id": "a61bb445-b151-4a2f-8180-d4004c503754",
"metadata": {},
"outputs": [],
Expand All @@ -432,15 +465,15 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 21,
"id": "57f7c0a3-c1fd-4313-af34-68e78eb33653",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3.44 ms ± 54 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"3.39 ms ± 21.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
Expand All @@ -458,15 +491,15 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 22,
"id": "036dd628-3591-46c9-a5ce-b20b105a8062",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.08 ms ± 4.69 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
"1.08 ms ± 5.99 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
Expand All @@ -484,7 +517,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 23,
"id": "b9c85b58-bfbc-465e-9a7e-477e53d55c90",
"metadata": {},
"outputs": [
Expand All @@ -499,7 +532,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"10.3 ms ± 180 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"10.2 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
Expand All @@ -509,22 +542,65 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 24,
"id": "7117107f-22a6-46b4-a442-712d50b3ac7a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10.2 ms ± 72.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"10 ms ± 36.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)[\"input_ids\"]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "d6bfc7f0",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"3.79 ms ± 48.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%timeit hf_tokenizer_fast(raw_text)[\"input_ids\"]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "da57c95a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3.83 ms ± 58.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%timeit hf_tokenizer_fast(raw_text, max_length=5145, truncation=True)[\"input_ids\"]"
]
},
{
"cell_type": "markdown",
"id": "91ac2876-f36e-498c-bd75-8597a39f2d4b",
Expand All @@ -535,15 +611,15 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 27,
"id": "3b4ff4d5-f2d9-4ea6-a51c-023dbba15429",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.74 ms ± 48.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
"1.59 ms ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
Expand Down

0 comments on commit dce4603

Please sign in to comment.