Skip to content

Commit 1a05832

Browse files
committed
Created using Colaboratory
1 parent 5b1faf8 commit 1a05832

File tree

1 file changed

+123
-2
lines changed

1 file changed

+123
-2
lines changed

Embedding_layer_for_python.ipynb

+123-2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"name": "Embedding layer for python.ipynb",
77
"provenance": [],
88
"toc_visible": true,
9-
"authorship_tag": "ABX9TyM3xmkE0c1edQeEECt+lVFq",
9+
"authorship_tag": "ABX9TyMgLBY3fqnOq5XgoOKy7zY8",
1010
"include_colab_link": true
1111
},
1212
"kernelspec": {
@@ -37,7 +37,13 @@
3737
"\r\n",
3838
"The dataset is at http://www.phontron.com/download/conala-corpus-v1.1.zip\r\n",
3939
"\r\n",
40-
"We will do language model on the conala-mined part of the dataset."
40+
"We will do language model on the conala-mined part of the dataset.\r\n",
41+
"\r\n",
42+
"The problem with tokenization is that the python tokenizer tokenizes comments and strings in print() as separate tokens. If we make separate tokens for these we will have a huge vocabulary. Another approach is using a character-level dictionary, but that increases the model output length and will take a longer time to train embeddings.\r\n",
43+
"\r\n",
44+
"Another problem is that the conala dataset has very few newline and \\tab characters.\r\n",
45+
"\r\n",
46+
"Use the second dataset at https://www.sri.inf.ethz.ch/py150 if need more training."
4147
]
4248
},
4349
{
@@ -122,6 +128,121 @@
122128
"name": "stdout"
123129
}
124130
]
131+
},
132+
{
133+
"cell_type": "code",
134+
"metadata": {
135+
"colab": {
136+
"base_uri": "https://localhost:8080/"
137+
},
138+
"id": "yzR6zFa-OfWJ",
139+
"outputId": "a6ff31dc-dc79-4794-ef55-06e2397d5a98"
140+
},
141+
"source": [
142+
"!head -3333 conala-corpus/conala-mined.jsonl | tail -1"
143+
],
144+
"execution_count": 7,
145+
"outputs": [
146+
{
147+
"output_type": "stream",
148+
"text": [
149+
"{\"parent_answer_post_id\": 21207957, \"prob\": 0.501864841814861, \"snippet\": \"a.to_csv('test.csv', cols=['sum'])\", \"intent\": \"write to csv from DataFrame python pandas\", \"id\": \"21206395_21207957_0\", \"question_id\": 21206395}\n"
150+
],
151+
"name": "stdout"
152+
}
153+
]
154+
},
155+
{
156+
"cell_type": "code",
157+
"metadata": {
158+
"colab": {
159+
"base_uri": "https://localhost:8080/"
160+
},
161+
"id": "Jm6d8_asUsAz",
162+
"outputId": "4b828654-e40f-4926-ee14-0c5f8d976d25"
163+
},
164+
"source": [
165+
"!wget -c http://files.srl.inf.ethz.ch/data/py150_files.tar.gz"
166+
],
167+
"execution_count": 8,
168+
"outputs": [
169+
{
170+
"output_type": "stream",
171+
"text": [
172+
"--2021-03-06 06:56:39-- http://files.srl.inf.ethz.ch/data/py150_files.tar.gz\n",
173+
"Resolving files.srl.inf.ethz.ch (files.srl.inf.ethz.ch)... 129.132.85.35\n",
174+
"Connecting to files.srl.inf.ethz.ch (files.srl.inf.ethz.ch)|129.132.85.35|:80... connected.\n",
175+
"HTTP request sent, awaiting response... 301 Moved Permanently\n",
176+
"Location: https://files.sri.inf.ethz.ch/data/py150_files.tar.gz [following]\n",
177+
"--2021-03-06 06:56:40-- https://files.sri.inf.ethz.ch/data/py150_files.tar.gz\n",
178+
"Resolving files.sri.inf.ethz.ch (files.sri.inf.ethz.ch)... 129.132.85.35\n",
179+
"Connecting to files.sri.inf.ethz.ch (files.sri.inf.ethz.ch)|129.132.85.35|:443... connected.\n",
180+
"HTTP request sent, awaiting response... 200 OK\n",
181+
"Length: 199067128 (190M) [application/x-gzip]\n",
182+
"Saving to: ‘py150_files.tar.gz’\n",
183+
"\n",
184+
"py150_files.tar.gz 100%[===================>] 189.84M 26.0MB/s in 13s \n",
185+
"\n",
186+
"2021-03-06 06:56:54 (14.3 MB/s) - ‘py150_files.tar.gz’ saved [199067128/199067128]\n",
187+
"\n"
188+
],
189+
"name": "stdout"
190+
}
191+
]
192+
},
193+
{
194+
"cell_type": "code",
195+
"metadata": {
196+
"id": "zRIiUGAGU1fN"
197+
},
198+
"source": [
199+
"! tar -xf py150_files.tar.gz"
200+
],
201+
"execution_count": 9,
202+
"outputs": []
203+
},
204+
{
205+
"cell_type": "code",
206+
"metadata": {
207+
"colab": {
208+
"base_uri": "https://localhost:8080/"
209+
},
210+
"id": "oRalq519VEP6",
211+
"outputId": "1be3a3c4-399e-4124-fb7c-37edc7c32aa1"
212+
},
213+
"source": [
214+
"!head -10 python100k_train.txt"
215+
],
216+
"execution_count": 10,
217+
"outputs": [
218+
{
219+
"output_type": "stream",
220+
"text": [
221+
"data/00/wikihouse/urls.py\n",
222+
"data/0rpc/zerorpc-python/zerorpc/events.py\n",
223+
"data/0xadada/dockdj/app/manage.py\n",
224+
"data/1stvamp/hippybot/setup.py\n",
225+
"data/2buntu/2buntu-blog/manage.py\n",
226+
"data/2buntu/2buntu-blog/twobuntu/categories/migrations/0001_initial.py\n",
227+
"data/2buntu/2buntu-blog/twobuntu/news/views.py\n",
228+
"data/2degrees/django-wsgi/django_wsgi/exc.py\n",
229+
"data/2gis/badger-api/common/storage.py\n",
230+
"data/2gis/badger-api/stages/models.py\n"
231+
],
232+
"name": "stdout"
233+
}
234+
]
235+
},
236+
{
237+
"cell_type": "code",
238+
"metadata": {
239+
"id": "qXS-3cg5VQJS"
240+
},
241+
"source": [
242+
"!tar -xf data.tar.gz"
243+
],
244+
"execution_count": 11,
245+
"outputs": []
125246
}
126247
]
127248
}

0 commit comments

Comments
 (0)