Embedding_layer_for_python.ipynb (+123 −2)
@@ -6,7 +6,7 @@
       "name": "Embedding layer for python.ipynb",
       "provenance": [],
       "toc_visible": true,
-      "authorship_tag": "ABX9TyM3xmkE0c1edQeEECt+lVFq",
+      "authorship_tag": "ABX9TyMgLBY3fqnOq5XgoOKy7zY8",
       "include_colab_link": true
     },
     "kernelspec": {
@@ -37,7 +37,13 @@
       "\r\n",
       "The dataset is at http://www.phontron.com/download/conala-corpus-v1.1.zip\r\n",
       "\r\n",
-      "We will do language model on the conala-mined part of the dataset."
+      "We will build a language model on the conala-mined part of the dataset.\r\n",
+      "\r\n",
+      "One problem with tokenization is that the Python tokenizer emits every comment and every string inside print() as a single, separate token. If we keep a separate vocabulary entry for each of them, the vocabulary becomes huge. An alternative is a character-level dictionary, but that increases the model's output length and makes the embeddings take much longer to train.\r\n",
+      "\r\n",
+      "Another problem is that the CoNaLa dataset contains very few newline and \\t tokens.\r\n",
+      "\r\n",
+      "Use the second dataset at https://www.sri.inf.ethz.ch/py150 if more training data is needed."