From 9b9eb04dab3107f0cb458e4a5e9de6b5bc325a20 Mon Sep 17 00:00:00 2001 From: khu Date: Thu, 24 Jun 2021 23:31:57 -0700 Subject: [PATCH 01/10] Add ORC reader tutorial --- docs/tutorials/_toc.yaml | 3 +- docs/tutorials/orc.ipynb | 337 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 339 insertions(+), 1 deletion(-) create mode 100644 docs/tutorials/orc.ipynb diff --git a/docs/tutorials/_toc.yaml b/docs/tutorials/_toc.yaml index 1c2ee891d..7edeee8bf 100644 --- a/docs/tutorials/_toc.yaml +++ b/docs/tutorials/_toc.yaml @@ -36,4 +36,5 @@ toc: path: /io/tutorials/elasticsearch - title: "Avro" path: /io/tutorials/avro - +- title: "ORC" + path: /io/tutorials/orc diff --git a/docs/tutorials/orc.ipynb b/docs/tutorials/orc.ipynb new file mode 100644 index 000000000..6e85a048d --- /dev/null +++ b/docs/tutorials/orc.ipynb @@ -0,0 +1,337 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Tce3stUlHN0L" + }, + "source": [ + "##### Copyright 2020 The TensorFlow Authors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "tuOe1ymfHZPu" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qFdPvlXBOdUN" + }, + "source": [ + "# Apache ORC Reader" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MfBg1C5NB3X0" + }, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " View on TensorFlow.org\n", + " \n", + " Run in Google Colab\n", + " \n", + " View on GitHub\n", + " \n", + " Download notebook\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xHxb-dlhMIzW" + }, + "source": [ + "## Overview\n", + "\n", + "Apache ORC is a popular columnar storage format. The tensorflow-io package provides a default implementation for reading Apache ORC files." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MUXex9ctTuDB" + }, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1Eh-iCRVBm0p" + }, + "source": [ + "Install the required packages, and restart the runtime\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "g7cxbf1-skn6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: tensorflow-io in /usr/local/lib/python3.7/dist-packages (0.18.0)\n", + "Requirement already satisfied: tensorflow<2.6.0,>=2.5.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow-io) (2.5.0)\n", + "Requirement already satisfied: tensorflow-io-gcs-filesystem==0.18.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow-io) (0.18.0)\n", + "Requirement already satisfied: h5py~=3.1.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.1.0)\n", + "Requirement already satisfied: opt-einsum~=3.3.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.3.0)\n", + "Requirement already satisfied: keras-nightly~=2.5.0.dev in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (2.5.0.dev2021032900)\n", + "Requirement already satisfied: google-pasta~=0.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.2.0)\n", + "Requirement already satisfied: flatbuffers~=1.12.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.12)\n", + "Requirement already satisfied: wrapt~=1.12.1 in /usr/local/lib/python3.7/dist-packages (from 
tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.12.1)\n", + "Requirement already satisfied: gast==0.4.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.4.0)\n", + "Requirement already satisfied: tensorflow-estimator<2.6.0,>=2.5.0rc0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (2.5.0)\n", + "Requirement already satisfied: grpcio~=1.34.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.34.1)\n", + "Requirement already satisfied: protobuf>=3.9.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.12.4)\n", + "Requirement already satisfied: absl-py~=0.10 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.12.0)\n", + "Requirement already satisfied: numpy~=1.19.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.19.5)\n", + "Requirement already satisfied: termcolor~=1.1.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.1.0)\n", + "Requirement already satisfied: typing-extensions~=3.7.4 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.7.4.3)\n", + "Requirement already satisfied: keras-preprocessing~=1.1.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.1.2)\n", + "Requirement already satisfied: astunparse~=1.6.3 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.6.3)\n", + "Requirement already satisfied: wheel~=0.35 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.36.2)\n", + "Requirement already satisfied: six~=1.15.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.15.0)\n", + "Requirement already satisfied: tensorboard~=2.5 in /usr/local/lib/python3.7/dist-packages 
(from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (2.5.0)\n", + "Requirement already satisfied: cached-property; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from h5py~=3.1.0->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.5.2)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from protobuf>=3.9.2->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (57.0.0)\n", + "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.3.4)\n", + "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.6.1)\n", + "Requirement already satisfied: google-auth<2,>=1.6.3 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.31.0)\n", + "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.4.4)\n", + "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (2.23.0)\n", + "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.8.0)\n", + "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.0.1)\n", + "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from markdown>=2.6.8->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (4.5.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4; python_version >= \"3.6\" in /usr/local/lib/python3.7/dist-packages 
(from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (4.7.2)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.2.8)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (4.2.2)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.3.0)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (2.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (2021.5.30)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.24.3)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.0.4)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.4.1)\n", + "Requirement already satisfied: pyasn1>=0.1.3 in /usr/local/lib/python3.7/dist-packages (from rsa<5,>=3.1.4; python_version >= \"3.6\"->google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.4.8)\n", + "Requirement already satisfied: 
oauthlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.1.1)\n" + ] + } + ], + "source": [ + "!pip install tensorflow-io" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "IqR2PQG4ZaZ0" + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "import tensorflow_io as tfio" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EyHfC3nEzseN" + }, + "source": [ + "## Download a sample ORC file" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "zaiXjZiXzrHs" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " % Total % Received % Xferd Average Speed Time Time Time Current\n", + " Dload Upload Total Spent Left Speed\n", + "\r 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\r100 144 100 144 0 0 1655 0 --:--:-- --:--:-- --:--:-- 1655\n", + "\r100 3328 100 3328 0 0 18592 0 --:--:-- --:--:-- --:--:-- 18592\n", + "-rw-r--r-- 1 root root 3328 Jun 25 06:15 iris.orc\n" + ] + } + ], + "source": [ + "!curl -OL https://github.com/tensorflow/io/raw/master/tests/test_orc/iris.orc\n", + "!ls -l iris.orc" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "ppFAjXAYsj-z" + }, + "outputs": [], + "source": [ + "dataset = tfio.IODataset.from_orc(\"iris.orc\", capacity=15).batch(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "9B1QUKG70Lzs" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[5.1] [3.5] [1.4] [0.2] [b'setosa']\n" + ] + } + ], + "source": [ + "for sepal_length, sepal_width, petal_length, petal_width, species in dataset:\n", + " print(sepal_length.numpy(), sepal_width.numpy(), petal_length.numpy(), petal_width.numpy(), species.numpy())\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": 
"h4u4Z17h0-Cl" + }, + "source": [ + "## Build a model reading ORC with Keras" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "R1OYAybz07dr" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/5\n", + "150/150 [==============================] - 1s 1ms/step - loss: -15.2493 - accuracy: 0.3333\n", + "Epoch 2/5\n", + "150/150 [==============================] - 0s 1ms/step - loss: -15.2493 - accuracy: 0.3333\n", + "Epoch 3/5\n", + "150/150 [==============================] - 0s 2ms/step - loss: -15.2493 - accuracy: 0.3333\n", + "Epoch 4/5\n", + "150/150 [==============================] - 0s 1ms/step - loss: -15.2493 - accuracy: 0.3333\n", + "Epoch 5/5\n", + "150/150 [==============================] - 0s 1ms/step - loss: -15.2493 - accuracy: 0.3333\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "feature_cols = [\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]\n", + "label_cols = [\"species\"]\n", + "\n", + "# select feature columns\n", + "feature_dataset = tfio.IODataset.from_orc(\"iris.orc\", columns=feature_cols)\n", + "# select label columns\n", + "label_dataset = tfio.IODataset.from_orc(\"iris.orc\", columns=label_cols)\n", + "\n", + "@tf.function\n", + "def species_float_conversion(x):\n", + " if x == \"virginica\":\n", + " return 1.0\n", + " if x == \"versicolor\":\n", + " return 2.0\n", + " if x == \"setosa\":\n", + " return 3.0\n", + " return 4.0\n", + "\n", + "label_dataset = label_dataset.map(species_float_conversion)\n", + "dataset = tf.data.Dataset.zip((feature_dataset, label_dataset))\n", + "dataset = dataset.batch(1)\n", + "\n", + "def pack_features_vector(features, labels):\n", + " \"\"\"Pack the features into a single array.\"\"\"\n", + " features = tf.stack(list(features), axis=1)\n", + " return features, labels\n", + "\n", + 
"dataset = dataset.map(pack_features_vector)\n", + "\n", + "model = tf.keras.Sequential(\n", + " [\n", + " tf.keras.layers.Dense(\n", + " 10, activation=tf.nn.relu, input_shape=(4,)\n", + " ), # input shape required\n", + " tf.keras.layers.Dense(10, activation=tf.nn.relu),\n", + " tf.keras.layers.Dense(3),\n", + " ]\n", + ")\n", + "\n", + "model.compile(optimizer=\"adam\", loss=\"binary_crossentropy\", metrics=[\"accuracy\"])\n", + "model.fit(dataset, epochs=5)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "Tce3stUlHN0L" + ], + "name": "orc.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From ac94c1c5790509dc71622a599c04e9cbcc8ce313 Mon Sep 17 00:00:00 2001 From: khu Date: Fri, 25 Jun 2021 00:06:18 -0700 Subject: [PATCH 02/10] clean up notebook --- docs/tutorials/orc.ipynb | 243 +++++++++++++++++++-------------------- 1 file changed, 118 insertions(+), 125 deletions(-) diff --git a/docs/tutorials/orc.ipynb b/docs/tutorials/orc.ipynb index 6e85a048d..c38be1e7c 100644 --- a/docs/tutorials/orc.ipynb +++ b/docs/tutorials/orc.ipynb @@ -1,4 +1,20 @@ { + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "orc.ipynb", + "provenance": [], + "collapsed_sections": [ + "Tce3stUlHN0L" + ], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, "cells": [ { "cell_type": "markdown", @@ -6,17 +22,15 @@ "id": "Tce3stUlHN0L" }, "source": [ - "##### Copyright 2020 The TensorFlow Authors." + "##### Copyright 2021 The TensorFlow Authors." 
] }, { "cell_type": "code", - "execution_count": null, "metadata": { "cellView": "form", "id": "tuOe1ymfHZPu" }, - "outputs": [], "source": [ "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", @@ -29,7 +43,9 @@ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -93,76 +109,47 @@ }, { "cell_type": "code", - "execution_count": 1, "metadata": { - "id": "g7cxbf1-skn6" + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "g7cxbf1-skn6", + "outputId": "9ee938ea-cf8c-4523-b731-287f5845017a" }, + "source": [ + "!pip install tensorflow-io" + ], + "execution_count": 1, "outputs": [ { - "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: tensorflow-io in /usr/local/lib/python3.7/dist-packages (0.18.0)\n", - "Requirement already satisfied: tensorflow<2.6.0,>=2.5.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow-io) (2.5.0)\n", - "Requirement already satisfied: tensorflow-io-gcs-filesystem==0.18.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow-io) (0.18.0)\n", - "Requirement already satisfied: h5py~=3.1.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.1.0)\n", - "Requirement already satisfied: opt-einsum~=3.3.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.3.0)\n", - "Requirement already satisfied: keras-nightly~=2.5.0.dev in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (2.5.0.dev2021032900)\n", - "Requirement already satisfied: google-pasta~=0.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.2.0)\n", - "Requirement 
already satisfied: flatbuffers~=1.12.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.12)\n", - "Requirement already satisfied: wrapt~=1.12.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.12.1)\n", - "Requirement already satisfied: gast==0.4.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.4.0)\n", - "Requirement already satisfied: tensorflow-estimator<2.6.0,>=2.5.0rc0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (2.5.0)\n", - "Requirement already satisfied: grpcio~=1.34.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.34.1)\n", - "Requirement already satisfied: protobuf>=3.9.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.12.4)\n", - "Requirement already satisfied: absl-py~=0.10 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.12.0)\n", - "Requirement already satisfied: numpy~=1.19.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.19.5)\n", - "Requirement already satisfied: termcolor~=1.1.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.1.0)\n", - "Requirement already satisfied: typing-extensions~=3.7.4 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.7.4.3)\n", - "Requirement already satisfied: keras-preprocessing~=1.1.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.1.2)\n", - "Requirement already satisfied: astunparse~=1.6.3 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.6.3)\n", - "Requirement already satisfied: wheel~=0.35 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.36.2)\n", - 
"Requirement already satisfied: six~=1.15.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.15.0)\n", - "Requirement already satisfied: tensorboard~=2.5 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (2.5.0)\n", - "Requirement already satisfied: cached-property; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from h5py~=3.1.0->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.5.2)\n", - "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from protobuf>=3.9.2->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (57.0.0)\n", - "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.3.4)\n", - "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.6.1)\n", - "Requirement already satisfied: google-auth<2,>=1.6.3 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.31.0)\n", - "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.4.4)\n", - "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (2.23.0)\n", - "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.8.0)\n", - "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.0.1)\n", - "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in 
/usr/local/lib/python3.7/dist-packages (from markdown>=2.6.8->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (4.5.0)\n", - "Requirement already satisfied: rsa<5,>=3.1.4; python_version >= \"3.6\" in /usr/local/lib/python3.7/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (4.7.2)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.2.8)\n", - "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (4.2.2)\n", - "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.3.0)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (2021.5.30)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (1.24.3)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.0.4)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.4.1)\n", - "Requirement already 
satisfied: pyasn1>=0.1.3 in /usr/local/lib/python3.7/dist-packages (from rsa<5,>=3.1.4; python_version >= \"3.6\"->google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.4.8)\n", - "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.1.1)\n" - ] + "Collecting tensorflow-io\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e6/d2/6fd39a3519e325037462721092248b468ccbeeeb5dc870cea072655ee4b0/tensorflow_io-0.18.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1MB)\n", + "\u001b[K |████████████████████████████████| 24.1MB 118kB/s \n", + "\u001b[?25hCollecting tensorflow-io-gcs-filesystem==0.18.0\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/27/37/6cedfcc52f1d53a79a60204fc89d1f7ca099c5d3a999d4640a2fe407e91b/tensorflow_io_gcs_filesystem-0.18.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.5MB)\n", + "\u001b[K |████████████████████████████████| 2.5MB 36.8MB/s \n", + "\u001b[?25hRequirement already satisfied: tensorflow<2.6.0,>=2.5.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow-io) (2.5.0)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.7/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.4.8)\n", + "Installing collected packages: tensorflow-io-gcs-filesystem, tensorflow-io\n", + "Successfully installed tensorflow-io-0.18.0 tensorflow-io-gcs-filesystem-0.18.0\n" + ], + "name": "stdout" } - ], - "source": [ - "!pip install tensorflow-io" ] }, { "cell_type": "code", - "execution_count": 2, "metadata": { "id": "IqR2PQG4ZaZ0" }, - "outputs": [], "source": [ "import tensorflow as tf\n", "import tensorflow_io as tfio" - ] + ], + "execution_count": 2, + "outputs": [] }, { "cell_type": 
"markdown", @@ -175,13 +162,20 @@ }, { "cell_type": "code", - "execution_count": 3, "metadata": { - "id": "zaiXjZiXzrHs" + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zaiXjZiXzrHs", + "outputId": "c838eb24-80f7-4862-f9b4-d5a3b7302125" }, + "source": [ + "!curl -OL https://github.com/tensorflow/io/raw/master/tests/test_orc/iris.orc\n", + "!ls -l iris.orc" + ], + "execution_count": 3, "outputs": [ { - "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", @@ -189,44 +183,54 @@ "\r 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\r100 144 100 144 0 0 1655 0 --:--:-- --:--:-- --:--:-- 1655\n", "\r100 3328 100 3328 0 0 18592 0 --:--:-- --:--:-- --:--:-- 18592\n", "-rw-r--r-- 1 root root 3328 Jun 25 06:15 iris.orc\n" - ] + ], + "name": "stdout" } - ], + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7DG9JTJ0-bzg" + }, "source": [ - "!curl -OL https://github.com/tensorflow/io/raw/master/tests/test_orc/iris.orc\n", - "!ls -l iris.orc" + "## Create a dataset from the file" ] }, { "cell_type": "code", - "execution_count": 4, "metadata": { "id": "ppFAjXAYsj-z" }, - "outputs": [], "source": [ "dataset = tfio.IODataset.from_orc(\"iris.orc\", capacity=15).batch(1)" - ] + ], + "execution_count": 4, + "outputs": [] }, { "cell_type": "code", - "execution_count": 9, "metadata": { - "id": "9B1QUKG70Lzs" + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9B1QUKG70Lzs", + "outputId": "acf21714-2113-4f8a-ce2e-18ddbd44ef7d" }, + "source": [ + "for sepal_length, sepal_width, petal_length, petal_width, species in dataset:\n", + " print(sepal_length.numpy(), sepal_width.numpy(), petal_length.numpy(), petal_width.numpy(), species.numpy())\n", + " break" + ], + "execution_count": 9, "outputs": [ { - "name": "stdout", "output_type": "stream", "text": [ "[5.1] [3.5] [1.4] [0.2] [b'setosa']\n" - ] + ], + "name": "stdout" } - ], - "source": [ - "for sepal_length, sepal_width, 
petal_length, petal_width, species in dataset:\n", - " print(sepal_length.numpy(), sepal_width.numpy(), petal_length.numpy(), petal_width.numpy(), species.numpy())\n", - " break" ] }, { @@ -240,40 +244,13 @@ }, { "cell_type": "code", - "execution_count": 10, "metadata": { - "id": "R1OYAybz07dr" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1/5\n", - "150/150 [==============================] - 1s 1ms/step - loss: -15.2493 - accuracy: 0.3333\n", - "Epoch 2/5\n", - "150/150 [==============================] - 0s 1ms/step - loss: -15.2493 - accuracy: 0.3333\n", - "Epoch 3/5\n", - "150/150 [==============================] - 0s 2ms/step - loss: -15.2493 - accuracy: 0.3333\n", - "Epoch 4/5\n", - "150/150 [==============================] - 0s 1ms/step - loss: -15.2493 - accuracy: 0.3333\n", - "Epoch 5/5\n", - "150/150 [==============================] - 0s 1ms/step - loss: -15.2493 - accuracy: 0.3333\n" - ] + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], + "id": "R1OYAybz07dr", + "outputId": "bd298eae-efd9-4970-98f0-3b8bf5941c64" + }, "source": [ "feature_cols = [\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]\n", "label_cols = [\"species\"]\n", @@ -316,22 +293,38 @@ "\n", "model.compile(optimizer=\"adam\", loss=\"binary_crossentropy\", metrics=[\"accuracy\"])\n", "model.fit(dataset, epochs=5)" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [ - "Tce3stUlHN0L" ], - "name": "orc.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Epoch 1/5\n", + "150/150 [==============================] - 1s 1ms/step - loss: -15.2493 - accuracy: 0.3333\n", + "Epoch 2/5\n", + "150/150 [==============================] - 
0s 1ms/step - loss: -15.2493 - accuracy: 0.3333\n", + "Epoch 3/5\n", + "150/150 [==============================] - 0s 2ms/step - loss: -15.2493 - accuracy: 0.3333\n", + "Epoch 4/5\n", + "150/150 [==============================] - 0s 1ms/step - loss: -15.2493 - accuracy: 0.3333\n", + "Epoch 5/5\n", + "150/150 [==============================] - 0s 1ms/step - loss: -15.2493 - accuracy: 0.3333\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 10 + } + ] } - }, - "nbformat": 4, - "nbformat_minor": 0 -} + ] +} \ No newline at end of file From 5603d422876617ac9d4c71534c499ad7ad680618 Mon Sep 17 00:00:00 2001 From: khu Date: Fri, 25 Jun 2021 11:07:26 -0700 Subject: [PATCH 03/10] address comments --- docs/tutorials/orc.ipynb | 284 ++++++++++++++++++++++++--------------- 1 file changed, 176 insertions(+), 108 deletions(-) diff --git a/docs/tutorials/orc.ipynb b/docs/tutorials/orc.ipynb index c38be1e7c..defed0cf3 100644 --- a/docs/tutorials/orc.ipynb +++ b/docs/tutorials/orc.ipynb @@ -1,20 +1,4 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "orc.ipynb", - "provenance": [], - "collapsed_sections": [ - "Tce3stUlHN0L" - ], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, "cells": [ { "cell_type": "markdown", @@ -27,10 +11,12 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "cellView": "form", "id": "tuOe1ymfHZPu" }, + "outputs": [], "source": [ "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", @@ -43,9 +29,7 @@ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." 
- ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -109,47 +93,47 @@ }, { "cell_type": "code", + "execution_count": 1, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "g7cxbf1-skn6", - "outputId": "9ee938ea-cf8c-4523-b731-287f5845017a" + "id": "g7cxbf1-skn6" }, - "source": [ - "!pip install tensorflow-io" - ], - "execution_count": 1, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "Collecting tensorflow-io\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e6/d2/6fd39a3519e325037462721092248b468ccbeeeb5dc870cea072655ee4b0/tensorflow_io-0.18.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1MB)\n", - "\u001b[K |████████████████████████████████| 24.1MB 118kB/s \n", - "\u001b[?25hCollecting tensorflow-io-gcs-filesystem==0.18.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/27/37/6cedfcc52f1d53a79a60204fc89d1f7ca099c5d3a999d4640a2fe407e91b/tensorflow_io_gcs_filesystem-0.18.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.5MB)\n", - "\u001b[K |████████████████████████████████| 2.5MB 36.8MB/s \n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d2/b7/b76c28a422ebaf1c3d97aa6553e8620cc3b0d91976415b4ca255176c7946/tensorflow_io-0.19.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (22.7MB)\n", + "\u001b[K |████████████████████████████████| 22.7MB 128kB/s \n", + "\u001b[?25hCollecting tensorflow-io-gcs-filesystem==0.19.0\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/5d/3a/5c1cc819ff1adfd47fa119a8b904a12207c64bdb1f61f2ef726f03a0cdc6/tensorflow_io_gcs_filesystem-0.19.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.3MB)\n", + "\u001b[K |████████████████████████████████| 2.3MB 31.6MB/s \n", "\u001b[?25hRequirement already satisfied: tensorflow<2.6.0,>=2.5.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow-io) (2.5.0)\n", - "Requirement already 
satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.7/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.4.8)\n", + "Requirement already satisfied: absl-py~=0.10 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.12.0)\n", + "Requirement already satisfied: tensorboard~=2.5 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (2.5.0)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.4.1)\n", + "Requirement already satisfied: pyasn1>=0.1.3 in /usr/local/lib/python3.7/dist-packages (from rsa<5,>=3.1.4; python_version >= \"3.6\"->google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.4.8)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.1.1)\n", "Installing collected packages: tensorflow-io-gcs-filesystem, tensorflow-io\n", - "Successfully installed tensorflow-io-0.18.0 tensorflow-io-gcs-filesystem-0.18.0\n" - ], - "name": "stdout" + "Successfully installed tensorflow-io-0.19.0 tensorflow-io-gcs-filesystem-0.19.0\n" + ] } + ], + "source": [ + "!pip install tensorflow-io" ] }, { "cell_type": "code", + "execution_count": 2, "metadata": { "id": "IqR2PQG4ZaZ0" }, + "outputs": [], "source": [ "import tensorflow as tf\n", "import tensorflow_io as tfio" - ], - "execution_count": 2, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -162,30 +146,26 @@ }, { "cell_type": "code", + "execution_count": 3, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "zaiXjZiXzrHs", - "outputId": "c838eb24-80f7-4862-f9b4-d5a3b7302125" + 
"id": "zaiXjZiXzrHs" }, - "source": [ - "!curl -OL https://github.com/tensorflow/io/raw/master/tests/test_orc/iris.orc\n", - "!ls -l iris.orc" - ], - "execution_count": 3, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", - "\r 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\r100 144 100 144 0 0 1655 0 --:--:-- --:--:-- --:--:-- 1655\n", - "\r100 3328 100 3328 0 0 18592 0 --:--:-- --:--:-- --:--:-- 18592\n", - "-rw-r--r-- 1 root root 3328 Jun 25 06:15 iris.orc\n" - ], - "name": "stdout" + "100 144 100 144 0 0 862 0 --:--:-- --:--:-- --:--:-- 862\n", + "100 3328 100 3328 0 0 9618 0 --:--:-- --:--:-- --:--:-- 0\n", + "-rw-r--r-- 1 root root 3328 Jun 25 18:05 iris.orc\n" + ] } + ], + "source": [ + "!curl -OL https://github.com/tensorflow/io/raw/master/tests/test_orc/iris.orc\n", + "!ls -l iris.orc" ] }, { @@ -199,38 +179,43 @@ }, { "cell_type": "code", + "execution_count": 4, "metadata": { "id": "ppFAjXAYsj-z" }, + "outputs": [], "source": [ "dataset = tfio.IODataset.from_orc(\"iris.orc\", capacity=15).batch(1)" - ], - "execution_count": 4, - "outputs": [] + ] }, { - "cell_type": "code", + "cell_type": "markdown", "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "9B1QUKG70Lzs", - "outputId": "acf21714-2113-4f8a-ce2e-18ddbd44ef7d" + "id": "4xPr3f4LVdeN" }, "source": [ - "for sepal_length, sepal_width, petal_length, petal_width, species in dataset:\n", - " print(sepal_length.numpy(), sepal_width.numpy(), petal_length.numpy(), petal_width.numpy(), species.numpy())\n", - " break" - ], - "execution_count": 9, + "Examine the dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "9B1QUKG70Lzs" + }, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "[5.1] [3.5] [1.4] [0.2] [b'setosa']\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "for sepal_length, 
sepal_width, petal_length, petal_width, species in dataset:\n", + " print(sepal_length.numpy(), sepal_width.numpy(), petal_length.numpy(), petal_width.numpy(), species.numpy())\n", + " break" ] }, { @@ -242,15 +227,31 @@ "## Build a model reading ORC with Keras" ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "03qncHJPVNK3" + }, + "source": [ + "Let's walk through an end-to-end example of tf.keras model training with ORC dataset based on iris dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nDgkfWFRVjKz" + }, + "source": [ + "Configure which columns are features, and which column is label:" + ] + }, { "cell_type": "code", + "execution_count": 6, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "R1OYAybz07dr", - "outputId": "bd298eae-efd9-4970-98f0-3b8bf5941c64" + "id": "R1OYAybz07dr" }, + "outputs": [], "source": [ "feature_cols = [\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]\n", "label_cols = [\"species\"]\n", @@ -258,8 +259,26 @@ "# select feature columns\n", "feature_dataset = tfio.IODataset.from_orc(\"iris.orc\", columns=feature_cols)\n", "# select label columns\n", - "label_dataset = tfio.IODataset.from_orc(\"iris.orc\", columns=label_cols)\n", - "\n", + "label_dataset = tfio.IODataset.from_orc(\"iris.orc\", columns=label_cols)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GSYMP48vVvV0" + }, + "source": [ + "A util function to map species to float numbers for model training:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "TQvuE7OgVs1q" + }, + "outputs": [], + "source": [ "@tf.function\n", "def species_float_conversion(x):\n", " if x == \"virginica\":\n", @@ -268,8 +287,26 @@ " return 2.0\n", " if x == \"setosa\":\n", " return 3.0\n", - " return 4.0\n", - "\n", + " return 4.0\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U9iy27hXV-yv" + }, + "source": [ + "Dataset preprocessing:" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "lpf0w41iWAZ4" + }, + "outputs": [], + "source": [ "label_dataset = label_dataset.map(species_float_conversion)\n", "dataset = tf.data.Dataset.zip((feature_dataset, label_dataset))\n", "dataset = dataset.batch(1)\n", @@ -279,52 +316,83 @@ " features = tf.stack(list(features), axis=1)\n", " return features, labels\n", "\n", - "dataset = dataset.map(pack_features_vector)\n", - "\n", - "model = tf.keras.Sequential(\n", - " [\n", - " tf.keras.layers.Dense(\n", - " 10, activation=tf.nn.relu, input_shape=(4,)\n", - " ), # input shape required\n", - " tf.keras.layers.Dense(10, activation=tf.nn.relu),\n", - " tf.keras.layers.Dense(3),\n", - " ]\n", - ")\n", - "\n", - "model.compile(optimizer=\"adam\", loss=\"binary_crossentropy\", metrics=[\"accuracy\"])\n", - "model.fit(dataset, epochs=5)" - ], - "execution_count": 10, + "dataset = dataset.map(pack_features_vector)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oVB9Q0B-WDn4" + }, + "source": [ + "Finally, build the keras model and train it!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "tToy0FoOWG-9" + }, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/5\n", - "150/150 [==============================] - 1s 1ms/step - loss: -15.2493 - accuracy: 0.3333\n", + "150/150 [==============================] - 1s 2ms/step - loss: 15.6634 - accuracy: 0.3333\n", "Epoch 2/5\n", - "150/150 [==============================] - 0s 1ms/step - loss: -15.2493 - accuracy: 0.3333\n", + "150/150 [==============================] - 0s 2ms/step - loss: 15.4835 - accuracy: 0.3333\n", "Epoch 3/5\n", - "150/150 [==============================] - 0s 2ms/step - loss: -15.2493 - accuracy: 0.3333\n", + "150/150 [==============================] - 0s 2ms/step - loss: 15.4835 - accuracy: 0.3333\n", "Epoch 4/5\n", - "150/150 [==============================] - 0s 1ms/step - loss: -15.2493 - accuracy: 0.3333\n", + "150/150 [==============================] - 0s 2ms/step - loss: 15.4835 - accuracy: 0.3333\n", "Epoch 5/5\n", - "150/150 [==============================] - 0s 1ms/step - loss: -15.2493 - accuracy: 0.3333\n" - ], - "name": "stdout" + "150/150 [==============================] - 0s 2ms/step - loss: 15.4835 - accuracy: 0.3333\n" + ] }, { - "output_type": "execute_result", "data": { "text/plain": [ - "" + "" ] }, + "execution_count": 9, "metadata": { "tags": [] }, - "execution_count": 10 + "output_type": "execute_result" } + ], + "source": [ + "model = tf.keras.Sequential(\n", + " [\n", + " tf.keras.layers.Dense(\n", + " 10, activation=tf.nn.relu, input_shape=(4,)\n", + " ), # input shape required\n", + " tf.keras.layers.Dense(10, activation=tf.nn.relu),\n", + " tf.keras.layers.Dense(3),\n", + " ]\n", + ")\n", + "\n", + "model.compile(optimizer=\"adam\", loss=\"binary_crossentropy\", metrics=[\"accuracy\"])\n", + "model.fit(dataset, epochs=5)" ] } - ] -} \ No newline at end of file + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "Tce3stUlHN0L" + ], + 
"name": "orc.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From e5b313f85fa15c35ca2dd30756b6679da44f5767 Mon Sep 17 00:00:00 2001 From: khu Date: Sat, 26 Jun 2021 10:55:32 -0700 Subject: [PATCH 04/10] address comments --- docs/tutorials/orc.ipynb | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/docs/tutorials/orc.ipynb b/docs/tutorials/orc.ipynb index defed0cf3..e4131f57d 100644 --- a/docs/tutorials/orc.ipynb +++ b/docs/tutorials/orc.ipynb @@ -70,7 +70,7 @@ "source": [ "## Overview\n", "\n", - "Apache ORC is a popular columnar storage format. tensorflow-io package provides a default implementation of reading Apache ORC files." + "Apache ORC is a popular columnar storage format. tensorflow-io package provides a default implementation of reading [Apache ORC](https://orc.apache.org/) files." ] }, { @@ -88,7 +88,7 @@ "id": "1Eh-iCRVBm0p" }, "source": [ - "Install required Packages, and restart runtime\n" + "Install required packages, and restart runtime\n" ] }, { @@ -110,7 +110,6 @@ "\u001b[K |████████████████████████████████| 2.3MB 31.6MB/s \n", "\u001b[?25hRequirement already satisfied: tensorflow<2.6.0,>=2.5.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow-io) (2.5.0)\n", "Requirement already satisfied: absl-py~=0.10 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.12.0)\n", - "Requirement already satisfied: tensorboard~=2.5 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (2.5.0)\n", "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.4.1)\n", "Requirement already satisfied: pyasn1>=0.1.3 in /usr/local/lib/python3.7/dist-packages (from rsa<5,>=3.1.4; 
python_version >= \"3.6\"->google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.4.8)\n", "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.1.1)\n", @@ -141,7 +140,7 @@ "id": "EyHfC3nEzseN" }, "source": [ - "## Download a sample ORC file" + "### Download a sample ORC file" ] }, { @@ -174,7 +173,7 @@ "id": "7DG9JTJ0-bzg" }, "source": [ - "## Create a dataset from the file" + "### Create a dataset from the file" ] }, { @@ -236,6 +235,15 @@ "Let's walk through an end-to-end example of tf.keras model training with ORC dataset based on iris dataset." ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "tDkpKRMVcPfb" + }, + "source": [ + "### Data preprocessing" + ] + }, { "cell_type": "markdown", "metadata": { @@ -319,6 +327,15 @@ "dataset = dataset.map(pack_features_vector)" ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "R1Tyf3AodC2Y" + }, + "source": [ + "### Build, compile and train the model" + ] + }, { "cell_type": "markdown", "metadata": { From feacbdd4363f988fd8ecac8527b005a174e8819f Mon Sep 17 00:00:00 2001 From: khu Date: Sat, 26 Jun 2021 22:36:32 -0700 Subject: [PATCH 05/10] address comments --- docs/tutorials/orc.ipynb | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/docs/tutorials/orc.ipynb b/docs/tutorials/orc.ipynb index e4131f57d..d261c9631 100644 --- a/docs/tutorials/orc.ipynb +++ b/docs/tutorials/orc.ipynb @@ -173,7 +173,7 @@ "id": "7DG9JTJ0-bzg" }, "source": [ - "### Create a dataset from the file" + "## Create a dataset from the file" ] }, { @@ -212,18 +212,8 @@ } ], "source": [ - "for sepal_length, sepal_width, petal_length, petal_width, species in dataset:\n", - " print(sepal_length.numpy(), sepal_width.numpy(), petal_length.numpy(), petal_width.numpy(), species.numpy())\n", - " break" - 
] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "h4u4Z17h0-Cl" - }, - "source": [ - "## Build a model reading ORC with Keras" + "for item in dataset.take(1):\n", + " print(item)\n" ] }, { @@ -298,15 +288,6 @@ " return 4.0\n" ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "U9iy27hXV-yv" - }, - "source": [ - "Dataset preprocessing:" - ] - }, { "cell_type": "code", "execution_count": 8, @@ -333,7 +314,7 @@ "id": "R1Tyf3AodC2Y" }, "source": [ - "### Build, compile and train the model" + "## Build, compile and train the model" ] }, { From cfeb22d75e604f40c3fccaaac53de7c1fe002a53 Mon Sep 17 00:00:00 2001 From: khu Date: Tue, 29 Jun 2021 22:43:59 -0700 Subject: [PATCH 06/10] address comment: remove outputs and add desc for dataset --- docs/tutorials/orc.ipynb | 175 +++++++++++++-------------------------- 1 file changed, 58 insertions(+), 117 deletions(-) diff --git a/docs/tutorials/orc.ipynb b/docs/tutorials/orc.ipynb index d261c9631..c0054b0c8 100644 --- a/docs/tutorials/orc.ipynb +++ b/docs/tutorials/orc.ipynb @@ -1,4 +1,20 @@ { + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "orc.ipynb", + "provenance": [], + "collapsed_sections": [ + "Tce3stUlHN0L" + ], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, "cells": [ { "cell_type": "markdown", @@ -11,12 +27,10 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "cellView": "form", "id": "tuOe1ymfHZPu" }, - "outputs": [], "source": [ "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", @@ -29,7 +43,9 @@ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." 
- ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -93,46 +109,26 @@ }, { "cell_type": "code", - "execution_count": 1, "metadata": { "id": "g7cxbf1-skn6" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting tensorflow-io\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d2/b7/b76c28a422ebaf1c3d97aa6553e8620cc3b0d91976415b4ca255176c7946/tensorflow_io-0.19.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (22.7MB)\n", - "\u001b[K |████████████████████████████████| 22.7MB 128kB/s \n", - "\u001b[?25hCollecting tensorflow-io-gcs-filesystem==0.19.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/5d/3a/5c1cc819ff1adfd47fa119a8b904a12207c64bdb1f61f2ef726f03a0cdc6/tensorflow_io_gcs_filesystem-0.19.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.3MB)\n", - "\u001b[K |████████████████████████████████| 2.3MB 31.6MB/s \n", - "\u001b[?25hRequirement already satisfied: tensorflow<2.6.0,>=2.5.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow-io) (2.5.0)\n", - "Requirement already satisfied: absl-py~=0.10 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.12.0)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.4.1)\n", - "Requirement already satisfied: pyasn1>=0.1.3 in /usr/local/lib/python3.7/dist-packages (from rsa<5,>=3.1.4; python_version >= \"3.6\"->google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (0.4.8)\n", - "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow<2.6.0,>=2.5.0->tensorflow-io) (3.1.1)\n", - "Installing collected packages: 
tensorflow-io-gcs-filesystem, tensorflow-io\n", - "Successfully installed tensorflow-io-0.19.0 tensorflow-io-gcs-filesystem-0.19.0\n" - ] - } - ], "source": [ "!pip install tensorflow-io" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "code", - "execution_count": 2, "metadata": { "id": "IqR2PQG4ZaZ0" }, - "outputs": [], "source": [ "import tensorflow as tf\n", "import tensorflow_io as tfio" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -140,32 +136,29 @@ "id": "EyHfC3nEzseN" }, "source": [ - "### Download a sample ORC file" + "### Download a sample dataset file in ORC" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZjEeF6Fva8UO" + }, + "source": [ + "The dataset we use here is the [Iris Data Set](https://archive.ics.uci.edu/ml/datasets/iris) from UCI. The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. It has 4 attributes: (1) sepal length, (2) sepal width, (3) petal length, (4) petal width, and the last column contains the class label." 
] }, { "cell_type": "code", - "execution_count": 3, "metadata": { "id": "zaiXjZiXzrHs" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " % Total % Received % Xferd Average Speed Time Time Time Current\n", - " Dload Upload Total Spent Left Speed\n", - "100 144 100 144 0 0 862 0 --:--:-- --:--:-- --:--:-- 862\n", - "100 3328 100 3328 0 0 9618 0 --:--:-- --:--:-- --:--:-- 0\n", - "-rw-r--r-- 1 root root 3328 Jun 25 18:05 iris.orc\n" - ] - } - ], "source": [ "!curl -OL https://github.com/tensorflow/io/raw/master/tests/test_orc/iris.orc\n", "!ls -l iris.orc" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -178,14 +171,14 @@ }, { "cell_type": "code", - "execution_count": 4, "metadata": { "id": "ppFAjXAYsj-z" }, - "outputs": [], "source": [ "dataset = tfio.IODataset.from_orc(\"iris.orc\", capacity=15).batch(1)" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -198,23 +191,15 @@ }, { "cell_type": "code", - "execution_count": 5, "metadata": { "id": "9B1QUKG70Lzs" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[5.1] [3.5] [1.4] [0.2] [b'setosa']\n" - ] - } - ], "source": [ "for item in dataset.take(1):\n", " print(item)\n" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -245,11 +230,9 @@ }, { "cell_type": "code", - "execution_count": 6, "metadata": { "id": "R1OYAybz07dr" }, - "outputs": [], "source": [ "feature_cols = [\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]\n", "label_cols = [\"species\"]\n", @@ -258,7 +241,9 @@ "feature_dataset = tfio.IODataset.from_orc(\"iris.orc\", columns=feature_cols)\n", "# select label columns\n", "label_dataset = tfio.IODataset.from_orc(\"iris.orc\", columns=label_cols)" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -271,11 +256,9 @@ }, { "cell_type": "code", - "execution_count": 7, "metadata": { "id": 
"TQvuE7OgVs1q" }, - "outputs": [], "source": [ "@tf.function\n", "def species_float_conversion(x):\n", @@ -286,15 +269,15 @@ " if x == \"setosa\":\n", " return 3.0\n", " return 4.0\n" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "code", - "execution_count": 8, "metadata": { "id": "lpf0w41iWAZ4" }, - "outputs": [], "source": [ "label_dataset = label_dataset.map(species_float_conversion)\n", "dataset = tf.data.Dataset.zip((feature_dataset, label_dataset))\n", @@ -306,7 +289,9 @@ " return features, labels\n", "\n", "dataset = dataset.map(pack_features_vector)" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -323,45 +308,14 @@ "id": "oVB9Q0B-WDn4" }, "source": [ - "Finally, build the keras model and train it!" + "Finally, we are ready to build the model and train it! We will build a 3 layer keras model to predict the class of the iris plant from the dataset we just processed." ] }, { "cell_type": "code", - "execution_count": 9, "metadata": { "id": "tToy0FoOWG-9" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1/5\n", - "150/150 [==============================] - 1s 2ms/step - loss: 15.6634 - accuracy: 0.3333\n", - "Epoch 2/5\n", - "150/150 [==============================] - 0s 2ms/step - loss: 15.4835 - accuracy: 0.3333\n", - "Epoch 3/5\n", - "150/150 [==============================] - 0s 2ms/step - loss: 15.4835 - accuracy: 0.3333\n", - "Epoch 4/5\n", - "150/150 [==============================] - 0s 2ms/step - loss: 15.4835 - accuracy: 0.3333\n", - "Epoch 5/5\n", - "150/150 [==============================] - 0s 2ms/step - loss: 15.4835 - accuracy: 0.3333\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], "source": [ "model = tf.keras.Sequential(\n", " [\n", @@ -375,22 +329,9 @@ "\n", "model.compile(optimizer=\"adam\", loss=\"binary_crossentropy\", 
metrics=[\"accuracy\"])\n", "model.fit(dataset, epochs=5)" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [ - "Tce3stUlHN0L" ], - "name": "orc.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" + "execution_count": null, + "outputs": [] } - }, - "nbformat": 4, - "nbformat_minor": 0 -} + ] +} \ No newline at end of file From d3730dbfbf90814ceab7e45c2848c841da44d3b5 Mon Sep 17 00:00:00 2001 From: khu Date: Tue, 29 Jun 2021 23:51:33 -0700 Subject: [PATCH 07/10] fix lint --- docs/tutorials/orc.ipynb | 93 ++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 47 deletions(-) diff --git a/docs/tutorials/orc.ipynb b/docs/tutorials/orc.ipynb index c0054b0c8..d0da160c8 100644 --- a/docs/tutorials/orc.ipynb +++ b/docs/tutorials/orc.ipynb @@ -1,20 +1,4 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "orc.ipynb", - "provenance": [], - "collapsed_sections": [ - "Tce3stUlHN0L" - ], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, "cells": [ { "cell_type": "markdown", @@ -27,10 +11,12 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "cellView": "form", "id": "tuOe1ymfHZPu" }, + "outputs": [], "source": [ "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", @@ -43,9 +29,7 @@ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." 
- ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -109,26 +93,26 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "g7cxbf1-skn6" }, + "outputs": [], "source": [ "!pip install tensorflow-io" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "IqR2PQG4ZaZ0" }, + "outputs": [], "source": [ "import tensorflow as tf\n", "import tensorflow_io as tfio" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -150,15 +134,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "zaiXjZiXzrHs" }, + "outputs": [], "source": [ "!curl -OL https://github.com/tensorflow/io/raw/master/tests/test_orc/iris.orc\n", "!ls -l iris.orc" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -171,14 +155,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ppFAjXAYsj-z" }, + "outputs": [], "source": [ "dataset = tfio.IODataset.from_orc(\"iris.orc\", capacity=15).batch(1)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -191,15 +175,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "9B1QUKG70Lzs" }, + "outputs": [], "source": [ "for item in dataset.take(1):\n", " print(item)\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -230,9 +214,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "R1OYAybz07dr" }, + "outputs": [], "source": [ "feature_cols = [\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]\n", "label_cols = [\"species\"]\n", @@ -241,9 +227,7 @@ "feature_dataset = tfio.IODataset.from_orc(\"iris.orc\", columns=feature_cols)\n", "# select label columns\n", "label_dataset = tfio.IODataset.from_orc(\"iris.orc\", columns=label_cols)" - ], - "execution_count": null, - "outputs": [] + ] }, { 
"cell_type": "markdown", @@ -256,9 +240,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "TQvuE7OgVs1q" }, + "outputs": [], "source": [ "@tf.function\n", "def species_float_conversion(x):\n", @@ -269,15 +255,15 @@ " if x == \"setosa\":\n", " return 3.0\n", " return 4.0\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "lpf0w41iWAZ4" }, + "outputs": [], "source": [ "label_dataset = label_dataset.map(species_float_conversion)\n", "dataset = tf.data.Dataset.zip((feature_dataset, label_dataset))\n", @@ -289,9 +275,7 @@ " return features, labels\n", "\n", "dataset = dataset.map(pack_features_vector)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -313,9 +297,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "tToy0FoOWG-9" }, + "outputs": [], "source": [ "model = tf.keras.Sequential(\n", " [\n", @@ -329,9 +315,22 @@ "\n", "model.compile(optimizer=\"adam\", loss=\"binary_crossentropy\", metrics=[\"accuracy\"])\n", "model.fit(dataset, epochs=5)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "Tce3stUlHN0L" ], - "execution_count": null, - "outputs": [] + "name": "orc.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} From a50511b6b3cba9464a3e6675723e7db5c5c06e7c Mon Sep 17 00:00:00 2001 From: khu Date: Wed, 30 Jun 2021 00:03:37 -0700 Subject: [PATCH 08/10] fix lint: Prefer second person instead of first person. 
--- docs/tutorials/orc.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/orc.ipynb b/docs/tutorials/orc.ipynb index d0da160c8..50ffbd305 100644 --- a/docs/tutorials/orc.ipynb +++ b/docs/tutorials/orc.ipynb @@ -129,7 +129,7 @@ "id": "ZjEeF6Fva8UO" }, "source": [ - "The dataset we use here is the [Iris Data Set](https://archive.ics.uci.edu/ml/datasets/iris) from UCI. The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. It has 4 attributes: (1) sepal length, (2) sepal width, (3) petal length, (4) petal width, and the last column contains the class label." + "The dataset you will use here is the [Iris Data Set](https://archive.ics.uci.edu/ml/datasets/iris) from UCI. The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. It has 4 attributes: (1) sepal length, (2) sepal width, (3) petal length, (4) petal width, and the last column contains the class label." ] }, { @@ -292,7 +292,7 @@ "id": "oVB9Q0B-WDn4" }, "source": [ - "Finally, we are ready to build the model and train it! We will build a 3 layer keras model to predict the class of the iris plant from the dataset we just processed." + "Finally, you are ready to build the model and train it! You will build a 3 layer keras model to predict the class of the iris plant from the dataset you just processed." 
] }, { From ee0c3bfd6a4a2af425920cc3e9698396a9e5588b Mon Sep 17 00:00:00 2001 From: khu Date: Mon, 12 Jul 2021 09:17:24 -0700 Subject: [PATCH 09/10] address comments --- docs/tutorials/orc.ipynb | 41 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/docs/tutorials/orc.ipynb b/docs/tutorials/orc.ipynb index 50ffbd305..0ec678410 100644 --- a/docs/tutorials/orc.ipynb +++ b/docs/tutorials/orc.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "cellView": "form", "id": "tuOe1ymfHZPu" @@ -93,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "id": "g7cxbf1-skn6" }, @@ -104,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "id": "IqR2PQG4ZaZ0" }, @@ -134,7 +134,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "id": "zaiXjZiXzrHs" }, @@ -155,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": { "id": "ppFAjXAYsj-z" }, @@ -175,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "metadata": { "id": "9B1QUKG70Lzs" }, @@ -214,7 +214,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "metadata": { "id": "R1OYAybz07dr" }, @@ -240,32 +240,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 48, "metadata": { "id": "TQvuE7OgVs1q" }, "outputs": [], "source": [ - "@tf.function\n", - "def species_float_conversion(x):\n", - " if x == \"virginica\":\n", - " return 1.0\n", - " if x == \"versicolor\":\n", - " return 2.0\n", - " if x == \"setosa\":\n", - " return 3.0\n", - " return 4.0\n" + "vocab_init = tf.lookup.KeyValueTensorInitializer(\n", + " keys=tf.constant([\"virginica\", \"versicolor\", \"setosa\"]),\n", + " values=tf.constant([0, 1, 2], dtype=tf.int64))\n", + 
"vocab_table = tf.lookup.StaticVocabularyTable(\n", + " init,\n", + " num_oov_buckets=4)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "metadata": { "id": "lpf0w41iWAZ4" }, "outputs": [], "source": [ - "label_dataset = label_dataset.map(species_float_conversion)\n", + "label_dataset = label_dataset.map(vocab_table.lookup)\n", "dataset = tf.data.Dataset.zip((feature_dataset, label_dataset))\n", "dataset = dataset.batch(1)\n", "\n", @@ -297,7 +294,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "metadata": { "id": "tToy0FoOWG-9" }, @@ -307,13 +304,13 @@ " [\n", " tf.keras.layers.Dense(\n", " 10, activation=tf.nn.relu, input_shape=(4,)\n", - " ), # input shape required\n", + " ),\n", " tf.keras.layers.Dense(10, activation=tf.nn.relu),\n", " tf.keras.layers.Dense(3),\n", " ]\n", ")\n", "\n", - "model.compile(optimizer=\"adam\", loss=\"binary_crossentropy\", metrics=[\"accuracy\"])\n", + "model.compile(optimizer=\"adam\", loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=[\"accuracy\"])\n", "model.fit(dataset, epochs=5)" ] } From 9b92e55a8d9ad432192bb349bcee54f5733eea4f Mon Sep 17 00:00:00 2001 From: Keqiu Hu Date: Wed, 14 Jul 2021 23:51:59 -0700 Subject: [PATCH 10/10] fix typo --- docs/tutorials/orc.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/orc.ipynb b/docs/tutorials/orc.ipynb index 0ec678410..e94ac6cf3 100644 --- a/docs/tutorials/orc.ipynb +++ b/docs/tutorials/orc.ipynb @@ -250,7 +250,7 @@ " keys=tf.constant([\"virginica\", \"versicolor\", \"setosa\"]),\n", " values=tf.constant([0, 1, 2], dtype=tf.int64))\n", "vocab_table = tf.lookup.StaticVocabularyTable(\n", - " init,\n", + " vocab_init,\n", " num_oov_buckets=4)" ] },