From 0ca829117a82f9e57c91c6f9cfa9376bbd183a28 Mon Sep 17 00:00:00 2001 From: Tyler Simpson <94872173+TylerJSimpson@users.noreply.github.com> Date: Sun, 26 Feb 2023 18:52:01 -0500 Subject: [PATCH] Add files via upload --- week_5/homework/week5_homework.ipynb | 200 ++++++++++++++++----------- 1 file changed, 122 insertions(+), 78 deletions(-) diff --git a/week_5/homework/week5_homework.ipynb b/week_5/homework/week5_homework.ipynb index ec64355..1005e67 100644 --- a/week_5/homework/week5_homework.ipynb +++ b/week_5/homework/week5_homework.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 99, + "execution_count": 11, "id": "442eaf62", "metadata": {}, "outputs": [], @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 12, "id": "7c73fdba", "metadata": {}, "outputs": [ @@ -35,7 +35,7 @@ "'3.3.1'" ] }, - "execution_count": 4, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -62,26 +62,10 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 13, "id": "ebc9c270", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Setting default log level to \"WARN\".\n", - "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "23/02/26 22:10:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" - ] - } - ], + "outputs": [], "source": [ "spark = SparkSession.builder \\\n", " .master(\"local[*]\") \\\n", @@ -158,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "id": "38f093a9", "metadata": {}, "outputs": [ @@ -184,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 5, "id": "acc1cbc9", "metadata": {}, "outputs": [], @@ -196,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 6, "id": "aed84a70", "metadata": {}, "outputs": [], @@ -210,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 7, "id": "92850d04", "metadata": {}, "outputs": [ @@ -238,9 +222,11 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 8, "id": "43b4819d", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -252,7 +238,7 @@ " Row(dispatching_base_num='B02510', pickup_datetime=datetime.datetime(2021, 6, 1, 0, 45, 42), dropoff_datetime=datetime.datetime(2021, 6, 1, 1, 3, 33), PULocationID='144', DOLocationID='146', SR_Flag='N', Affiliated_base_number=None, pickup_date=datetime.date(2021, 6, 1), dropoff_date=datetime.date(2021, 6, 1))]" ] }, - "execution_count": 55, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -271,7 +257,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 9, "id": "544bb798", "metadata": {}, "outputs": [ @@ -281,13 +267,13 @@ "DataFrame[dispatching_base_num: string, pickup_datetime: timestamp, dropoff_datetime: timestamp, PULocationID: string, DOLocationID: string, SR_Flag: string, Affiliated_base_number: string, pickup_date: date, dropoff_date: date]" ] }, - "execution_count": 56, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_cleaned.repartition(24)" + "df_cleaned.repartition(12)" ] }, { @@ -300,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 10, "id": "6c986c22", "metadata": {}, "outputs": [ @@ -334,7 +320,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 14, "id": "d15e9977", "metadata": {}, "outputs": [], @@ -344,7 +330,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 15, "id": "6383eb77", "metadata": {}, "outputs": [ @@ -362,7 +348,7 @@ " 'dropoff_date']" ] }, - "execution_count": 59, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -373,7 +359,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 16, "id": "06be3b26", "metadata": {}, "outputs": [ @@ -387,7 +373,7 @@ " Row(dispatching_base_num='B02875', pickup_datetime=datetime.datetime(2021, 6, 13, 21, 56, 9), dropoff_datetime=datetime.datetime(2021, 6, 13, 22, 12, 10), PULocationID='254', DOLocationID='265', SR_Flag='N', Affiliated_base_number='B02875', pickup_date=datetime.date(2021, 6, 13), dropoff_date=datetime.date(2021, 6, 13))]" ] }, - "execution_count": 60, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -398,7 +384,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 17, "id": "c301c311", "metadata": {}, "outputs": [ @@ -426,7 +412,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 18, "id": "ee8ce669", "metadata": {}, "outputs": [ @@ -445,7 +431,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 19, "id": "4477f366", "metadata": {}, "outputs": [ @@ -489,7 +475,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 21, "id": "500cd1fc", "metadata": {}, "outputs": [ @@ -497,37 +483,37 @@ "name": "stderr", "output_type": "stream", "text": [ - "[Stage 53:=================================> (4 + 3) / 7]\r" + "[Stage 11:=========================================> (5 + 2) / 7]\r" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "+-----------+---------------------------------------------------------------------------------------------------------------------+\n", - "|pickup_date|((unix_timestamp(dropoff_datetime, yyyy-MM-dd HH:mm:ss) - unix_timestamp(pickup_datetime, yyyy-MM-dd HH:mm:ss)) / 60)|\n", - "+-----------+---------------------------------------------------------------------------------------------------------------------+\n", - "| 2021-06-25| 4012.733333333333|\n", - "| 2021-06-22| 1532.9833333333333|\n", - "| 2021-06-27| 1198.85|\n", - "| 2021-06-26| 1091.8333333333333|\n", - "| 2021-06-23| 988.0166666666667|\n", - "| 2021-06-23| 856.1333333333333|\n", - "| 2021-06-24| 834.5833333333334|\n", - "| 2021-06-04| 700.2|\n", - "| 2021-06-27| 681.95|\n", - "| 2021-06-20| 659.0666666666667|\n", - "| 2021-06-01| 616.05|\n", - "| 2021-06-01| 597.9833333333333|\n", - "| 2021-06-28| 597.9833333333333|\n", - "| 2021-06-27| 578.2666666666667|\n", - "| 2021-06-18| 577.4666666666667|\n", - "| 2021-06-08| 568.8166666666667|\n", - "| 2021-06-11| 568.3|\n", - "| 2021-06-15| 564.1333333333333|\n", - "| 2021-06-25| 563.6166666666667|\n", - "| 2021-06-04| 562.6166666666667|\n", - "+-----------+---------------------------------------------------------------------------------------------------------------------+\n", + "+-----------+----------------------------------------------------------------------------------------------------------------------------+\n", + "|pickup_date|(((unix_timestamp(dropoff_datetime, yyyy-MM-dd HH:mm:ss) - unix_timestamp(pickup_datetime, yyyy-MM-dd HH:mm:ss)) / 60) / 60)|\n", + "+-----------+----------------------------------------------------------------------------------------------------------------------------+\n", + "| 2021-06-25| 66.87888888888888|\n", + "| 2021-06-22| 25.549722222222222|\n", + "| 2021-06-27| 19.980833333333333|\n", + "| 2021-06-26| 18.19722222222222|\n", + "| 2021-06-23| 16.466944444444444|\n", + "| 2021-06-23| 14.268888888888888|\n", + "| 2021-06-24| 13.909722222222223|\n", + "| 2021-06-04| 11.67|\n", + "| 2021-06-27| 11.365833333333335|\n", + "| 2021-06-20| 10.984444444444446|\n", + "| 2021-06-01| 10.2675|\n", + "| 2021-06-28| 9.96638888888889|\n", + "| 2021-06-01| 9.96638888888889|\n", + "| 2021-06-27| 9.637777777777778|\n", + "| 2021-06-18| 9.624444444444444|\n", + "| 2021-06-08| 9.48027777777778|\n", + "| 2021-06-11| 9.471666666666666|\n", + "| 2021-06-15| 9.402222222222223|\n", + "| 2021-06-25| 9.393611111111111|\n", + "| 2021-06-04| 9.376944444444444|\n", + "+-----------+----------------------------------------------------------------------------------------------------------------------------+\n", "only showing top 20 rows\n", "\n" ] @@ -544,7 +530,7 @@ "source": [ "spark.sql(\"\"\"\n", "SELECT pickup_date,\n", - " (unix_timestamp(dropoff_datetime)-unix_timestamp(pickup_datetime))/60\n", + " (unix_timestamp(dropoff_datetime)-unix_timestamp(pickup_datetime))/60/60\n", "FROM fhvhv_2021_06 \n", "GROUP BY 1,2\n", "ORDER BY 2 DESC\n", @@ -654,7 +640,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 22, "id": "bcf51060", "metadata": {}, "outputs": [], @@ -664,7 +650,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 23, "id": "f6f248e6", "metadata": {}, "outputs": [ @@ -674,7 +660,7 @@ "['LocationID', 'Borough', 'Zone', 'service_zone']" ] }, - "execution_count": 78, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -685,7 +671,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 24, "id": "758ba987", "metadata": {}, "outputs": [ @@ -699,7 +685,7 @@ " Row(LocationID='5', Borough='Staten Island', Zone='Arden Heights', service_zone='Boro Zone')]" ] }, - "execution_count": 79, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -710,7 +696,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 25, "id": "fb7cab21", "metadata": {}, "outputs": [ @@ -733,7 +719,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 26, "id": "9df83500", "metadata": {}, "outputs": [], @@ -744,7 +730,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 27, "id": "4fdcf805", "metadata": {}, "outputs": [], @@ -756,7 +742,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 28, "id": "76440592", "metadata": {}, "outputs": [ @@ -780,7 +766,7 @@ " |-- service_zone: string (nullable = true)\n", " |-- LocationID: string (nullable = true)\n", " |-- Borough: string (nullable = true)\n", - " |-- Zone: string (nullable = true)\n", + " |-- DO_Zone: string (nullable = true)\n", " |-- service_zone: string (nullable = true)\n", "\n" ] @@ -792,7 +778,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 29, "id": "d4421d9c", "metadata": {}, "outputs": [], @@ -849,6 +835,64 @@ ";\n", "\"\"\").show(5,truncate=False)" ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "72ba4e09", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 18:=========================================> (5 + 2) / 7]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------------+------+\n", + "|PU_Zone |Count |\n", + "+-------------------+------+\n", + "|Crown Heights North|231279|\n", + "|East Village |221244|\n", + "|JFK Airport |188867|\n", + "|Bushwick South |187929|\n", + "|East New York |186780|\n", + "+-------------------+------+\n", + "only showing top 5 rows\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" + ] + } + ], + "source": [ + "spark.sql(\"\"\"\n", + "SELECT DISTINCT(PU_Zone) AS PU_Zone,\n", + " COUNT(PU_Zone) AS Count\n", + "FROM fhvhv_2021_06_result \n", + "GROUP BY 1\n", + "ORDER BY 2 DESC\n", + ";\n", + "\"\"\").show(5,truncate=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "871c7faf", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {