From fff9444da55d046fb33d6caaac6bf9b9048cf2a2 Mon Sep 17 00:00:00 2001 From: Jelle Treep <40466121+jelletreep@users.noreply.github.com> Date: Wed, 27 Sep 2023 17:50:29 +0200 Subject: [PATCH] last minor revisions from pilot (#79) --- book/Introduction_to_python_1.ipynb | 2 +- book/Introduction_to_python_2.ipynb | 4 +- book/Introduction_to_python_3.ipynb | 12 +- book/Introduction_to_python_4.ipynb | 2 +- book/_quarto.yml | 4 + book/data-science-with-pandas-1.ipynb | 43 +- book/data-science-with-pandas-2.ipynb | 14 +- book/data-science-with-pandas-3.ipynb | 17 +- book/data-science-with-pandas-4.ipynb | 10 +- book/installation-and-setup.qmd | 19 +- .../afternoon_exercises_solutions.ipynb | 1337 +++++++++++++++++ .../morning_exercises_solutions.ipynb | 301 +++- course_materials/afternoon_exercises.ipynb | 174 ++- .../empty_notebook_for_coding_along.ipynb | 92 ++ course_materials/morning_exercises.ipynb | 45 +- .../afternoon_exercises_solutions.ipynb | 425 ------ 16 files changed, 1887 insertions(+), 614 deletions(-) create mode 100644 book/solutions/afternoon_exercises_solutions.ipynb rename {course_materials => book}/solutions/morning_exercises_solutions.ipynb (62%) create mode 100644 course_materials/empty_notebook_for_coding_along.ipynb delete mode 100644 course_materials/solutions/afternoon_exercises_solutions.ipynb diff --git a/book/Introduction_to_python_1.ipynb b/book/Introduction_to_python_1.ipynb index 4b2f6ef..bb93656 100644 --- a/book/Introduction_to_python_1.ipynb +++ b/book/Introduction_to_python_1.ipynb @@ -273,7 +273,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/book/Introduction_to_python_2.ipynb b/book/Introduction_to_python_2.ipynb index fca6fd1..3ecbe26 100644 --- a/book/Introduction_to_python_2.ipynb +++ b/book/Introduction_to_python_2.ipynb @@ -76,7 +76,7 @@ "When we want to make use of a function (referred to as calling the function), 
we type the name of the function followed by parentheses. Between the parentheses we can pass arguments.\n", "\n", "**Arguments**\n", - "We typically provide a function with 'arguments' to tell python which values or variables are used to perform the body of the function. In the example below `type` is the function name and `pi_value` is the argument." + "Arguments are the values or variables that we pass to a function; the function uses them when it runs its body. In the example below `type` is the function name and `pi_value` is the argument." ] }, { @@ -397,7 +397,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/book/Introduction_to_python_3.ipynb b/book/Introduction_to_python_3.ipynb index 118aca0..a585a31 100644 --- a/book/Introduction_to_python_3.ipynb +++ b/book/Introduction_to_python_3.ipynb @@ -522,7 +522,7 @@ "source": [ "## Dictionaries\n", "\n", - "A dictionary is another way to store multiple items into one object. In dictionaries, however, this is done with keys and values. This can be useful for several reasons, one example is to store model settings, parameters or variable values for multiple scenarios." + "A dictionary is another way to store multiple items into one object. In dictionaries, however, this is done with keys and values. In dictionaries, keys are typically used to look up values. A good analogy may be the contact list in your phone where you use a name (key) to look up a phone number (value). This can be useful for several reasons, one example is to store model settings, parameters or variable values for multiple scenarios. 
" ] }, { @@ -532,7 +532,7 @@ "metadata": {}, "outputs": [], "source": [ - "my_dict = {'one': 'first', 'two': 'second'}\n", + "my_dict = {'one': 1, 'two': 2}\n", "my_dict" ] }, @@ -569,7 +569,7 @@ "metadata": {}, "outputs": [], "source": [ - "my_dict['third'] = 'three'\n", + "my_dict['three'] = 3\n", "my_dict" ] }, @@ -578,7 +578,7 @@ "id": "1043ec01", "metadata": {}, "source": [ - "Dictionary items are key-value pairs. The keys are changeable and unique. The values are changable, but not necessarily unique." + "Dictionary items are key-value pairs. The keys are changeable and always have to be unique (within a dictionary object). The values within a dictionary are changeable, and don't have to be unique." ] }, { @@ -588,7 +588,7 @@ "metadata": {}, "outputs": [], "source": [ - "my_dict['two'] = 'three'\n", + "my_dict['two'] = 5\n", "my_dict" ] }, @@ -738,7 +738,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/book/Introduction_to_python_4.ipynb b/book/Introduction_to_python_4.ipynb index 603ecc1..f3ac93f 100644 --- a/book/Introduction_to_python_4.ipynb +++ b/book/Introduction_to_python_4.ipynb @@ -168,7 +168,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/book/_quarto.yml b/book/_quarto.yml index 69525cf..0a3b4cf 100644 --- a/book/_quarto.yml +++ b/book/_quarto.yml @@ -28,6 +28,10 @@ book: - data-science-with-pandas-2.ipynb - data-science-with-pandas-3.ipynb - data-science-with-pandas-4.ipynb + - part: "Exercises solutions" + chapters: + - solutions/morning_exercises_solutions.ipynb + - solutions/afternoon_exercises_solutions.ipynb - what-next.qmd - references.qmd repo-url: https://github.com/UtrechtUniversity/workshop-introduction-to-python diff --git a/book/data-science-with-pandas-1.ipynb b/book/data-science-with-pandas-1.ipynb index 
800957c..3d1cdc8 100644 --- a/book/data-science-with-pandas-1.ipynb +++ b/book/data-science-with-pandas-1.ipynb @@ -25,7 +25,7 @@ "\n", "A library (aka package) is a collection of files (aka python scripts) that contains **functions** that can be used to perform specific tasks. A library may also contain data. The functions in a library are typically related and used for a specific purpose, e.g. there are libraries for plotting, handling audio data and machine learning and many many more. Some libraries are built into python, but most packages need to be installed before you can use it.\n", "\n", - "Important to add: libraries are developed and maintained by other Python users. A popular library like Pandas has a large user base and the maintainers are supported by several funders, which makes it a reliable library that is updated very frequently. But this is not always the case, on the other side of the spectrum, a library can also be published once and not maintained at all.\n", + "Libraries are developed and maintained by other Python users. That is why there are so many packages and this is great: there is a huge variety of functions available that you can use instead of programming them yourself. But it is important to be aware that the quality can differ. A popular library like Pandas has a large user base and the maintainers are supported by several funders, which makes it a reliable library that is updated very frequently. But this is not always the case, on the other side of the spectrum, a library can also be published once, badly designed/documented and/or not maintained at all. To check the quality of a library, you can check e.g. 
the number of downloads, the number of contributors, the number of open issues, the date of the last update and the number of stars on GitHub.\n", "\n", "## Pandas\n", "The python library [**Pandas**](https://pandas.pydata.org/about/) is a popular open-source data analysis and data manipulation library for Python which was developed in 2008. The library has some similarities with R, mainly related to the DataFrame data type that is used to handle table like datasets.\n", @@ -212,29 +212,6 @@ "It is, however, enough for a quick exploration of how the dataset looks like in terms of columns names, values, and potential reading errors." ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "Exercise 1\n", - " \n", - "Now go to the Jupyter Dashboard in your internet browser and open the notebook `afternoon_exercises.ipynb` and do exercise 1.\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "::: {.callout-note}\n", - "As you can see in this exercise a DataFrame object comes with several methods that can be applied to the DataFrame. A method is similar to a function, but it can only be applied to the object it belongs to and has a different notation than a function.\n", - "\n", - "Compare the notation of the function `len`: `len(surveys_df)` \n", - "with the DataFrame specific method `shape`: `surveys_df.shape`\n", - ":::" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -356,12 +333,24 @@ "metadata": {}, "source": [ "
\n", - "Exercise 2\n", + "Exercises 0 and 1\n", " \n", - "Now go to the Jupyter Dashboard in your internet browser and continue with exercise 2.\n", + "Now go to the Jupyter Dashboard in your internet browser and continue with exercises 0 and 1.\n", "
" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.callout-note}\n", + "As you can see in this exercise a DataFrame object comes with several methods that can be applied to the DataFrame. A method is similar to a function, but it can only be applied to the object it belongs to and has a different notation than a function.\n", + "\n", + "Compare the notation of the function `len`: `len(surveys_df)` \n", + "with the DataFrame specific method `shape`: `surveys_df.shape`\n", + ":::" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -439,7 +428,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.8.10" }, "vscode": { "interpreter": { diff --git a/book/data-science-with-pandas-2.ipynb b/book/data-science-with-pandas-2.ipynb index 3aa722a..6f5bc4e 100644 --- a/book/data-science-with-pandas-2.ipynb +++ b/book/data-science-with-pandas-2.ipynb @@ -353,9 +353,9 @@ "metadata": {}, "source": [ "
\n", - "Exercise 3 to 5\n", + "Exercise 2 to 4\n", " \n", - "Now go to the Jupyter Dashboard in your internet browser and continue with the afternoon exercises 3 to 5." + "Now go to the Jupyter Dashboard in your internet browser and continue with the afternoon exercises 2 to 4." ] }, { @@ -669,9 +669,9 @@ "metadata": {}, "source": [ "
\n", - "Exercise 6 and 7\n", + "Exercise 5 and 6\n", "\n", - "Now go to the Jupyter Dashboard in your internet browser and continue with the afternoon exercises 6 and 7." + "Now go to the Jupyter Dashboard in your internet browser and continue with the afternoon exercises 5 and 6." ] }, { @@ -869,9 +869,9 @@ "metadata": {}, "source": [ "
\n", - "Exercise 8 to 10\n", + "Exercise 7 to 9\n", " \n", - "Now go to the Jupyter Dashboard in your internet browser and continue with the afternoon exercises 8 to 10." + "Now go to the Jupyter Dashboard in your internet browser and continue with the afternoon exercises 7 to 9." ] }, { @@ -899,7 +899,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/book/data-science-with-pandas-3.ipynb b/book/data-science-with-pandas-3.ipynb index e7400c7..d3bf853 100644 --- a/book/data-science-with-pandas-3.ipynb +++ b/book/data-science-with-pandas-3.ipynb @@ -193,17 +193,6 @@ "vertical_stack.reset_index()" ] }, - { - "cell_type": "markdown", - "id": "f1db04a6-8ae2-44e4-9798-adc1b108c9d8", - "metadata": {}, - "source": [ - "
\n", - "Exercise 11\n", - " \n", - "Now go to the Jupyter Dashboard in your internet browser and continue with exercise 11.\n" - ] - }, { "cell_type": "markdown", "id": "7bc415a6", @@ -315,9 +304,9 @@ "metadata": {}, "source": [ "
\n", - "Exercise 12\n", + "Exercise 10 and 11\n", " \n", - "Now go to the Jupyter Dashboard in your internet browser and continue with exercise 12.\n" + "Now go to the Jupyter Dashboard in your internet browser and continue with exercise 10 and 11.\n" ] }, { @@ -508,7 +497,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/book/data-science-with-pandas-4.ipynb b/book/data-science-with-pandas-4.ipynb index a4a9abe..99aced6 100644 --- a/book/data-science-with-pandas-4.ipynb +++ b/book/data-science-with-pandas-4.ipynb @@ -439,9 +439,9 @@ "metadata": {}, "source": [ "
\n", - "Exercise 13\n", + "Exercise 12\n", " \n", - "Now go to the Jupyter Dashboard in your internet browser and continue with exercise 13.\n", + "Now go to the Jupyter Dashboard in your internet browser and continue with exercise 12.\n", "
" ] }, @@ -488,9 +488,9 @@ "metadata": {}, "source": [ "
\n", - "Exercise 14\n", + "Exercise 13\n", " \n", - "Now go to the Jupyter Dashboard in your internet browser and continue with exercise 14.\n", + "Now go to the Jupyter Dashboard in your internet browser and continue with exercise 13.\n", "
" ] }, @@ -564,7 +564,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.8.10" }, "toc": { "base_numbering": 1, diff --git a/book/installation-and-setup.qmd b/book/installation-and-setup.qmd index 08be8bf..6a0ace9 100644 --- a/book/installation-and-setup.qmd +++ b/book/installation-and-setup.qmd @@ -72,7 +72,7 @@ version 9 and below, are not). 3. Move the downloaded zip to the folder where you want to store these course materials. 4. Unzip the zip file. -In your `python-workshop` you will see two folders called `data` and `solutions` and the following files: +In your `python-workshop` you will see a folder called `data` and the following files: ``` python-workshop @@ -82,11 +82,9 @@ python-workshop │ ├── species.csv │ ├── surveys.csv │ └── plots.csv -├── solutions -│ ├── morning_exercises_solutions.ipynb -│ └── afternoon_exercises_solutions.ipynb ├── morning_exercises.ipynb -└── afternoon_exercises.ipynb +├── afternoon_exercises.ipynb +└── empty_notebook_for_coding_along.ipynb ``` ## Launch Python interface @@ -139,9 +137,12 @@ Make your choice and click "Ok, and don't show again" button. 2. Find the "Notebook" tab and click on the "Launch" button. Anaconda will open a new browser window or tab with a Notebook Dashboard showing you the contents of your Home (or User) folder. -3. Navigate to the `python-workshop` directory by clicking on the directory names leading to it. -`Desktop` and then `python-workshop`: -4. Launch the notebook by clicking on the "New" button and then selecting "Python 3". +3. Navigate to the `data` directory by clicking on the directory names leading to it. +`Desktop`, `python-workshop`, then `data`: +4. Launch the notebook called `empty_notebook_for_coding_along.ipynb` by clicking on it. +5. Run the first code cell just below "Test installation" by clicking on it and then click on the 'play button'. 
+If the output of the cell displays 4 version numbers and the words "No errors! Ready to code!" instead of an error message, your installation is successful. If not, contact us at [RDM walk in hours][walk-in-hours] or reply to the welcome email. + ::: ### Option B: IPython interpreter @@ -176,7 +177,7 @@ This workshop will make use of the following Python packages: - `matplotlib` - `numpy` -Anaconda Navigator comes with these packages, so you are ready to go. If you are using another option to work with Python, you need to install these packages. If you need help with this, please ask the internet, your colleagues or us (see welcome email). +Anaconda Navigator comes with these packages, so you are ready to go. If you are using another option to work with Python, you need to install these packages. We then assume that you know how to install packages, otherwise we recommend using Anaconda instead. If for some reason you cannot work with Anaconda and you need help with installing packages, we are happy to help during the [RDM walk in hours](https://www.uu.nl/en/research/research-data-management/workshops/walk-in-hours-research-data-and-software), or email us by replying to the welcome email. 
## References {.unnumbered} diff --git a/book/solutions/afternoon_exercises_solutions.ipynb b/book/solutions/afternoon_exercises_solutions.ipynb new file mode 100644 index 0000000..7c53bbe --- /dev/null +++ b/book/solutions/afternoon_exercises_solutions.ipynb @@ -0,0 +1,1337 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "14f401b8-2d1d-4a01-a3c8-ed2c59e66d29", + "metadata": {}, + "source": [ + "---\n", + "format: html\n", + "execute: \n", + " enabled: true\n", + " error: true\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "67d94b6e", + "metadata": {}, + "source": [ + "# Afternoon Exercises: Working with data {.unnumbered}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2c6372a4", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "surveys_df = pd.read_csv('../../course_materials/data/surveys.csv') # in your notebook the path should be 'data/surveys.csv'" + ] + }, + { + "cell_type": "markdown", + "id": "507d1c2c-98b8-4a67-b2e5-b3a3031a5b5f", + "metadata": {}, + "source": [ + "### Exercise 0\n", + "\n", + "Type the following commands and check the outputs. Can you tell what each command does? What is the difference between commands with and without parenthesis?\n", + "\n", + "```python\n", + "surveys_df.shape # Answer: the dimensions of the dataframe\n", + "surveys_df.columns # Answer: the column names of the dataframe\n", + "surveys_df.index # Answer: the index (row labels) of the dataframe\n", + "surveys_df.dtypes # Answer: the data types of each column\n", + "surveys_df.head() # Answer: the first n rows of the dataframe\n", + "surveys_df.tail() # Answer: the last n rows of the dataframe\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "0514bad4-1266-4553-9752-29dcf3166d21", + "metadata": {}, + "source": [ + "### Exercise 1\n", + "Perform some basic statistics on the weight column. 
For practical reasons, it can be useful to first create a variable `weight` that contains just the weight column. It will make the code look a bit cleaner. Can you tell what each method listed below does? Look at our explorative plot, do the statistics make sense?\n", + "\n", + "```python\n", + "weight=surveys_df['weight'] # Answer: creates a new variable that contains the weight column\n", + "weight.min() # Answer: the minimum value of the weight column\n", + "weight.max() # Answer: the maximum value of the weight column\n", + "weight.mean() # Answer: the mean value of the weight column\n", + "weight.std() # Answer: the standard deviation of the weight column\n", + "weight.count() # Answer: the number of non-NaN values in the weight column\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "2bcb581f", + "metadata": {}, + "source": [ + "### Exercise 2\n", + "- Swap the order of column names in `surveys_df[['plot_id', 'species_id']]`\n", + "- Repeat one of the column names like `surveys_df[['plot_id', 'plot_id', 'species_id']]`.\n", + "What do the results look like and why? \n", + "\n", + "> Answer: the column names are repeated and the data is displayed twice. Column names do not have to be unique.\n", + "\n", + "- Which error occurs in `surveys_df['plot_id', 'species_id']` and why? \n", + "\n", + "> Answer: KeyError: ('plot_id', 'species_id'). The column names are not in a list. We need double square brackets to select multiple columns.\n", + "\n", + "- Which error occurs in `surveys_df['speciess']`? \n", + "\n", + "> Answer: KeyError: 'speciess'. The column name does not exist. Typo." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3731f5fd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " species_id plot_id\n", + "0 NL 2\n", + "1 NL 3\n", + "2 DM 2\n", + "3 DM 7\n", + "4 DM 3\n", + "... ... 
...\n", + "35544 AH 15\n", + "35545 AH 15\n", + "35546 RM 10\n", + "35547 DO 7\n", + "35548 NaN 5\n", + "\n", + "[35549 rows x 2 columns]\n" + ] + } + ], + "source": [ + "print(surveys_df[['species_id', 'plot_id']])" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b71788b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
plot_idplot_idspecies_id
022NL
133NL
222DM
377DM
433DM
............
355441515AH
355451515AH
355461010RM
3554777DO
3554855NaN
\n", + "

35549 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " plot_id plot_id species_id\n", + "0 2 2 NL\n", + "1 3 3 NL\n", + "2 2 2 DM\n", + "3 7 7 DM\n", + "4 3 3 DM\n", + "... ... ... ...\n", + "35544 15 15 AH\n", + "35545 15 15 AH\n", + "35546 10 10 RM\n", + "35547 7 7 DO\n", + "35548 5 5 NaN\n", + "\n", + "[35549 rows x 3 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "surveys_df[['plot_id', 'plot_id', 'species_id']]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "bb68a995", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unexpected exception formatting exception. Falling back to standard exception\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py\", line 3802, in get_loc\n", + " return self._engine.get_loc(casted_key)\n", + " File \"pandas/_libs/index.pyx\", line 138, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/index.pyx\", line 165, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: ('plot_id', 'species_id')\n", + "\n", + "The above exception was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 3460, in run_code\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n", + " File \"/tmp/ipykernel_16478/997369930.py\", line 1, in \n", + " surveys_df['plot_id', 'species_id']\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/frame.py\", line 3807, in 
__getitem__\n", + " indexer = self.columns.get_loc(key)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py\", line 3804, in get_loc\n", + " raise KeyError(key) from err\n", + "KeyError: ('plot_id', 'species_id')\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 2057, in showtraceback\n", + " stb = self.InteractiveTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1288, in structured_traceback\n", + " return FormattedTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1177, in structured_traceback\n", + " return VerboseTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1030, in structured_traceback\n", + " formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 935, in format_exception_as_a_whole\n", + " self.get_records(etb, number_of_lines_of_context, tb_offset) if etb else []\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 987, in get_records\n", + " style = stack_data.style_with_executing_node(style, self._tb_highlight)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/stack_data/core.py\", line 455, in style_with_executing_node\n", + " class NewStyle(style):\n", + " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 91, in __new__\n", + " ndef[4] = colorformat(styledef[3:])\n", + " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 58, in colorformat\n", + " assert False, \"wrong color format %r\" % text\n", + 
"AssertionError: wrong color format 'ansiyellow'\n" + ] + } + ], + "source": [ + "surveys_df['plot_id', 'species_id'] " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b74edc72", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unexpected exception formatting exception. Falling back to standard exception\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py\", line 3802, in get_loc\n", + " return self._engine.get_loc(casted_key)\n", + " File \"pandas/_libs/index.pyx\", line 138, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/index.pyx\", line 165, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: 'speciess'\n", + "\n", + "The above exception was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 3460, in run_code\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n", + " File \"/tmp/ipykernel_16478/158671928.py\", line 1, in \n", + " surveys_df['speciess']\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/frame.py\", line 3807, in __getitem__\n", + " indexer = self.columns.get_loc(key)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py\", line 3804, in get_loc\n", + " raise KeyError(key) from err\n", + "KeyError: 'speciess'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " 
File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 2057, in showtraceback\n", + " stb = self.InteractiveTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1288, in structured_traceback\n", + " return FormattedTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1177, in structured_traceback\n", + " return VerboseTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1030, in structured_traceback\n", + " formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 935, in format_exception_as_a_whole\n", + " self.get_records(etb, number_of_lines_of_context, tb_offset) if etb else []\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 987, in get_records\n", + " style = stack_data.style_with_executing_node(style, self._tb_highlight)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/stack_data/core.py\", line 455, in style_with_executing_node\n", + " class NewStyle(style):\n", + " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 91, in __new__\n", + " ndef[4] = colorformat(styledef[3:])\n", + " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 58, in colorformat\n", + " assert False, \"wrong color format %r\" % text\n", + "AssertionError: wrong color format 'ansiyellow'\n" + ] + } + ], + "source": [ + "surveys_df['speciess']" + ] + }, + { + "cell_type": "markdown", + "id": "119b576b", + "metadata": {}, + "source": [ + "### Exercise 3\n", + "What happens when you call:\n", + "\n", + "- `surveys_df[0:1]` Answer: shows the first row of the dataframe\n", + "- `surveys_df[:4]` Answer: shows the first 4 rows of the 
dataframe from index 0 to index 3\n", + "- `surveys_df[:-1]` Answer: shows all rows of the dataframe except the last row" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bd3b4c74", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
record_idmonthdayyearplot_idspecies_idsexhindfoot_lengthweight
0171619772NLM32.0NaN
1271619773NLM33.0NaN
2371619772DMF37.0NaN
3471619777DMM36.0NaN
4571619773DMM35.0NaN
..............................
35543355441231200215USNaNNaNNaN
35544355451231200215AHNaNNaNNaN
35545355461231200215AHNaNNaNNaN
35546355471231200210RMF15.014.0
3554735548123120027DOM36.051.0
\n", + "

35548 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " record_id month day year plot_id species_id sex hindfoot_length \\\n", + "0 1 7 16 1977 2 NL M 32.0 \n", + "1 2 7 16 1977 3 NL M 33.0 \n", + "2 3 7 16 1977 2 DM F 37.0 \n", + "3 4 7 16 1977 7 DM M 36.0 \n", + "4 5 7 16 1977 3 DM M 35.0 \n", + "... ... ... ... ... ... ... ... ... \n", + "35543 35544 12 31 2002 15 US NaN NaN \n", + "35544 35545 12 31 2002 15 AH NaN NaN \n", + "35545 35546 12 31 2002 15 AH NaN NaN \n", + "35546 35547 12 31 2002 10 RM F 15.0 \n", + "35547 35548 12 31 2002 7 DO M 36.0 \n", + "\n", + " weight \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "35543 NaN \n", + "35544 NaN \n", + "35545 NaN \n", + "35546 14.0 \n", + "35547 51.0 \n", + "\n", + "[35548 rows x 9 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "surveys_df[0:1]\n", + "surveys_df[:4] \n", + "surveys_df[:-1] " + ] + }, + { + "cell_type": "markdown", + "id": "4fc54c12", + "metadata": {}, + "source": [ + "### Exercise 4\n", + "What happens in the following two examples?\n", + "\n", + "- ```surveys_df.iloc[0:4, 1:4]```;\n", + "- ```surveys_df.loc[0:4, 1:4]```.\n", + "\n", + "How are the two commands different? \n", + "\n", + "> Answer: iloc uses integer indices, loc uses labels. The first command will return the first 4 rows and the columns with integer indices 1, 2 and 3. The second command will return the first 4 rows and the columns with labels 1, 2 and 3. Since there are no columns with these labels, the second command will return an error." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6680a6fe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " month day year\n", + "0 7 16 1977\n", + "1 7 16 1977\n", + "2 7 16 1977\n", + "3 7 16 1977\n", + "Unexpected exception formatting exception. 
Falling back to standard exception\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 3460, in run_code\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n", + " File \"/tmp/ipykernel_16478/1314140639.py\", line 2, in \n", + " surveys_df.loc[0:4, 1:4] # the function loc works with indices for rows (0:4),\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexing.py\", line 1067, in __getitem__\n", + " return self._getitem_tuple(key)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexing.py\", line 1256, in _getitem_tuple\n", + " return self._getitem_tuple_same_dim(tup)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexing.py\", line 924, in _getitem_tuple_same_dim\n", + " retval = getattr(retval, self.name)._getitem_axis(key, axis=i)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexing.py\", line 1290, in _getitem_axis\n", + " return self._get_slice_axis(key, axis=axis)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexing.py\", line 1324, in _get_slice_axis\n", + " indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, slice_obj.step)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py\", line 6559, in slice_indexer\n", + " start_slice, end_slice = self.slice_locs(start, end, step=step)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py\", line 6767, in slice_locs\n", + " start_slice = self.get_slice_bound(start, \"left\")\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py\", line 6676, in get_slice_bound\n", + " label = self._maybe_cast_slice_bound(label, side)\n", + " File 
\"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py\", line 6623, in _maybe_cast_slice_bound\n", + " raise self._invalid_indexer(\"slice\", label)\n", + "TypeError: cannot do slice indexing on Index with these indexers [1] of type int\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 2057, in showtraceback\n", + " stb = self.InteractiveTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1288, in structured_traceback\n", + " return FormattedTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1177, in structured_traceback\n", + " return VerboseTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1030, in structured_traceback\n", + " formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 935, in format_exception_as_a_whole\n", + " self.get_records(etb, number_of_lines_of_context, tb_offset) if etb else []\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 987, in get_records\n", + " style = stack_data.style_with_executing_node(style, self._tb_highlight)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/stack_data/core.py\", line 455, in style_with_executing_node\n", + " class NewStyle(style):\n", + " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 91, in __new__\n", + " ndef[4] = colorformat(styledef[3:])\n", + " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 58, in colorformat\n", + " assert False, \"wrong color format %r\" % text\n", + 
"AssertionError: wrong color format 'ansiyellow'\n" + ] + } + ], + "source": [ + "print(surveys_df.iloc[0:4, 1:4])\n", + "surveys_df.loc[0:4, 1:4] " + ] + }, + { + "cell_type": "markdown", + "id": "7b12cdaf", + "metadata": {}, + "source": [ + "### Exercise 5\n", + "- Create a new DataFrame that only contains observations from the original with sex values that are not female or male. Print the number of rows in this new DataFrame. Verify the result by comparing the number of rows in the new DataFrame with the number of rows in the surveys DataFrame where sex is NaN (hint: there is a function `isnull`)." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "53f7777a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of rows: 2511\n", + "Unique values in column 'sex': [nan]\n" + ] + } + ], + "source": [ + "df = surveys_df[(surveys_df['sex'] != 'M') & (surveys_df['sex'] != 'F')]\n", + "print(\"Number of rows not female or male:\", len(df))\n", + "print(\"Number of rows NaN:\", len(surveys_df['sex'].isnull()))\n", + "print(\"Unique values in column 'sex':\", df['sex'].unique())" + ] + }, + { + "cell_type": "markdown", + "id": "b34b321b", + "metadata": {}, + "source": [ + "### Exercise 6: Putting it all together \n", + "1. Clean the column *sex* (leave out samples of which we do not know whether they are male or female) and save the result as a new dataframe `clean_df`.\n", + "2. Fill undefined *weight* values with the mean of all valid weights in `surveys_df`.\n", + "3. Calculate the average weight of that new DataFrame `clean_df`" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "570cbd6c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average weight of surveys_df: 42.672428212991356\n", + "Average weight of clean_df: 42.60316325896464\n" + ] + } + ], + "source": [ + "# Step 1\n", + "# sex is 'F' or 'M'. 
The `|` means or.\n", + "clean_df = surveys_df[(surveys_df['sex']=='F') | (surveys_df['sex']=='M')]\n", + "# Alternative solution: select columns where 'not' sex is null. The `~` means not.\n", + "clean_df = surveys_df[~(surveys_df['sex'].isnull())]\n", + "\n", + "# Step 2\n", + "clean_df.weight.fillna(surveys_df.weight.mean())\n", + "\n", + "# Step 3\n", + "print(\"Average weight of surveys_df:\", surveys_df.weight.mean())\n", + "print(\"Average weight of clean_df:\", clean_df.weight.mean())" + ] + }, + { + "cell_type": "markdown", + "id": "ccb33c2e", + "metadata": {}, + "source": [ + "### Exercise 7\n", + "Let's see in which plots animals get more food. Calculate the average weight per plot! Complete the code below." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "733aba46", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "plot_id\n", + "1 51.822911\n", + "2 52.251688\n", + "3 32.654386\n", + "4 47.928189\n", + "5 40.947802\n", + "6 36.738893\n", + "7 20.663009\n", + "8 47.758001\n", + "9 51.432358\n", + "10 18.541219\n", + "11 43.451757\n", + "12 49.496169\n", + "13 40.445660\n", + "14 46.277199\n", + "15 27.042578\n", + "16 24.585417\n", + "17 47.889593\n", + "18 40.005922\n", + "19 21.105166\n", + "20 48.665303\n", + "21 24.627794\n", + "22 54.146379\n", + "23 19.634146\n", + "24 43.679167\n", + "Name: weight, dtype: float64" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped_data = surveys_df.groupby(\"plot_id\")\n", + "grouped_data['weight'].mean()" + ] + }, + { + "cell_type": "markdown", + "id": "2bccb9da", + "metadata": {}, + "source": [ + "### Exercise 8\n", + "See below a more complex grouping example. Investigate the group keys and row indexes for this more complex grouping example. \n", + "Why are there more than 48 groups? 
Answer: nan values are not ignored when grouping.\n", + "Calculate the average weight per group.\n", + "What happened to the third group and why does it not turn up in our statistics? Answer: the third group contains only nan values and is therefore not included in the statistics." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ee674ae9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "72\n" + ] + }, + { + "data": { + "text/plain": [ + "dict_keys([('F', 1), ('F', 2), ('F', 3), ('F', 4), ('F', 5), ('F', 6), ('F', 7), ('F', 8), ('F', 9), ('F', 10), ('F', 11), ('F', 12), ('F', 13), ('F', 14), ('F', 15), ('F', 16), ('F', 17), ('F', 18), ('F', 19), ('F', 20), ('F', 21), ('F', 22), ('F', 23), ('F', 24), ('M', 1), ('M', 2), ('M', 3), ('M', 4), ('M', 5), ('M', 6), ('M', 7), ('M', 8), ('M', 9), ('M', 10), ('M', 11), ('M', 12), ('M', 13), ('M', 14), ('M', 15), ('M', 16), ('M', 17), ('M', 18), ('M', 19), ('M', 20), ('M', 21), ('M', 22), ('M', 23), ('M', 24), (nan, 1), (nan, 2), (nan, 3), (nan, 4), (nan, 5), (nan, 6), (nan, 7), (nan, 8), (nan, 9), (nan, 10), (nan, 11), (nan, 12), (nan, 13), (nan, 14), (nan, 15), (nan, 16), (nan, 17), (nan, 18), (nan, 19), (nan, 20), (nan, 21), (nan, 22), (nan, 23), (nan, 24)])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped_data = surveys_df.groupby(['sex', 'plot_id'])\n", + "print(len(grouped_data.groups))\n", + "grouped_data.groups.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3d9b5ba0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sex plot_id\n", + "F 1 46.311138\n", + " 2 52.561845\n", + " 3 31.215349\n", + " 4 46.818824\n", + " 5 40.974806\n", + " 6 36.352288\n", + " 7 20.006135\n", + " 8 45.623011\n", + " 9 53.618469\n", + " 10 17.094203\n", + " 11 43.515075\n", + " 12 49.831731\n", + " 13 40.524590\n", + " 14 47.355491\n", + " 15 
26.670236\n", + " 16 25.810427\n", + " 17 48.176201\n", + " 18 36.963514\n", + " 19 21.978599\n", + " 20 52.624406\n", + " 21 25.974832\n", + " 22 53.647059\n", + " 23 20.564417\n", + " 24 47.914405\n", + "M 1 55.950560\n", + " 2 51.391382\n", + " 3 34.163241\n", + " 4 48.888119\n", + " 5 40.708551\n", + " 6 36.867388\n", + " 7 21.194719\n", + " 8 49.641372\n", + " 9 49.519309\n", + " 10 19.971223\n", + " 11 43.366197\n", + " 12 48.909710\n", + " 13 40.097754\n", + " 14 45.159378\n", + " 15 27.523691\n", + " 16 23.811321\n", + " 17 47.558853\n", + " 18 43.546952\n", + " 19 20.306878\n", + " 20 44.197279\n", + " 21 22.772622\n", + " 22 54.572531\n", + " 23 18.941463\n", + " 24 39.321503\n", + "Name: weight, dtype: float64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped_data['weight'].mean()" + ] + }, + { + "cell_type": "markdown", + "id": "2336ac85", + "metadata": {}, + "source": [] + }, + { + "cell_type": "markdown", + "id": "b0f1ab75", + "metadata": {}, + "source": [ + "### Exercise 9\n", + "Would it make sense to group our data frame by the column *weight*? Why or why not?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbff11b3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of rows: 35549\n", + "256\n", + "255\n" + ] + } + ], + "source": [ + "# In real life nearly every sample has a unique value. So nearly every sample would \n", + "# be placed in an own group.\n", + "# In our training data you can see that there are quite some values for weight. 
So\n", + "# usually it is not a good idea to categorise (group) data on such values.\n", + "print(\"Number of rows:\", len(surveys_df))\n", + "print(len(surveys_df['weight'].unique())) #includes nan\n", + "print(len(surveys_df.groupby(['weight']).groups)) #does not include nan" + ] + }, + { + "cell_type": "markdown", + "id": "79ae54cf", + "metadata": {}, + "source": [ + "### Exercise 10\n", + "In the given example of vertical concatenation, you concatenated two DataFrames with the same columns. What would happen if the two DataFrames you concatenate have a different number of columns and different column names?\n", + "\n", + " 1. Create a new DataFrame using the last 10 rows of the species DataFrame (`species_df`);\n", + " 2. Concatenate vertically `surveys_df_sub_first10` and the DataFrame you just created;\n", + " 3. Print the concatenated DataFrame info on the screen. How many rows does it have? What happened to the columns? Explain why you get this result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18b13126", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unexpected exception formatting exception. 
Falling back to standard exception\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 3460, in run_code\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n", + " File \"/tmp/ipykernel_16478/1143081575.py\", line 1, in \n", + " species_df = pd.read_csv(\"../data/species.csv\")\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/util/_decorators.py\", line 211, in wrapper\n", + " return func(*args, **kwargs)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/util/_decorators.py\", line 331, in wrapper\n", + " return func(*args, **kwargs)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py\", line 950, in read_csv\n", + " return _read(filepath_or_buffer, kwds)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py\", line 605, in _read\n", + " parser = TextFileReader(filepath_or_buffer, **kwds)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py\", line 1442, in __init__\n", + " self._engine = self._make_engine(f, self.engine)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py\", line 1735, in _make_engine\n", + " self.handles = get_handle(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/io/common.py\", line 856, in get_handle\n", + " handle = open(\n", + "FileNotFoundError: [Errno 2] No such file or directory: '../data/species.csv'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 2057, in showtraceback\n", + " stb = self.InteractiveTB.structured_traceback(\n", + " File 
\"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1288, in structured_traceback\n", + " return FormattedTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1177, in structured_traceback\n", + " return VerboseTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1030, in structured_traceback\n", + " formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 935, in format_exception_as_a_whole\n", + " self.get_records(etb, number_of_lines_of_context, tb_offset) if etb else []\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 987, in get_records\n", + " style = stack_data.style_with_executing_node(style, self._tb_highlight)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/stack_data/core.py\", line 455, in style_with_executing_node\n", + " class NewStyle(style):\n", + " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 91, in __new__\n", + " ndef[4] = colorformat(styledef[3:])\n", + " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 58, in colorformat\n", + " assert False, \"wrong color format %r\" % text\n", + "AssertionError: wrong color format 'ansiyellow'\n" + ] + } + ], + "source": [ + "species_df = pd.read_csv(\"../../course_materials/data/species.csv\")\n", + "species_df_sub_last10 = species_df.tail(10)\n", + "\n", + "surveys_df_sub_first10 = surveys_df.head(10)\n", + "vert_concat = pd.concat([surveys_df_sub_first10, species_df_sub_last10], axis=0)\n", + "\n", + "vert_concat" + ] + }, + { + "cell_type": "markdown", + "id": "aa6d4ee8", + "metadata": {}, + "source": [ + "We get a total of 20 rows and 12 columns. The original dataframes together had a total of 13 columns. 
As they both have a column `species_id`, this one is collapsed. All other columns are padded with `NaN` values.\n", + "We expect 20 rows, as we are putting two DataFrames of 10 rows one after the other. The padding of the columns happens because these two DataFrames do not have the same column names. To keep all the information that was in the original DataFrames, the padding of columns that occur in only one of the two is necessary." + ] + }, + { + "cell_type": "markdown", + "id": "9478026d", + "metadata": {}, + "source": [ + "### Exercise 11\n", + "In the given example of horizontal concatenation, you first concatenated two DataFrames with different indices, then reset the indices of the second one. Based on the outcome of these two cases, try to answer the following questions:\n", + " 1. What happens when you concatenate horizontally two DataFrames with different indexing?\n", + " 2. What happens when you concatenate horizontally two DataFrames with the same columns?\n", + " 3. What happens when you try to select a column of the `horizontal_stack` DataFrame we just created?\n", + " 4. How can you select a specific column, when multiple columns share a name?" + ] + }, + { + "cell_type": "markdown", + "id": "b5b01c88", + "metadata": {}, + "source": [ + "1. The columns of both DataFrames are kept, duplicates are not merged, and the rows of both DataFrames are kept as well, no merging there either. This results in the two original DataFrames appearing in a checkerboard pattern in the resulting DataFrame, with the empty spaces padded out with `NaN` values.\n", + "2. Columns still are not merged, but now, rows with a common index are merged. This means that the information of both rows is put into a single row in the resulting DataFrame. If no corresponding row exists in the other DataFrame, the row is still padded with `NaN`'s in the result. But if a corresponding row (with the same index) *does* exist, this is no longer necessary."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0faee1fd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unexpected exception formatting exception. Falling back to standard exception\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 3460, in run_code\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n", + " File \"/tmp/ipykernel_16478/3936511566.py\", line 4, in \n", + " horizontal_stack = pd.concat([surveys_df_sub_first10, surveys_df_sub_last10], axis=1)\n", + "NameError: name 'surveys_df_sub_first10' is not defined\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 2057, in showtraceback\n", + " stb = self.InteractiveTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1288, in structured_traceback\n", + " return FormattedTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1177, in structured_traceback\n", + " return VerboseTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1030, in structured_traceback\n", + " formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 935, in format_exception_as_a_whole\n", + " self.get_records(etb, number_of_lines_of_context, tb_offset) if etb else []\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 987, in get_records\n", 
+ " style = stack_data.style_with_executing_node(style, self._tb_highlight)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/stack_data/core.py\", line 455, in style_with_executing_node\n", + " class NewStyle(style):\n", + " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 91, in __new__\n", + " ndef[4] = colorformat(styledef[3:])\n", + " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 58, in colorformat\n", + " assert False, \"wrong color format %r\" % text\n", + "AssertionError: wrong color format 'ansiyellow'\n" + ] + } + ], + "source": [ + "# 3.\n", + "surveys_df_sub_last10 = surveys_df.tail(10)\n", + "surveys_df_sub_last10 = surveys_df_sub_last10.reset_index(drop=True)\n", + "horizontal_stack = pd.concat([surveys_df_sub_first10, surveys_df_sub_last10], axis=1)\n", + "horizontal_stack['species_id']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f860304", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unexpected exception formatting exception. 
Falling back to standard exception\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 3460, in run_code\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n", + " File \"/tmp/ipykernel_16478/4095435365.py\", line 2, in \n", + " horizontal_stack.iloc[:,5]\n", + "NameError: name 'horizontal_stack' is not defined\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 2057, in showtraceback\n", + " stb = self.InteractiveTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1288, in structured_traceback\n", + " return FormattedTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1177, in structured_traceback\n", + " return VerboseTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1030, in structured_traceback\n", + " formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 935, in format_exception_as_a_whole\n", + " self.get_records(etb, number_of_lines_of_context, tb_offset) if etb else []\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 987, in get_records\n", + " style = stack_data.style_with_executing_node(style, self._tb_highlight)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/stack_data/core.py\", line 455, in style_with_executing_node\n", + " class NewStyle(style):\n", + " File 
\"/usr/lib/python3/dist-packages/pygments/style.py\", line 91, in __new__\n", + " ndef[4] = colorformat(styledef[3:])\n", + " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 58, in colorformat\n", + " assert False, \"wrong color format %r\" % text\n", + "AssertionError: wrong color format 'ansiyellow'\n" + ] + } + ], + "source": [ + "# 4.\n", + "horizontal_stack.iloc[:,5]" + ] + }, + { + "cell_type": "markdown", + "id": "d6382b5f-9c13-4ceb-88f6-f7ba13f797f7", + "metadata": { + "tags": [] + }, + "source": [ + "### Exercise 12\n", + "Time to play with plots! Look at the pandas.DataFrame.plot() documentation [here](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.html) and change your data visualization swapping the DataFrame columns, and changing the axes labels.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1896c89b-f2c2-41ae-9a18-9b9fb8a87648", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 0.98, 'Scatter plot of weight versus hindfoot length')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig, ax1 = plt.subplots() # prepare a matplotlib figure\n", + "\n", + "surveys_df.plot(\"hindfoot_length\", \"weight\", kind=\"scatter\", ax=ax1)\n", + "\n", + "# Provide further adaptations with matplotlib:\n", + "ax1.set_xlabel(\"Hindfoot length\")\n", + "ax1.tick_params(labelsize=16, pad=8)\n", + "fig.suptitle('Scatter plot of weight versus hindfoot length', fontsize=15)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "339fb652", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 0.98, 'Scatter plot of hindfood length versus weight')" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig, ax1 = plt.subplots() # prepare a matplotlib figure\n", + "\n", + "surveys_df.plot(\"weight\",\"hindfoot_length\", kind=\"scatter\", ax=ax1)\n", + "\n", + "# Provide further adaptations with matplotlib:\n", + "ax1.set_xlabel(\"weight\")\n", + "ax1.tick_params(labelsize=16, pad=8)\n", + "fig.suptitle('Scatter plot of hindfood length versus weight', fontsize=15)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/course_materials/solutions/morning_exercises_solutions.ipynb b/book/solutions/morning_exercises_solutions.ipynb similarity index 62% rename from course_materials/solutions/morning_exercises_solutions.ipynb rename to book/solutions/morning_exercises_solutions.ipynb index 09a4d90..0da205b 100644 --- a/course_materials/solutions/morning_exercises_solutions.ipynb +++ b/book/solutions/morning_exercises_solutions.ipynb @@ -5,13 +5,21 @@ "id": "cac5b58d", "metadata": {}, "source": [ - "#### ---\n", - "title: \"Exercises\"\n", + "---\n", "execute: \n", " enabled: true\n", + " error: true\n", "---" ] }, + { + "cell_type": "markdown", + "id": "f2eba16f", + "metadata": {}, + "source": [ + "# Morning Exercises: Python fundamentals {.unnumbered}" + ] + }, { "cell_type": "markdown", "id": "74ecb0e3-061f-4318-b0c1-eb7eea7b76a8", @@ -24,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "5b2453e4-d68b-49ad-85b1-1d28e94e54a7", "metadata": {}, "outputs": [], @@ -51,10 +59,19 @@ }, { "cell_type": "code", 
- "execution_count": null, + "execution_count": 2, "id": "7463ed46-b7c5-4b51-b34a-7c5f3c478283", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "the value of x is 6\n", + "the value of apple is apple\n" + ] + } + ], "source": [ "x = 6\n", "apple = \"apple\"\n", @@ -75,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "f6be831f-656e-4eba-96ec-6f8703840617", "metadata": {}, "outputs": [], @@ -85,20 +102,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "27da351a-5246-458f-8391-3ef325ee2f20", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "s > 1" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "76ff08d7-e30e-4b2e-b80f-f3e90598b840", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1.6" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "round(s, 1)" ] @@ -120,10 +159,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "c3e04e72-a4e7-4599-a8d4-759e4031d7a1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "False\n", + "False\n", + "True\n" + ] + } + ], "source": [ "print(5 == 5)\n", "print(not 3 > 2)\n", @@ -146,30 +196,89 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "20427367-d74a-4e22-9a64-975d46a966cc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2\n" + ] + } + ], "source": [ "print(1 + 1)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "35bfe887", "metadata": {}, - "outputs": [], + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2\n" + ] + } + ], "source": [ "print(1 + True)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "cd1f883f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unexpected exception formatting exception. Falling back to standard exception\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 3460, in run_code\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n", + " File \"/tmp/ipykernel_18646/2622355716.py\", line 1, in \n", + " print(1 + \"one\")\n", + "TypeError: unsupported operand type(s) for +: 'int' and 'str'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 2057, in showtraceback\n", + " stb = self.InteractiveTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1288, in structured_traceback\n", + " return FormattedTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1177, in structured_traceback\n", + " return VerboseTB.structured_traceback(\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1030, in structured_traceback\n", + " formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 935, in format_exception_as_a_whole\n", + " self.get_records(etb, number_of_lines_of_context, tb_offset) if etb else []\n", + " File 
\"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 987, in get_records\n", + " style = stack_data.style_with_executing_node(style, self._tb_highlight)\n", + " File \"/home/jelle/.local/lib/python3.8/site-packages/stack_data/core.py\", line 455, in style_with_executing_node\n", + " class NewStyle(style):\n", + " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 91, in __new__\n", + " ndef[4] = colorformat(styledef[3:])\n", + " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 58, in colorformat\n", + " assert False, \"wrong color format %r\" % text\n", + "AssertionError: wrong color format 'ansiyellow'\n" + ] + } + ], "source": [ "print(1 + \"one\")" ] @@ -198,10 +307,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "6d12ffce-0492-4b91-88b6-d657bbeffe2f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "average age is 29.75\n" + ] + } + ], "source": [ "name = ['Ann', 'Bob', 'Chloe', 'Dan']\n", "age = [25, 28, 36, 30]\n", @@ -223,30 +340,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "2b5a51a0-3c0d-477c-996c-4baf28712025", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "25" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "age[0]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "f70eb682-14dd-422e-8073-7f18b81c5c3f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['Bob', 'Dan']" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "[name[1], name[3]]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "a09e1dce-23be-4de5-9e70-6b7d616efed4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + 
"text/plain": [ + "[25, 28]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "new_age=[]\n", "for a in age:\n", @@ -263,10 +413,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "0851e7fc-ce66-449a-a9a6-25f4df4363f6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "36" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "age[name.index(\"Chloe\")]" ] @@ -293,7 +454,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "c52f771b-6638-4eee-b55e-e1a643ca9508", "metadata": {}, "outputs": [], @@ -303,17 +464,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "c95ea81a-8281-4951-b13e-562a1b713ae5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dict_values(['Jill', 39, 'Netherlands'])" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "my_dict.values()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "a87bf15a-08c2-4469-ad4d-593bee5593cb", "metadata": {}, "outputs": [], @@ -323,10 +495,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "a8ba22bd-9874-421d-903d-2ea1a56ab37c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dict_values(['Jill', 24, 'Netherlands'])" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "my_dict.values()" ] @@ -343,10 +526,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "dc3bc782-bc32-4314-b1d9-87836eefc2ae", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "odd\n" + ] + } + ], "source": [ "number = 5\n", "\n", @@ -369,7 
+560,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "204e89ac-af75-4c99-8caf-cf591b7d037f", "metadata": {}, "outputs": [], @@ -384,10 +575,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "577e90fd-5fe7-4104-a879-1c138e7bd041", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5 is odd\n" + ] + } + ], "source": [ "# Run this code to test your function above\n", "number = 5\n", @@ -406,10 +605,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "47cd7a85-97c6-4389-8cc3-995e5dc7c33a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 is odd\n", + "2 is even\n", + "3 is odd\n", + "4 is even\n", + "5 is odd\n", + "6 is even\n", + "7 is odd\n", + "8 is even\n", + "9 is odd\n" + ] + } + ], "source": [ "for number in range(1,10,1):\n", " print(number, \"is\", even_or_odd(number))" @@ -432,11 +647,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", -<<<<<<< HEAD - "version": "3.10.8" -======= "version": "3.10.11" ->>>>>>> main } }, "nbformat": 4, diff --git a/course_materials/afternoon_exercises.ipynb b/course_materials/afternoon_exercises.ipynb index 6d07f60..bffcb9f 100644 --- a/course_materials/afternoon_exercises.ipynb +++ b/course_materials/afternoon_exercises.ipynb @@ -8,8 +8,6 @@ "---\n", "title: \"Working with data\"\n", "format: html\n", - "execute: \n", - " enabled: true\n", "---" ] }, @@ -29,18 +27,18 @@ "id": "fc1fdccb-ac7f-4667-97e9-5c63499693c8", "metadata": {}, "source": [ - "### Exercise 1\n", + "### Exercise 0\n", "\n", "Type the following commands and check the outputs. Can you tell what each command does? 
What is the difference between commands with and without parenthesis?\n", "\n", - "```python\n", - "surveys_df.shape\n", - "surveys_df.columns\n", - "surveys_df.index\n", - "surveys_df.dtypes\n", - "surveys_df.head()\n", - "surveys_df.tail()\n", - "```" + "`surveys_df.shape` Answer:\n", + "`surveys_df.columns` Answer:\n", + "`surveys_df.index` Answer:\n", + "`surveys_df.dtypes` Answer:\n", + "`surveys_df.head()` Answer:\n", + "`surveys_df.tail()` Answer:\n", + "\n", + "[Course book chapter 5 for reference](https://utrechtuniversity.github.io/workshop-introduction-to-python/data-science-with-pandas-1.html)" ] }, { @@ -54,7 +52,7 @@ { "cell_type": "code", "execution_count": null, - "id": "95c72b85-8d26-43d8-86f8-09030c0b0c5c", + "id": "18f70271-17df-48c4-9ff5-9458584da7ef", "metadata": {}, "outputs": [], "source": [] @@ -62,7 +60,7 @@ { "cell_type": "code", "execution_count": null, - "id": "18f70271-17df-48c4-9ff5-9458584da7ef", + "id": "230f8bde", "metadata": {}, "outputs": [], "source": [] @@ -96,7 +94,7 @@ "id": "763cd8cc-6fc0-4f55-b1f5-aaf2d35b03ba", "metadata": {}, "source": [ - "### Exercise 2\n", + "### Exercise 1\n", "Perform some basic statistics on the weight column. For practical reasons, it can be useful to first create a variable `weight` that contains the just the weight column. It will make the code look a bit cleaner. Can you tell what each method listed below does? 
Look at our explorative plot, do the statistics make sense?\n", "\n", "```python\n", @@ -106,7 +104,9 @@ "weight.mean()\n", "weight.std()\n", "weight.counts()\n", - "```" + "```\n", + "\n", + "[Course book chapter 5 for reference](https://utrechtuniversity.github.io/workshop-introduction-to-python/data-science-with-pandas-1.html)" ] }, { @@ -162,12 +162,12 @@ "id": "2bcb581f", "metadata": {}, "source": [ - "### Exercise 3\n", - "- Swap the order of column names in `surveys_df[['plot_id', 'species_id']]`\n", - "- Repeat one of the column names like `surveys_df[['plot_id', 'plot_id', 'species_id']]`\n", - "What do the results look like and why?\n", - "- Which error occurrs in `surveys_df['plot_id', 'species_id']` and why?\n", - "- Which error occurrs in `surveys_df['speciess']`?" + "### Exercise 2\n", + "- Print the columns 'species_id' and 'plot_id' side by side\n", + "- Do the same, but use one of these columns twice.\n", + "What do the results look like and why? Answer: \n", + "- Which error occurs in `surveys_df['plot_id', 'species_id']` and why? Answer:\n", + "- Which error occurs in `surveys_df['speciess']`? Answer:" ] }, { @@ -178,27 +178,62 @@ "outputs": [], "source": [] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a76c904a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f431b19e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "108f548a", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "id": "119b576b", "metadata": {}, "source": [ - "### Exercise 4\n", - "What happens when you call:\n", - "- `surveys_df[0:1]`\n", - "- `surveys_df[:4]`\n", - "- `surveys_df[:-1]`\n", - "\n", + "### Exercise 3\n", "What happens when you call:\n", - "- `surveys_df.iloc[0:4, 1:4]`\n", - "- `surveys_df.loc[0:4, 1:4]`\n", - "How are the two commands different?"
+ "- `surveys_df[0:1]` Answer:\n", + "- `surveys_df[:4]` Answer: \n", + "- `surveys_df[:-1]` Answer: " ] }, { "cell_type": "code", "execution_count": null, - "id": "bd3b4c74", + "id": "1017c8fe", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5ba4a57", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a3cd7e6", "metadata": {}, "outputs": [], "source": [] @@ -208,11 +243,13 @@ "id": "4fc54c12", "metadata": {}, "source": [ - "### Exercise 5\n", + "### Exercise 4\n", "What happens in the following two examples?\n", "\n", "- ```surveys_df.iloc[0:4, 1:4]```;\n", - "- ```surveys_df.loc[0:4, 1:4]```.\n" + "- ```surveys_df.loc[0:4, 1:4]```.\n", + "\n", + "How are the two commands different? Answer: " ] }, { @@ -223,13 +260,21 @@ "outputs": [], "source": [] }, + { + "cell_type": "code", + "execution_count": null, + "id": "002ca88a", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "id": "7b12cdaf", "metadata": {}, "source": [ - "### Exercise 6\n", - "- Create a new DataFrame that only contains observations with sex values that are not female or male. Print the number of rows in this new DataFrame. Verify the result by comparing the number of rows in the new DataFrame with the number of rows in the surveys DataFrame where sex is NaN (hint: there is a function `isnull`).\n", + "### Exercise 5\n", + "- Create a new DataFrame that only contains observations from the original DataFrame with sex values that are not female or male. Print the number of rows in this new DataFrame. Verify the result by comparing the number of rows in the new DataFrame with the number of rows in the surveys DataFrame where sex is NaN (hint: there is a function `isnull`).\n", "- Create a new DataFrame that contains only observations that are of sex male or female and where weight values are greater than 0." 
] }, @@ -241,15 +286,23 @@ "outputs": [], "source": [] }, + { + "cell_type": "code", + "execution_count": null, + "id": "cdcd971b", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "id": "b34b321b", "metadata": {}, "source": [ - "### Exercise 7: Putting it all together \n", - "1. Clean the column *sex* (leave out samples of which we do not know whether they are male or female)\n", - "2. Create a new dataframe *clean_df* which only contains rows of which we have a valid value for *sex* and *weight*\n", - "3. Calculate the average weight of that new DataFrame" + "### Exercise 6: Putting it all together \n", + "1. Clean the column *sex* (leave out samples of which we do not know whether they are male or female) and save the result as a new dataframe `clean_df`.\n", + "2. Replace undefined *weight* values with the mean of all (defined) weights in `surveys_df`.\n", + "3. Calculate the average weight of that new DataFrame `clean_df`" ] }, { @@ -265,7 +318,7 @@ "id": "ccb33c2e", "metadata": {}, "source": [ - "### Exercise 8\n", + "### Exercise 7\n", "Let's see in which plots animals get more food. Calculate the average weight per plot! Complete the code below." ] }, @@ -285,10 +338,11 @@ "id": "2bccb9da", "metadata": {}, "source": [ - "### Exercise 9\n", - "Investigate the group keys and row indexes for this more complex grouping example. \n", - "Why are there more than 48 groups?\n", - "What happened to the third group and why does it not turn up in our statistics?" + "### Exercise 8\n", + "See below a more complex grouping example. Investigate the group keys and row indexes for this more complex grouping example. \n", + "Why are there more than 48 groups? Answer: \n", + "Calculate the average weight per group.\n", + "What happened to the third group and why does it not turn up in our statistics? 
Answer:" ] }, { @@ -299,15 +353,23 @@ "outputs": [], "source": [ "grouped_data = surveys_df.groupby(['sex', 'plot_id'])\n", - "len(grouped_data.groups)" + "print(len(grouped_data.groups))" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "7843cd60", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "id": "b0f1ab75", "metadata": {}, "source": [ - "### Exercise 10\n", + "### Exercise 9\n", "Would it make sense to group our data frame by the column *weight*? Why or why not?" ] }, @@ -316,7 +378,7 @@ "id": "0c7ae97d", "metadata": {}, "source": [ - "### Exercise 11\n", + "### Exercise 10\n", "In the given example of vertical concatenation, you concatenated two DataFrames with the same columns. What would happen if the two DataFrames to concatenate have different column number and names?\n", "\n", " 1. Create a new DataFrame using the last 10 rows of the species DataFrame (`species_df`);\n", @@ -341,7 +403,7 @@ "id": "afa7dd9c", "metadata": {}, "source": [ - "### Exercise 12\n", + "### Exercise 11\n", "In the given example of horizontal concatenation, you first concatenated two DataFrame with different indices, then reset the indices of the second one. Based on the outcome of these two cases, try to answer the following questions:\n", " 1. What happens when you concatenate horizontally two DataFrames with different indexing?\n", " 2. What happens when you concatenate horizontally two DataFrames with the same columns?\n", @@ -368,7 +430,7 @@ "tags": [] }, "source": [ - "### Exercise 13\n", + "### Exercise 12\n", "Time to play with plots! 
Look at the pandas.DataFrame.plot() documentation [here](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.html) and change your data visualization selecting different DataFrame columns, x and y axes, and kind of plot (try at least three different plots).\n" ] }, @@ -379,9 +441,11 @@ "metadata": {}, "outputs": [], "source": [ + "import matplotlib.pyplot as plt\n", + "\n", "fig, ax1 = plt.subplots() # prepare a matplotlib figure\n", "\n", - "surveys.plot(\"hindfoot_length\", \"weight\", kind=\"scatter\", ax=ax1)\n", + "surveys_df.plot(\"hindfoot_length\", \"weight\", kind=\"scatter\", ax=ax1)\n", "\n", "# Provide further adaptations with matplotlib:\n", "ax1.set_xlabel(\"Hindfoot length\")\n", @@ -389,20 +453,12 @@ "fig.suptitle('Scatter plot of weight versus hindfoot length', fontsize=15)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "d9191732-388d-4b5b-8445-f9cce0375390", - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "id": "aec3890f-9219-456a-9375-692ed307ff43", "metadata": {}, "source": [ - "### Exercise 14 \n", + "### Exercise 13\n", "\n", "Plot the data from different groups in two subplots in a single figure:\n", "- Initialize a Figure with 2 Axes distributed in one rows and two columns;\n", diff --git a/course_materials/empty_notebook_for_coding_along.ipynb b/course_materials/empty_notebook_for_coding_along.ipynb new file mode 100644 index 0000000..08712f7 --- /dev/null +++ b/course_materials/empty_notebook_for_coding_along.ipynb @@ -0,0 +1,92 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6be8c60c-486a-49ec-860d-e8dc361928ad", + "metadata": {}, + "source": [ + "# Code along\n", + "\n", + "Use this notebook to code along during the live coding lectures.\n", + "Use the dedicated exercises notebooks `morning_exercises.ipynb` and `afternoon_exercises.ipynb` for the exercises parts." 
+ ] + }, + { + "cell_type": "markdown", + "id": "c239edb5", + "metadata": {}, + "source": [ + "## Test Installation" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d8e35e29-a355-45e7-a693-624f0ee80b2b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.8.10\n", + "1.5.3\n", + "3.6.3\n", + "1.24.1\n", + "No errors! Ready to code!\n" + ] + } + ], + "source": [ + "from platform import python_version\n", + "\n", + "import pandas as pd\n", + "import matplotlib\n", + "import numpy as np\n", + "\n", + "print(python_version())\n", + "print(pd.__version__)\n", + "print(matplotlib.__version__)\n", + "print(np.__version__)\n", + "print(\"No errors! Ready to code!\")" + ] + }, + { + "cell_type": "markdown", + "id": "2d5cbbaf", + "metadata": {}, + "source": [ + "## Code along" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14183541", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/course_materials/morning_exercises.ipynb b/course_materials/morning_exercises.ipynb index 88f49df..9009422 100644 --- a/course_materials/morning_exercises.ipynb +++ b/course_materials/morning_exercises.ipynb @@ -19,12 +19,13 @@ "source": [ "### Exercise 0\n", "\n", - "1. Try to run the code below. Why is there no output? (double click this cell to be able to edit the answer) " + "1. Try to run the code below. Why is there no output? 
(double click the cell with 'Answer:' to edit the answer) \n", + "[Course book chapter 1 for reference](https://utrechtuniversity.github.io/workshop-introduction-to-python/Introduction_to_python_1.html)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "5b2453e4-d68b-49ad-85b1-1d28e94e54a7", "metadata": {}, "outputs": [], @@ -38,7 +39,7 @@ "id": "ffb712c2-e744-4110-9879-acab825e8caf", "metadata": {}, "source": [ - "Answer: " + "Answer: [DOUBLE CLICK THIS CELL AND ANSWER HERE]" ] }, { @@ -65,7 +66,9 @@ "### Exercise 1\n", "1. Calculate: One plus five divided by nine and assign the result of the calculation to a variable.\n", "2. Test if the result is larger than one. (tip: output of the cell should be True or False)\n", - "3. Round off the result to one decimal. Use the function `round`. (tip: use `? round` or you internet search engine for information about how to use the function)" + "3. Round off the result to one decimal. Use the function `round`. (tip: use `? round` or your internet search engine for information about how to use the function)\n", + "\n", + "[Course book chapter 2 for reference](https://utrechtuniversity.github.io/workshop-introduction-to-python/Introduction_to_python_2.html)" ] }, { @@ -110,7 +113,9 @@ "`5 == 5` answer: \n", "`not 3 > 2` answer: \n", "`True == 'True'` answer: \n", - "`False < True` answer: \n" + "`False < True` answer: \n", + "\n", + "[Course book chapter 2 for reference](https://utrechtuniversity.github.io/workshop-introduction-to-python/Introduction_to_python_2.html)" ] }, { @@ -135,7 +140,9 @@ "Evaluate the statements below. What do you think will be the output of python? Do you agree with python? \n", "`1 + 1` answer: \n", "`1 + True` answer: \n", - "`1 + \"one\"` answer: \n" + "`1 + \"one\"` answer: \n", + "\n", + "[Course book chapter 2 for reference](https://utrechtuniversity.github.io/workshop-introduction-to-python/Introduction_to_python_2.html)" ] }, { @@ -161,7 +168,9 @@ "\n", "2.
How old are Ann, Bob, Chloe, and Dan? You decide! Design a numeric list with their respective ages. Save it as \"age\".\n", "\n", - "3. What is their average age? (Use the function `sum` to sum up their cumulative ages, you can use `len(age)` to get the number of elements in a list)" + "3. What is their average age? (Use the function `sum` to sum up their cumulative ages, you can use `len(age)` to get the number of elements in a list)\n", + "\n", + "[Course book chapter 3 for reference](https://utrechtuniversity.github.io/workshop-introduction-to-python/Introduction_to_python_3.html)" ] }, { @@ -182,7 +191,9 @@ "1. Return only the first number in the list age.\n", "2. Return the 2nd and 4th name in your list name.\n", "3. Return only ages under 30 from your list age. Bonus: put them in a new list called `new_age`.\n", - "4. Return the age of \"Chloe\" from the list age." + "4. Return the age of \"Chloe\" from the list age.\n", + "\n", + "[Course book chapter 3 for reference](https://utrechtuniversity.github.io/workshop-introduction-to-python/Introduction_to_python_3.html)" ] }, { @@ -226,7 +237,9 @@ "- Create a new dictionary that contains the keys: 'name', 'age' and 'country' and their respective values (you can make up the values yourself).\n", "- Print the values of the dictionary to the screen\n", "- Reassign the value of key 'age' to 24\n", - "- Print the values of the dictionary to the screen again to see if the value has changed." + "- Print the values of the dictionary to the screen again to see if the value has changed.\n", + "\n", + "[Course book chapter 3 for reference](https://utrechtuniversity.github.io/workshop-introduction-to-python/Introduction_to_python_3.html)" ] }, { @@ -268,7 +281,9 @@ "source": [ "### Exercise 7\n", "Create an if statement that tests whether a number is even or odd, and saves the classification in a variable called `number_class`. \n", - "Hint, you can use the % operator (aka modulo operator). 
If necessary, you can try the operator to see what it does in the cell below or search the web." + "Hint, you can use the % operator (aka modulo operator). If necessary, you can try the operator to see what it does in the cell below or search the web.\n", + "\n", + "[Course book chapter 3 for reference](https://utrechtuniversity.github.io/workshop-introduction-to-python/Introduction_to_python_3.html)" ] }, { @@ -289,7 +304,9 @@ "metadata": {}, "source": [ "### Exercise 8\n", - "Turn the if statement from the last exercise into a function e.g. with the name `even_or_odd` (or choose a different name yourself). Let the user provide the value for `number`, and return the `number_class`." + "Turn the if statement from the last exercise into a function e.g. with the name `even_or_odd` (or choose a different name yourself). Let the user provide the value for `number`, and return the `number_class`.\n", + "\n", + "[Course book chapter 4 for reference](https://utrechtuniversity.github.io/workshop-introduction-to-python/Introduction_to_python_4.html)" ] }, { @@ -319,7 +336,9 @@ "source": [ "### Exercise 9 \n", "Use the function above to determine whether the numbers between 1 and 10 are even or odd. \n", - "Hint: the [`range` function](https://docs.python.org/3.8/library/stdtypes.html#range) might be helpful." 
+ "Hint: the [`range` function](https://docs.python.org/3.8/library/stdtypes.html#range) might be helpful.\n", + "\n", + "[Course book chapter 4 for reference](https://utrechtuniversity.github.io/workshop-introduction-to-python/Introduction_to_python_4.html)" ] }, { @@ -347,7 +366,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.8.12" } }, "nbformat": 4, diff --git a/course_materials/solutions/afternoon_exercises_solutions.ipynb b/course_materials/solutions/afternoon_exercises_solutions.ipynb deleted file mode 100644 index fd526e6..0000000 --- a/course_materials/solutions/afternoon_exercises_solutions.ipynb +++ /dev/null @@ -1,425 +0,0 @@ -{ - "cells": [ - { - "cell_type": "raw", - "id": "14f401b8-2d1d-4a01-a3c8-ed2c59e66d29", - "metadata": {}, - "source": [ - "---\n", - "title: \"Working with data\"\n", - "format: html\n", - "execute: \n", - " enabled: true\n", - "---" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2c6372a4", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "surveys_df = pd.read_csv('../data/surveys.csv')" - ] - }, - { - "cell_type": "markdown", - "id": "507d1c2c-98b8-4a67-b2e5-b3a3031a5b5f", - "metadata": {}, - "source": [ - "### Exercise 1\n", - "\n", - "Type the following commands and check the outputs. Can you tell what each command does? What is the difference between commands with and without parenthesis?\n", - "\n", - "```python\n", - "surveys_df.shape\n", - "surveys_df.columns\n", - "surveys_df.index\n", - "surveys_df.dtypes\n", - "surveys_df.head()\n", - "surveys_df.tail()\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "0514bad4-1266-4553-9752-29dcf3166d21", - "metadata": {}, - "source": [ - "### Exercise 2\n", - "Perform some basic statistics on the weight column. For practical reasons, it can be useful to first create a variable `weight` that contains the just the weight column. 
It will make the code look a bit cleaner. Can you tell what each method listed below does? Look at our explorative plot, do the statistics make sense?\n", - "\n", - "```python\n", - "weight=surveys_df['weight']\n", - "weight.min()\n", - "weight.max()\n", - "weight.mean()\n", - "weight.std()\n", - "weight.count()\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "2bcb581f", - "metadata": {}, - "source": [ - "### Exercise 3\n", - "- Swap the order of column names in `surveys_df[['plot_id', 'species_id']]`\n", - "- Repeat one of the column names like `surveys_df[['plot_id', 'plot_id', 'species_id']]`.\n", - "What do the results look like and why?\n", - "- Which error occurs in `surveys_df['plot_id', 'species_id']` and why?\n", - "- Which error occurs in `surveys_df['speciess']`?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3731f5fd", - "metadata": {}, - "outputs": [], - "source": [ - "print(surveys_df[['species_id', 'plot_id']])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b71788b1", - "metadata": {}, - "outputs": [], - "source": [ - "surveys_df[['plot_id', 'plot_id', 'species_id']] # repeating column plot_id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bb68a995", - "metadata": {}, - "outputs": [], - "source": [ - "surveys_df['plot_id', 'species_id'] \n", - "# The tuple, or combination ('plot_id', 'species_id') is not a \n", - "# column name (key) in the dataframe --> KeyError: ('plot_id', 'species_id')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b74edc72", - "metadata": {}, - "outputs": [], - "source": [ - "surveys_df['speciess']\n", - "# 'speciess' is not a column name (key) in the dataframe" - ] - }, - { - "cell_type": "markdown", - "id": "119b576b", - "metadata": {}, - "source": [ - "### Exercise 4\n", - "What happens when you call:\n", - "- `surveys_df[0:1]`\n", - "- `surveys_df[:4]`\n", - "- `surveys_df[:-1]`" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "id": "bd3b4c74", - "metadata": {}, - "outputs": [], - "source": [ - "surveys_df[0:1] # shows the first row of the dataframe\n", - "surveys_df[:4] # shows the first four rows from index 0 to index 3\n", - "surveys_df[:-1] # shows all rows of the dataframe" - ] - }, - { - "cell_type": "markdown", - "id": "4fc54c12", - "metadata": {}, - "source": [ - "### Exercise 5\n", - "What happens in the following two examples?\n", - "\n", - "- ```surveys_df.iloc[0:4, 1:4]```;\n", - "- ```surveys_df.loc[0:4, 1:4]```.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6680a6fe", - "metadata": {}, - "outputs": [], - "source": [ - "print(surveys_df.iloc[0:4, 1:4])\n", - "surveys_df.loc[0:4, 1:4] # the function loc works with indices for rows (0:4), \n", - "# but not with indices for columns (1:4). COlumns do have names in our dataframe" - ] - }, - { - "cell_type": "markdown", - "id": "7b12cdaf", - "metadata": {}, - "source": [ - "### Exercise 6\n", - "- Create a new DataFrame that only contains observations with sex values that are not female or male. Print the number of rows in this new DataFrame. Verify the result by comparing the number of rows in the new DataFrame with the number of rows in the surveys DataFrame where sex is NaN (hint: there is a function `isnull`).\n", - "- Create a new DataFrame that contains only observations that are of sex male or female and where weight values are greater than 0." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "53f7777a", - "metadata": {}, - "outputs": [], - "source": [ - "df = surveys_df[(surveys_df['sex'].isnull())]\n", - "print(\"Number of rows:\", len(df))\n", - "print(\"Unique values in column 'sex':\", df['sex'].unique())" - ] - }, - { - "cell_type": "markdown", - "id": "b34b321b", - "metadata": {}, - "source": [ - "### Exercise 7: Putting it all together \n", - "1. 
Clean the column *sex* (leave out samples of which we do not know whether they are male or female) and save the result as a new dataframe `clean_df`.\n", - "2. Fill undefined *weight* values with the mean of all valid weights in `surveys_df`.\n", - "3. Calculate the average weight of that new DataFrame `clean_df`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "570cbd6c", - "metadata": {}, - "outputs": [], - "source": [ - "# Step 1\n", - "# sex is 'F' or 'M'. The `|` means or.\n", - "clean_df = surveys_df[(surveys_df['sex']=='F') | (surveys_df['sex']=='M')]\n", - "# or not sex is null. The `~` means not.\n", - "clean_df = surveys_df[~(surveys_df['sex'].isnull())]\n", - "\n", - "# Step 2\n", - "clean_df.weight.fillna(surveys_df.weight.mean())\n", - "\n", - "# Step 3\n", - "print(\"Average weight of surveys_df:\", surveys_df.weight.mean())\n", - "print(\"Average weight of clean_df:\", clean_df.weight.mean())" - ] - }, - { - "cell_type": "markdown", - "id": "ccb33c2e", - "metadata": {}, - "source": [ - "### Exercise 8\n", - "Let's see in which plots animals get more food. Calculate the average weight per plot! Complete the code below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "733aba46", - "metadata": {}, - "outputs": [], - "source": [ - "grouped_data = surveys_df.groupby(\"plot_id\")\n", - "grouped_data['weight'].mean()" - ] - }, - { - "cell_type": "markdown", - "id": "2bccb9da", - "metadata": {}, - "source": [ - "### Exercise 9\n", - "Investigate the group keys and row indexes for this more complex grouping example. \n", - "Why are there more than 48 groups?\n", - "What happened to the third group and why does it not turn up in our statistics?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ee674ae9", - "metadata": {}, - "outputs": [], - "source": [ - "grouped_data = surveys_df.groupby(['sex', 'plot_id'])\n", - "print(len(grouped_data.groups))\n", - "grouped_data.groups.keys() # we also have a categorial value 'nan'." - ] - }, - { - "cell_type": "markdown", - "id": "b0f1ab75", - "metadata": {}, - "source": [ - "### Exercise 10\n", - "Would it make sense to group our data frame by the column *weight*? Why or why not?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bbff11b3", - "metadata": {}, - "outputs": [], - "source": [ - "# In real life nearly every sample has a unique value. So nearly every sample would \n", - "# be placed in an own group.\n", - "# In our training data you can see that there are quite some values for weight. So\n", - "# usually it is not a good idea to categorise (group) data on such values.\n", - "print(\"Number of rows:\", len(surveys_df))\n", - "print(len(surveys_df['weight'].unique())) #includes nan\n", - "print(len(surveys_df.groupby(['weight']).groups)) #does not include nan" - ] - }, - { - "cell_type": "markdown", - "id": "79ae54cf", - "metadata": {}, - "source": [ - "### Exercise 11\n", - "In the given example of vertical concatenation, you concatenated two DataFrames with the same columns. What would happen if the two DataFrames to concatenate have different column number and names?\n", - "\n", - " 1. Create a new DataFrame using the last 10 rows of the species DataFrame (`species_df`);\n", - " 2. Concatenate vertically `surveys_df_sub_first10` and your just created DataFrame;\n", - " 3. Print the concatenated DataFrame info on the screen. How may rows does it have? What happened to the columns? Explain why you get this result." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18b13126", - "metadata": {}, - "outputs": [], - "source": [ - "species_df = pd.read_csv(\"../data/species.csv\")\n", - "species_df_sub_last10 = species_df.tail(10)\n", - "\n", - "surveys_df_sub_first10 = surveys_df.head(10)\n", - "vert_concat = pd.concat([surveys_df_sub_first10, species_df_sub_last10], axis=0)\n", - "\n", - "vert_concat" - ] - }, - { - "cell_type": "markdown", - "id": "aa6d4ee8", - "metadata": {}, - "source": [ - "We get a total of 20 rows and 12 columns. The original dataframes together had a total of 13 columns. As they both have a column `species_id`, this one is collapsed. All other columns are padded with `NaN` values.\n", - "We expect 20 rows, as we are putting two DataFrames of 10 rows one after the other. The padding of the columns happens because these two DataFrames do not have the same column names. To keep all the information that was in the original DataFrames, the padding of columns that occur in only one of the two is necessary." - ] - }, - { - "cell_type": "markdown", - "id": "9478026d", - "metadata": {}, - "source": [ - "### Exercise 12\n", - "In the given example of horizontal concatenation, you first concatenated two DataFrame with different indices, then reset the indices of the second one. Based on the outcome of these two cases, try to answer the following questions:\n", - " 1. What happens when you concatenate horizontally two DataFrames with different indexing?\n", - " 2. What happens when you concatenate horizontally two DataFrames with the same columns?\n", - " 3. What happens when you try to select a column of the `horizontal_stack` DataFrame we just created?\n", - " 4. How can you select a specific column, when multiple columns share a name?" - ] - }, - { - "cell_type": "markdown", - "id": "b5b01c88", - "metadata": {}, - "source": [ - "1. 
The columns of both DataFrames are kept, duplicates are not merged, and the rows of both DataFrames are kept as well, no merging there either. This results in the two original DataFrames appearing in a checker-pattern in the resulting DataFrames, with the empty spaces padded out with `NaN` values.\n", - "2. Columns still are not merged, but now, rows with a common index are merged. This means that the information of both rows is put into a single row in the resulting DataFrame. If no corresponding row exists in the other DataFrame, the row is still padded with `NaN`'s in the result. But if a corresponding row (with the same index) *does* exists, this is no longer necessary." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0faee1fd", - "metadata": {}, - "outputs": [], - "source": [ - "# 3.\n", - "surveys_df_sub_last10 = surveys_df.tail(10)\n", - "surveys_df_sub_last10 = surveys_df_sub_last10.reset_index(drop=True)\n", - "horizontal_stack = pd.concat([surveys_df_sub_first10, surveys_df_sub_last10], axis=1)\n", - "horizontal_stack['species_id']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4f860304", - "metadata": {}, - "outputs": [], - "source": [ - "# 4.\n", - "horizontal_stack.iloc[:,5]" - ] - }, - { - "cell_type": "markdown", - "id": "d6382b5f-9c13-4ceb-88f6-f7ba13f797f7", - "metadata": { - "tags": [] - }, - "source": [ - "### Exercise 13\n", - "Time to play with plots! 
Look at the pandas.DataFrame.plot() documentation [here](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.html) and change your data visualization selecting different DataFrame columns, x and y axes, and kind of plot (try at least three different plots).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1896c89b-f2c2-41ae-9a18-9b9fb8a87648", - "metadata": {}, - "outputs": [], - "source": [ - "fig, ax1 = plt.subplots() # prepare a matplotlib figure\n", - "\n", - "surveys.plot(\"hindfoot_length\", \"weight\", kind=\"scatter\", ax=ax1)\n", - "\n", - "# Provide further adaptations with matplotlib:\n", - "ax1.set_xlabel(\"Hindfoot length\")\n", - "ax1.tick_params(labelsize=16, pad=8)\n", - "fig.suptitle('Scatter plot of weight versus hindfoot length', fontsize=15)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}