From fff9444da55d046fb33d6caaac6bf9b9048cf2a2 Mon Sep 17 00:00:00 2001
From: Jelle Treep <40466121+jelletreep@users.noreply.github.com>
Date: Wed, 27 Sep 2023 17:50:29 +0200
Subject: [PATCH] last minor revisions from pilot (#79)
---
book/Introduction_to_python_1.ipynb | 2 +-
book/Introduction_to_python_2.ipynb | 4 +-
book/Introduction_to_python_3.ipynb | 12 +-
book/Introduction_to_python_4.ipynb | 2 +-
book/_quarto.yml | 4 +
book/data-science-with-pandas-1.ipynb | 43 +-
book/data-science-with-pandas-2.ipynb | 14 +-
book/data-science-with-pandas-3.ipynb | 17 +-
book/data-science-with-pandas-4.ipynb | 10 +-
book/installation-and-setup.qmd | 19 +-
.../afternoon_exercises_solutions.ipynb | 1337 +++++++++++++++++
.../morning_exercises_solutions.ipynb | 301 +++-
course_materials/afternoon_exercises.ipynb | 174 ++-
.../empty_notebook_for_coding_along.ipynb | 92 ++
course_materials/morning_exercises.ipynb | 45 +-
.../afternoon_exercises_solutions.ipynb | 425 ------
16 files changed, 1887 insertions(+), 614 deletions(-)
create mode 100644 book/solutions/afternoon_exercises_solutions.ipynb
rename {course_materials => book}/solutions/morning_exercises_solutions.ipynb (62%)
create mode 100644 course_materials/empty_notebook_for_coding_along.ipynb
delete mode 100644 course_materials/solutions/afternoon_exercises_solutions.ipynb
diff --git a/book/Introduction_to_python_1.ipynb b/book/Introduction_to_python_1.ipynb
index 4b2f6ef..bb93656 100644
--- a/book/Introduction_to_python_1.ipynb
+++ b/book/Introduction_to_python_1.ipynb
@@ -273,7 +273,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.0"
+ "version": "3.8.10"
}
},
"nbformat": 4,
diff --git a/book/Introduction_to_python_2.ipynb b/book/Introduction_to_python_2.ipynb
index fca6fd1..3ecbe26 100644
--- a/book/Introduction_to_python_2.ipynb
+++ b/book/Introduction_to_python_2.ipynb
@@ -76,7 +76,7 @@
"When we want to make use of a function (referred to as calling the function), we type the name of the function followed by parentheses. Between the parentheses we can pass arguments.\n",
"\n",
"**Arguments**\n",
- "We typically provide a function with 'arguments' to tell python which values or variables are used to perform the body of the function. In the example below `type` is the function name and `pi_value` is the argument."
+ "Arguments are the values or variables that you pass to a function; the function uses them when it runs its body. In the example below `type` is the function name and `pi_value` is the argument."
]
},
{
@@ -397,7 +397,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.0"
+ "version": "3.8.10"
}
},
"nbformat": 4,
diff --git a/book/Introduction_to_python_3.ipynb b/book/Introduction_to_python_3.ipynb
index 118aca0..a585a31 100644
--- a/book/Introduction_to_python_3.ipynb
+++ b/book/Introduction_to_python_3.ipynb
@@ -522,7 +522,7 @@
"source": [
"## Dictionaries\n",
"\n",
- "A dictionary is another way to store multiple items into one object. In dictionaries, however, this is done with keys and values. This can be useful for several reasons, one example is to store model settings, parameters or variable values for multiple scenarios."
+ "A dictionary is another way to store multiple items into one object. In dictionaries, however, this is done with keys and values. Keys are typically used to look up values. A good analogy may be the contact list in your phone, where you use a name (key) to look up a phone number (value). This can be useful for several reasons, one example is to store model settings, parameters or variable values for multiple scenarios."
]
},
{
@@ -532,7 +532,7 @@
"metadata": {},
"outputs": [],
"source": [
- "my_dict = {'one': 'first', 'two': 'second'}\n",
+ "my_dict = {'one': 1, 'two': 2}\n",
"my_dict"
]
},
@@ -569,7 +569,7 @@
"metadata": {},
"outputs": [],
"source": [
- "my_dict['third'] = 'three'\n",
+ "my_dict['three'] = 3\n",
"my_dict"
]
},
@@ -578,7 +578,7 @@
"id": "1043ec01",
"metadata": {},
"source": [
- "Dictionary items are key-value pairs. The keys are changeable and unique. The values are changable, but not necessarily unique."
+ "Dictionary items are key-value pairs. The keys are changeable and always have to be unique (within a dictionary object). The values within a dictionary are changeable, and don't have to be unique."
]
},
{
@@ -588,7 +588,7 @@
"metadata": {},
"outputs": [],
"source": [
- "my_dict['two'] = 'three'\n",
+ "my_dict['two'] = 5\n",
"my_dict"
]
},
@@ -738,7 +738,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.0"
+ "version": "3.8.10"
}
},
"nbformat": 4,
diff --git a/book/Introduction_to_python_4.ipynb b/book/Introduction_to_python_4.ipynb
index 603ecc1..f3ac93f 100644
--- a/book/Introduction_to_python_4.ipynb
+++ b/book/Introduction_to_python_4.ipynb
@@ -168,7 +168,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.0"
+ "version": "3.8.10"
}
},
"nbformat": 4,
diff --git a/book/_quarto.yml b/book/_quarto.yml
index 69525cf..0a3b4cf 100644
--- a/book/_quarto.yml
+++ b/book/_quarto.yml
@@ -28,6 +28,10 @@ book:
- data-science-with-pandas-2.ipynb
- data-science-with-pandas-3.ipynb
- data-science-with-pandas-4.ipynb
+ - part: "Exercises solutions"
+ chapters:
+ - solutions/morning_exercises_solutions.ipynb
+ - solutions/afternoon_exercises_solutions.ipynb
- what-next.qmd
- references.qmd
repo-url: https://github.com/UtrechtUniversity/workshop-introduction-to-python
diff --git a/book/data-science-with-pandas-1.ipynb b/book/data-science-with-pandas-1.ipynb
index 800957c..3d1cdc8 100644
--- a/book/data-science-with-pandas-1.ipynb
+++ b/book/data-science-with-pandas-1.ipynb
@@ -25,7 +25,7 @@
"\n",
"A library (aka package) is a collection of files (aka python scripts) that contains **functions** that can be used to perform specific tasks. A library may also contain data. The functions in a library are typically related and used for a specific purpose, e.g. there are libraries for plotting, handling audio data and machine learning and many many more. Some libraries are built into python, but most packages need to be installed before you can use it.\n",
"\n",
- "Important to add: libraries are developed and maintained by other Python users. A popular library like Pandas has a large user base and the maintainers are supported by several funders, which makes it a reliable library that is updated very frequently. But this is not always the case, on the other side of the spectrum, a library can also be published once and not maintained at all.\n",
+ "Libraries are developed and maintained by other Python users. That is why there are so many packages and this is great: there is a huge variety of functions available that you can use instead of programming them yourself. But it is important to be aware that the quality can differ. A popular library like Pandas has a large user base and the maintainers are supported by several funders, which makes it a reliable library that is updated very frequently. But this is not always the case: on the other side of the spectrum, a library can be published once, be badly designed or documented, and/or not be maintained at all. To check the quality of a library, you can look at e.g. the number of downloads, the number of contributors, the number of open issues, the date of the last update and the number of stars on GitHub.\n",
"\n",
"## Pandas\n",
"The python library [**Pandas**](https://pandas.pydata.org/about/) is a popular open-source data analysis and data manipulation library for Python which was developed in 2008. The library has some similarities with R, mainly related to the DataFrame data type that is used to handle table like datasets.\n",
@@ -212,29 +212,6 @@
"It is, however, enough for a quick exploration of how the dataset looks like in terms of columns names, values, and potential reading errors."
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "
\n",
- "Exercise 1\n",
- " \n",
- "Now go to the Jupyter Dashboard in your internet browser and open the notebook `afternoon_exercises.ipynb` and do exercise 1.\n",
- "
"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "::: {.callout-note}\n",
- "As you can see in this exercise a DataFrame object comes with several methods that can be applied to the DataFrame. A method is similar to a function, but it can only be applied to the object it belongs to and has a different notation than a function.\n",
- "\n",
- "Compare the notation of the function `len`: `len(surveys_df)` \n",
- "with the DataFrame specific method `shape`: `surveys_df.shape`\n",
- ":::"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -356,12 +333,24 @@
"metadata": {},
"source": [
"
\n",
- "Exercise 2\n",
+ "Exercises 0 and 1\n",
" \n",
- "Now go to the Jupyter Dashboard in your internet browser and continue with exercise 2.\n",
+ "Now go to the Jupyter Dashboard in your internet browser and continue with exercises 0 and 1.\n",
"
"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "::: {.callout-note}\n",
+ "As you can see in this exercise a DataFrame object comes with several methods that can be applied to the DataFrame. A method is similar to a function, but it can only be applied to the object it belongs to and has a different notation than a function.\n",
+ "\n",
+ "Compare the notation of the function `len`: `len(surveys_df)` \n",
+ "with the DataFrame specific method `shape`: `surveys_df.shape`\n",
+ ":::"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -439,7 +428,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.0"
+ "version": "3.8.10"
},
"vscode": {
"interpreter": {
diff --git a/book/data-science-with-pandas-2.ipynb b/book/data-science-with-pandas-2.ipynb
index 3aa722a..6f5bc4e 100644
--- a/book/data-science-with-pandas-2.ipynb
+++ b/book/data-science-with-pandas-2.ipynb
@@ -353,9 +353,9 @@
"metadata": {},
"source": [
"
\n",
- "Exercise 3 to 5\n",
+ "Exercises 2 to 4\n",
" \n",
- "Now go to the Jupyter Dashboard in your internet browser and continue with the afternoon exercises 3 to 5."
+ "Now go to the Jupyter Dashboard in your internet browser and continue with the afternoon exercises 2 to 4."
]
},
{
@@ -669,9 +669,9 @@
"metadata": {},
"source": [
"
\n",
- "Exercise 6 and 7\n",
+ "Exercises 5 and 6\n",
"\n",
- "Now go to the Jupyter Dashboard in your internet browser and continue with the afternoon exercises 6 and 7."
+ "Now go to the Jupyter Dashboard in your internet browser and continue with the afternoon exercises 5 and 6."
]
},
{
@@ -869,9 +869,9 @@
"metadata": {},
"source": [
"
\n",
- "Exercise 8 to 10\n",
+ "Exercises 7 to 9\n",
" \n",
- "Now go to the Jupyter Dashboard in your internet browser and continue with the afternoon exercises 8 to 10."
+ "Now go to the Jupyter Dashboard in your internet browser and continue with the afternoon exercises 7 to 9."
]
},
{
@@ -899,7 +899,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.0"
+ "version": "3.8.10"
}
},
"nbformat": 4,
diff --git a/book/data-science-with-pandas-3.ipynb b/book/data-science-with-pandas-3.ipynb
index e7400c7..d3bf853 100644
--- a/book/data-science-with-pandas-3.ipynb
+++ b/book/data-science-with-pandas-3.ipynb
@@ -193,17 +193,6 @@
"vertical_stack.reset_index()"
]
},
- {
- "cell_type": "markdown",
- "id": "f1db04a6-8ae2-44e4-9798-adc1b108c9d8",
- "metadata": {},
- "source": [
- "
\n",
- "Exercise 11\n",
- " \n",
- "Now go to the Jupyter Dashboard in your internet browser and continue with exercise 11.\n"
- ]
- },
{
"cell_type": "markdown",
"id": "7bc415a6",
@@ -315,9 +304,9 @@
"metadata": {},
"source": [
"
\n",
- "Exercise 12\n",
+ "Exercises 10 and 11\n",
" \n",
- "Now go to the Jupyter Dashboard in your internet browser and continue with exercise 12.\n"
+ "Now go to the Jupyter Dashboard in your internet browser and continue with exercises 10 and 11.\n"
]
},
{
@@ -508,7 +497,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.0"
+ "version": "3.8.10"
}
},
"nbformat": 4,
diff --git a/book/data-science-with-pandas-4.ipynb b/book/data-science-with-pandas-4.ipynb
index a4a9abe..99aced6 100644
--- a/book/data-science-with-pandas-4.ipynb
+++ b/book/data-science-with-pandas-4.ipynb
@@ -439,9 +439,9 @@
"metadata": {},
"source": [
"
\n",
- "Exercise 13\n",
+ "Exercise 12\n",
" \n",
- "Now go to the Jupyter Dashboard in your internet browser and continue with exercise 13.\n",
+ "Now go to the Jupyter Dashboard in your internet browser and continue with exercise 12.\n",
"
\n",
- "Exercise 14\n",
+ "Exercise 13\n",
" \n",
- "Now go to the Jupyter Dashboard in your internet browser and continue with exercise 14.\n",
+ "Now go to the Jupyter Dashboard in your internet browser and continue with exercise 13.\n",
"
"
]
},
@@ -564,7 +564,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.0"
+ "version": "3.8.10"
},
"toc": {
"base_numbering": 1,
diff --git a/book/installation-and-setup.qmd b/book/installation-and-setup.qmd
index 08be8bf..6a0ace9 100644
--- a/book/installation-and-setup.qmd
+++ b/book/installation-and-setup.qmd
@@ -72,7 +72,7 @@ version 9 and below, are not).
3. Move the downloaded zip to the folder where you want to store these course materials.
4. Unzip the zip file.
-In your `python-workshop` you will see two folders called `data` and `solutions` and the following files:
+In your `python-workshop` folder you will see a folder called `data` and the following files:
```
python-workshop
@@ -82,11 +82,9 @@ python-workshop
│ ├── species.csv
│ ├── surveys.csv
│ └── plots.csv
-├── solutions
-│ ├── morning_exercises_solutions.ipynb
-│ └── afternoon_exercises_solutions.ipynb
├── morning_exercises.ipynb
-└── afternoon_exercises.ipynb
+├── afternoon_exercises.ipynb
+└── empty_notebook_for_coding_along.ipynb
```
## Launch Python interface
@@ -139,9 +137,12 @@ Make your choice and click "Ok, and don't show again" button.
2. Find the "Notebook" tab and click on the "Launch" button.
Anaconda will open a new browser window or tab with a Notebook Dashboard showing you the
contents of your Home (or User) folder.
-3. Navigate to the `python-workshop` directory by clicking on the directory names leading to it.
-`Desktop` and then `python-workshop`:
-4. Launch the notebook by clicking on the "New" button and then selecting "Python 3".
+3. Navigate to the `python-workshop` directory by clicking on the directory names leading to it.
+`Desktop` and then `python-workshop`:
+4. Launch the notebook called `empty_notebook_for_coding_along.ipynb` by clicking on it.
+5. Run the first code cell just below "Test installation" by clicking on it and then click on the 'play button'.
+If the output of the cell displays 4 version numbers and the words "No errors! Ready to code!" instead of an error message, your installation is successful. If not, visit us during the [RDM walk-in hours](https://www.uu.nl/en/research/research-data-management/workshops/walk-in-hours-research-data-and-software) or reply to the welcome email.
+
:::
### Option B: IPython interpreter
@@ -176,7 +177,7 @@ This workshop will make use of the following Python packages:
- `matplotlib`
- `numpy`
-Anaconda Navigator comes with these packages, so you are ready to go. If you are using another option to work with Python, you need to install these packages. If you need help with this, please ask the internet, your colleagues or us (see welcome email).
+Anaconda Navigator comes with these packages, so you are ready to go. If you are using another option to work with Python, you need to install these packages yourself. In that case we assume that you know how to install packages; otherwise we recommend using Anaconda instead. If for some reason you cannot work with Anaconda and you need help with installing packages, we are happy to help during the [RDM walk-in hours](https://www.uu.nl/en/research/research-data-management/workshops/walk-in-hours-research-data-and-software), or you can email us by replying to the welcome email.
## References {.unnumbered}
diff --git a/book/solutions/afternoon_exercises_solutions.ipynb b/book/solutions/afternoon_exercises_solutions.ipynb
new file mode 100644
index 0000000..7c53bbe
--- /dev/null
+++ b/book/solutions/afternoon_exercises_solutions.ipynb
@@ -0,0 +1,1337 @@
+{
+ "cells": [
+ {
+ "cell_type": "raw",
+ "id": "14f401b8-2d1d-4a01-a3c8-ed2c59e66d29",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "format: html\n",
+ "execute: \n",
+ " enabled: true\n",
+ " error: true\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "67d94b6e",
+ "metadata": {},
+ "source": [
+ "# Afternoon Exercises: Working with data {.unnumbered}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "2c6372a4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "surveys_df = pd.read_csv('../../course_materials/data/surveys.csv') # in your notebook the path should be 'data/surveys.csv'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "507d1c2c-98b8-4a67-b2e5-b3a3031a5b5f",
+ "metadata": {},
+ "source": [
+ "### Exercise 0\n",
+ "\n",
+ "Type the following commands and check the outputs. Can you tell what each command does? What is the difference between commands with and without parentheses?\n",
+ "\n",
+ "```python\n",
+ "surveys_df.shape # Answer: the dimensions of the dataframe\n",
+ "surveys_df.columns # Answer: the column names of the dataframe\n",
+ "surveys_df.index # Answer: the index (row labels) of the dataframe\n",
+ "surveys_df.dtypes # Answer: the data types of each column\n",
+ "surveys_df.head() # Answer: the first n rows of the dataframe\n",
+ "surveys_df.tail() # Answer: the last n rows of the dataframe\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0514bad4-1266-4553-9752-29dcf3166d21",
+ "metadata": {},
+ "source": [
+ "### Exercise 1\n",
+ "Perform some basic statistics on the weight column. For practical reasons, it can be useful to first create a variable `weight` that contains just the weight column. It will make the code look a bit cleaner. Can you tell what each method listed below does? Look at our explorative plot, do the statistics make sense?\n",
+ "\n",
+ "```python\n",
+ "weight=surveys_df['weight'] # Answer: creates a new variable that contains the weight column\n",
+ "weight.min() # Answer: the minimum value of the weight column\n",
+ "weight.max() # Answer: the maximum value of the weight column\n",
+ "weight.mean() # Answer: the mean value of the weight column\n",
+ "weight.std() # Answer: the standard deviation of the weight column\n",
+ "weight.count() # Answer: the number of non-NaN values in the weight column\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2bcb581f",
+ "metadata": {},
+ "source": [
+ "### Exercise 2\n",
+ "- Swap the order of column names in `surveys_df[['plot_id', 'species_id']]`\n",
+ "- Repeat one of the column names like `surveys_df[['plot_id', 'plot_id', 'species_id']]`.\n",
+ "What do the results look like and why? \n",
+ "\n",
+ "> Answer: the column names are repeated and the data is displayed twice. Column names do not have to be unique.\n",
+ "\n",
+ "- Which error occurs in `surveys_df['plot_id', 'species_id']` and why? \n",
+ "\n",
+ "> Answer: KeyError: ('plot_id', 'species_id'). The column names are not in a list. We need double square brackets to select multiple columns.\n",
+ "\n",
+ "- Which error occurs in `surveys_df['speciess']`? \n",
+ "\n",
+ "> Answer: KeyError: 'speciess'. The column name does not exist. Typo."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "3731f5fd",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " species_id plot_id\n",
+ "0 NL 2\n",
+ "1 NL 3\n",
+ "2 DM 2\n",
+ "3 DM 7\n",
+ "4 DM 3\n",
+ "... ... ...\n",
+ "35544 AH 15\n",
+ "35545 AH 15\n",
+ "35546 RM 10\n",
+ "35547 DO 7\n",
+ "35548 NaN 5\n",
+ "\n",
+ "[35549 rows x 2 columns]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(surveys_df[['species_id', 'plot_id']])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "b71788b1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
plot_id
\n",
+ "
plot_id
\n",
+ "
species_id
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
2
\n",
+ "
2
\n",
+ "
NL
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
3
\n",
+ "
3
\n",
+ "
NL
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
2
\n",
+ "
2
\n",
+ "
DM
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
7
\n",
+ "
7
\n",
+ "
DM
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
3
\n",
+ "
3
\n",
+ "
DM
\n",
+ "
\n",
+ "
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
\n",
+ "
\n",
+ "
35544
\n",
+ "
15
\n",
+ "
15
\n",
+ "
AH
\n",
+ "
\n",
+ "
\n",
+ "
35545
\n",
+ "
15
\n",
+ "
15
\n",
+ "
AH
\n",
+ "
\n",
+ "
\n",
+ "
35546
\n",
+ "
10
\n",
+ "
10
\n",
+ "
RM
\n",
+ "
\n",
+ "
\n",
+ "
35547
\n",
+ "
7
\n",
+ "
7
\n",
+ "
DO
\n",
+ "
\n",
+ "
\n",
+ "
35548
\n",
+ "
5
\n",
+ "
5
\n",
+ "
NaN
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
35549 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " plot_id plot_id species_id\n",
+ "0 2 2 NL\n",
+ "1 3 3 NL\n",
+ "2 2 2 DM\n",
+ "3 7 7 DM\n",
+ "4 3 3 DM\n",
+ "... ... ... ...\n",
+ "35544 15 15 AH\n",
+ "35545 15 15 AH\n",
+ "35546 10 10 RM\n",
+ "35547 7 7 DO\n",
+ "35548 5 5 NaN\n",
+ "\n",
+ "[35549 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "surveys_df[['plot_id', 'plot_id', 'species_id']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "bb68a995",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Unexpected exception formatting exception. Falling back to standard exception\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Traceback (most recent call last):\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py\", line 3802, in get_loc\n",
+ " return self._engine.get_loc(casted_key)\n",
+ " File \"pandas/_libs/index.pyx\", line 138, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/index.pyx\", line 165, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ "KeyError: ('plot_id', 'species_id')\n",
+ "\n",
+ "The above exception was the direct cause of the following exception:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 3460, in run_code\n",
+ " exec(code_obj, self.user_global_ns, self.user_ns)\n",
+ " File \"/tmp/ipykernel_16478/997369930.py\", line 1, in \n",
+ " surveys_df['plot_id', 'species_id']\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/frame.py\", line 3807, in __getitem__\n",
+ " indexer = self.columns.get_loc(key)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py\", line 3804, in get_loc\n",
+ " raise KeyError(key) from err\n",
+ "KeyError: ('plot_id', 'species_id')\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 2057, in showtraceback\n",
+ " stb = self.InteractiveTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1288, in structured_traceback\n",
+ " return FormattedTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1177, in structured_traceback\n",
+ " return VerboseTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1030, in structured_traceback\n",
+ " formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 935, in format_exception_as_a_whole\n",
+ " self.get_records(etb, number_of_lines_of_context, tb_offset) if etb else []\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 987, in get_records\n",
+ " style = stack_data.style_with_executing_node(style, self._tb_highlight)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/stack_data/core.py\", line 455, in style_with_executing_node\n",
+ " class NewStyle(style):\n",
+ " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 91, in __new__\n",
+ " ndef[4] = colorformat(styledef[3:])\n",
+ " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 58, in colorformat\n",
+ " assert False, \"wrong color format %r\" % text\n",
+ "AssertionError: wrong color format 'ansiyellow'\n"
+ ]
+ }
+ ],
+ "source": [
+ "surveys_df['plot_id', 'species_id'] "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "b74edc72",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Unexpected exception formatting exception. Falling back to standard exception\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Traceback (most recent call last):\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py\", line 3802, in get_loc\n",
+ " return self._engine.get_loc(casted_key)\n",
+ " File \"pandas/_libs/index.pyx\", line 138, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/index.pyx\", line 165, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ "KeyError: 'speciess'\n",
+ "\n",
+ "The above exception was the direct cause of the following exception:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 3460, in run_code\n",
+ " exec(code_obj, self.user_global_ns, self.user_ns)\n",
+ " File \"/tmp/ipykernel_16478/158671928.py\", line 1, in \n",
+ " surveys_df['speciess']\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/frame.py\", line 3807, in __getitem__\n",
+ " indexer = self.columns.get_loc(key)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py\", line 3804, in get_loc\n",
+ " raise KeyError(key) from err\n",
+ "KeyError: 'speciess'\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 2057, in showtraceback\n",
+ " stb = self.InteractiveTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1288, in structured_traceback\n",
+ " return FormattedTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1177, in structured_traceback\n",
+ " return VerboseTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1030, in structured_traceback\n",
+ " formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 935, in format_exception_as_a_whole\n",
+ " self.get_records(etb, number_of_lines_of_context, tb_offset) if etb else []\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 987, in get_records\n",
+ " style = stack_data.style_with_executing_node(style, self._tb_highlight)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/stack_data/core.py\", line 455, in style_with_executing_node\n",
+ " class NewStyle(style):\n",
+ " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 91, in __new__\n",
+ " ndef[4] = colorformat(styledef[3:])\n",
+ " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 58, in colorformat\n",
+ " assert False, \"wrong color format %r\" % text\n",
+ "AssertionError: wrong color format 'ansiyellow'\n"
+ ]
+ }
+ ],
+ "source": [
+ "surveys_df['speciess']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "119b576b",
+ "metadata": {},
+ "source": [
+ "### Exercise 3\n",
+ "What happens when you call:\n",
+ "\n",
+ "- `surveys_df[0:1]` Answer: shows the first row of the dataframe\n",
+ "- `surveys_df[:4]` Answer: shows the first 4 rows of the dataframe from index 0 to index 3\n",
+ "- `surveys_df[:-1]` Answer: shows all rows of the dataframe except the last row"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "bd3b4c74",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
record_id
\n",
+ "
month
\n",
+ "
day
\n",
+ "
year
\n",
+ "
plot_id
\n",
+ "
species_id
\n",
+ "
sex
\n",
+ "
hindfoot_length
\n",
+ "
weight
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
1
\n",
+ "
7
\n",
+ "
16
\n",
+ "
1977
\n",
+ "
2
\n",
+ "
NL
\n",
+ "
M
\n",
+ "
32.0
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
2
\n",
+ "
7
\n",
+ "
16
\n",
+ "
1977
\n",
+ "
3
\n",
+ "
NL
\n",
+ "
M
\n",
+ "
33.0
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
3
\n",
+ "
7
\n",
+ "
16
\n",
+ "
1977
\n",
+ "
2
\n",
+ "
DM
\n",
+ "
F
\n",
+ "
37.0
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
4
\n",
+ "
7
\n",
+ "
16
\n",
+ "
1977
\n",
+ "
7
\n",
+ "
DM
\n",
+ "
M
\n",
+ "
36.0
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
5
\n",
+ "
7
\n",
+ "
16
\n",
+ "
1977
\n",
+ "
3
\n",
+ "
DM
\n",
+ "
M
\n",
+ "
35.0
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
\n",
+ "
\n",
+ "
35543
\n",
+ "
35544
\n",
+ "
12
\n",
+ "
31
\n",
+ "
2002
\n",
+ "
15
\n",
+ "
US
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
35544
\n",
+ "
35545
\n",
+ "
12
\n",
+ "
31
\n",
+ "
2002
\n",
+ "
15
\n",
+ "
AH
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
35545
\n",
+ "
35546
\n",
+ "
12
\n",
+ "
31
\n",
+ "
2002
\n",
+ "
15
\n",
+ "
AH
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
35546
\n",
+ "
35547
\n",
+ "
12
\n",
+ "
31
\n",
+ "
2002
\n",
+ "
10
\n",
+ "
RM
\n",
+ "
F
\n",
+ "
15.0
\n",
+ "
14.0
\n",
+ "
\n",
+ "
\n",
+ "
35547
\n",
+ "
35548
\n",
+ "
12
\n",
+ "
31
\n",
+ "
2002
\n",
+ "
7
\n",
+ "
DO
\n",
+ "
M
\n",
+ "
36.0
\n",
+ "
51.0
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
35548 rows × 9 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " record_id month day year plot_id species_id sex hindfoot_length \\\n",
+ "0 1 7 16 1977 2 NL M 32.0 \n",
+ "1 2 7 16 1977 3 NL M 33.0 \n",
+ "2 3 7 16 1977 2 DM F 37.0 \n",
+ "3 4 7 16 1977 7 DM M 36.0 \n",
+ "4 5 7 16 1977 3 DM M 35.0 \n",
+ "... ... ... ... ... ... ... ... ... \n",
+ "35543 35544 12 31 2002 15 US NaN NaN \n",
+ "35544 35545 12 31 2002 15 AH NaN NaN \n",
+ "35545 35546 12 31 2002 15 AH NaN NaN \n",
+ "35546 35547 12 31 2002 10 RM F 15.0 \n",
+ "35547 35548 12 31 2002 7 DO M 36.0 \n",
+ "\n",
+ " weight \n",
+ "0 NaN \n",
+ "1 NaN \n",
+ "2 NaN \n",
+ "3 NaN \n",
+ "4 NaN \n",
+ "... ... \n",
+ "35543 NaN \n",
+ "35544 NaN \n",
+ "35545 NaN \n",
+ "35546 14.0 \n",
+ "35547 51.0 \n",
+ "\n",
+ "[35548 rows x 9 columns]"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "surveys_df[0:1]\n",
+ "surveys_df[:4] \n",
+ "surveys_df[:-1] "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4fc54c12",
+ "metadata": {},
+ "source": [
+ "### Exercise 4\n",
+ "What happens in the following two examples?\n",
+ "\n",
+ "- ```surveys_df.iloc[0:4, 1:4]```;\n",
+ "- ```surveys_df.loc[0:4, 1:4]```.\n",
+ "\n",
+ "How are the two commands different? \n",
+ "\n",
+ "> Answer: `iloc` selects by integer position, `loc` selects by label. The first command returns the first 4 rows (positions 0 to 3) and the columns at integer positions 1, 2 and 3 (`month`, `day` and `year`). The second command asks for the rows with labels 0 up to and including 4 (label slices in `loc` include the end point) and for the columns with labels 1 up to and including 4. The column labels are strings, not integers, so the second command raises a `TypeError`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "6680a6fe",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " month day year\n",
+ "0 7 16 1977\n",
+ "1 7 16 1977\n",
+ "2 7 16 1977\n",
+ "3 7 16 1977\n",
+ "Unexpected exception formatting exception. Falling back to standard exception\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Traceback (most recent call last):\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 3460, in run_code\n",
+ " exec(code_obj, self.user_global_ns, self.user_ns)\n",
+ " File \"/tmp/ipykernel_16478/1314140639.py\", line 2, in \n",
+ " surveys_df.loc[0:4, 1:4] # the function loc works with indices for rows (0:4),\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexing.py\", line 1067, in __getitem__\n",
+ " return self._getitem_tuple(key)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexing.py\", line 1256, in _getitem_tuple\n",
+ " return self._getitem_tuple_same_dim(tup)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexing.py\", line 924, in _getitem_tuple_same_dim\n",
+ " retval = getattr(retval, self.name)._getitem_axis(key, axis=i)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexing.py\", line 1290, in _getitem_axis\n",
+ " return self._get_slice_axis(key, axis=axis)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexing.py\", line 1324, in _get_slice_axis\n",
+ " indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, slice_obj.step)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py\", line 6559, in slice_indexer\n",
+ " start_slice, end_slice = self.slice_locs(start, end, step=step)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py\", line 6767, in slice_locs\n",
+ " start_slice = self.get_slice_bound(start, \"left\")\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py\", line 6676, in get_slice_bound\n",
+ " label = self._maybe_cast_slice_bound(label, side)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py\", line 6623, in _maybe_cast_slice_bound\n",
+ " raise self._invalid_indexer(\"slice\", label)\n",
+ "TypeError: cannot do slice indexing on Index with these indexers [1] of type int\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 2057, in showtraceback\n",
+ " stb = self.InteractiveTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1288, in structured_traceback\n",
+ " return FormattedTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1177, in structured_traceback\n",
+ " return VerboseTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1030, in structured_traceback\n",
+ " formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 935, in format_exception_as_a_whole\n",
+ " self.get_records(etb, number_of_lines_of_context, tb_offset) if etb else []\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 987, in get_records\n",
+ " style = stack_data.style_with_executing_node(style, self._tb_highlight)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/stack_data/core.py\", line 455, in style_with_executing_node\n",
+ " class NewStyle(style):\n",
+ " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 91, in __new__\n",
+ " ndef[4] = colorformat(styledef[3:])\n",
+ " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 58, in colorformat\n",
+ " assert False, \"wrong color format %r\" % text\n",
+ "AssertionError: wrong color format 'ansiyellow'\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(surveys_df.iloc[0:4, 1:4])\n",
+ "surveys_df.loc[0:4, 1:4] "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7b12cdaf",
+ "metadata": {},
+ "source": [
+ "### Exercise 5\n",
+ "- Create a new DataFrame that only contains observations from the original with sex values that are not female or male. Print the number of rows in this new DataFrame. Verify the result by comparing the number of rows in the new DataFrame with the number of rows in the surveys DataFrame where sex is NaN (hint: there is a function `isnull`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "53f7777a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of rows: 2511\n",
+ "Unique values in column 'sex': [nan]\n"
+ ]
+ }
+ ],
+ "source": [
+ "df = surveys_df[(surveys_df['sex'] != 'M') & (surveys_df['sex'] != 'F')]\n",
+ "print(\"Number of rows not female or male:\", len(df))\n",
+ "print(\"Number of rows NaN:\", len(surveys_df[surveys_df['sex'].isnull()]))\n",
+ "print(\"Unique values in column 'sex':\", df['sex'].unique())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b34b321b",
+ "metadata": {},
+ "source": [
+ "### Exercise 6: Putting it all together \n",
+ "1. Clean the column *sex* (leave out samples of which we do not know whether they are male or female) and save the result as a new dataframe `clean_df`.\n",
+ "2. Fill undefined *weight* values with the mean of all valid weights in `surveys_df`.\n",
+ "3. Calculate the average weight of that new DataFrame `clean_df`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "570cbd6c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Average weight of surveys_df: 42.672428212991356\n",
+ "Average weight of clean_df: 42.60316325896464\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Step 1\n",
+ "# sex is 'F' or 'M'. The `|` means or.\n",
+ "clean_df = surveys_df[(surveys_df['sex']=='F') | (surveys_df['sex']=='M')]\n",
+ "# Alternative solution: select the rows where sex is not null. The `~` means not.\n",
+ "clean_df = surveys_df[~(surveys_df['sex'].isnull())]\n",
+ "\n",
+ "# Step 2\n",
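+ "# fillna returns a new object, so the result has to be assigned back to clean_df.\n",
+ "clean_df = clean_df.assign(weight=clean_df['weight'].fillna(surveys_df['weight'].mean()))\n",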
+ "\n",
+ "# Step 3\n",
+ "print(\"Average weight of surveys_df:\", surveys_df.weight.mean())\n",
+ "print(\"Average weight of clean_df:\", clean_df.weight.mean())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ccb33c2e",
+ "metadata": {},
+ "source": [
+ "### Exercise 7\n",
+ "Let's see in which plots animals get more food. Calculate the average weight per plot! Complete the code below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "733aba46",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "plot_id\n",
+ "1 51.822911\n",
+ "2 52.251688\n",
+ "3 32.654386\n",
+ "4 47.928189\n",
+ "5 40.947802\n",
+ "6 36.738893\n",
+ "7 20.663009\n",
+ "8 47.758001\n",
+ "9 51.432358\n",
+ "10 18.541219\n",
+ "11 43.451757\n",
+ "12 49.496169\n",
+ "13 40.445660\n",
+ "14 46.277199\n",
+ "15 27.042578\n",
+ "16 24.585417\n",
+ "17 47.889593\n",
+ "18 40.005922\n",
+ "19 21.105166\n",
+ "20 48.665303\n",
+ "21 24.627794\n",
+ "22 54.146379\n",
+ "23 19.634146\n",
+ "24 43.679167\n",
+ "Name: weight, dtype: float64"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "grouped_data = surveys_df.groupby(\"plot_id\")\n",
+ "grouped_data['weight'].mean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2bccb9da",
+ "metadata": {},
+ "source": [
+ "### Exercise 8\n",
+ "See below a more complex grouping example. Investigate the group keys and row indexes for this more complex grouping example. \n",
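+ "Why are there more than 48 groups? Answer: rows in which *sex* is NaN are not ignored when building the groups; they form a third set of 24 groups next to 'F' and 'M' (3 x 24 = 72).\n",
+ "Calculate the average weight per group.\n",
+ "What happened to the third group and why does it not turn up in our statistics? Answer: the third group has NaN as its *sex* key. When aggregating, `groupby` by default leaves out groups whose key is NaN (`dropna=True`), so this group is not included in the statistics."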
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "ee674ae9",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "72\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "dict_keys([('F', 1), ('F', 2), ('F', 3), ('F', 4), ('F', 5), ('F', 6), ('F', 7), ('F', 8), ('F', 9), ('F', 10), ('F', 11), ('F', 12), ('F', 13), ('F', 14), ('F', 15), ('F', 16), ('F', 17), ('F', 18), ('F', 19), ('F', 20), ('F', 21), ('F', 22), ('F', 23), ('F', 24), ('M', 1), ('M', 2), ('M', 3), ('M', 4), ('M', 5), ('M', 6), ('M', 7), ('M', 8), ('M', 9), ('M', 10), ('M', 11), ('M', 12), ('M', 13), ('M', 14), ('M', 15), ('M', 16), ('M', 17), ('M', 18), ('M', 19), ('M', 20), ('M', 21), ('M', 22), ('M', 23), ('M', 24), (nan, 1), (nan, 2), (nan, 3), (nan, 4), (nan, 5), (nan, 6), (nan, 7), (nan, 8), (nan, 9), (nan, 10), (nan, 11), (nan, 12), (nan, 13), (nan, 14), (nan, 15), (nan, 16), (nan, 17), (nan, 18), (nan, 19), (nan, 20), (nan, 21), (nan, 22), (nan, 23), (nan, 24)])"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "grouped_data = surveys_df.groupby(['sex', 'plot_id'])\n",
+ "print(len(grouped_data.groups))\n",
+ "grouped_data.groups.keys()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "3d9b5ba0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "sex plot_id\n",
+ "F 1 46.311138\n",
+ " 2 52.561845\n",
+ " 3 31.215349\n",
+ " 4 46.818824\n",
+ " 5 40.974806\n",
+ " 6 36.352288\n",
+ " 7 20.006135\n",
+ " 8 45.623011\n",
+ " 9 53.618469\n",
+ " 10 17.094203\n",
+ " 11 43.515075\n",
+ " 12 49.831731\n",
+ " 13 40.524590\n",
+ " 14 47.355491\n",
+ " 15 26.670236\n",
+ " 16 25.810427\n",
+ " 17 48.176201\n",
+ " 18 36.963514\n",
+ " 19 21.978599\n",
+ " 20 52.624406\n",
+ " 21 25.974832\n",
+ " 22 53.647059\n",
+ " 23 20.564417\n",
+ " 24 47.914405\n",
+ "M 1 55.950560\n",
+ " 2 51.391382\n",
+ " 3 34.163241\n",
+ " 4 48.888119\n",
+ " 5 40.708551\n",
+ " 6 36.867388\n",
+ " 7 21.194719\n",
+ " 8 49.641372\n",
+ " 9 49.519309\n",
+ " 10 19.971223\n",
+ " 11 43.366197\n",
+ " 12 48.909710\n",
+ " 13 40.097754\n",
+ " 14 45.159378\n",
+ " 15 27.523691\n",
+ " 16 23.811321\n",
+ " 17 47.558853\n",
+ " 18 43.546952\n",
+ " 19 20.306878\n",
+ " 20 44.197279\n",
+ " 21 22.772622\n",
+ " 22 54.572531\n",
+ " 23 18.941463\n",
+ " 24 39.321503\n",
+ "Name: weight, dtype: float64"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "grouped_data['weight'].mean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2336ac85",
+ "metadata": {},
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b0f1ab75",
+ "metadata": {},
+ "source": [
+ "### Exercise 9\n",
+ "Would it make sense to group our data frame by the column *weight*? Why or why not?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bbff11b3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of rows: 35549\n",
+ "256\n",
+ "255\n"
+ ]
+ }
+ ],
+ "source": [
+ "# In real life nearly every sample has a unique weight, so nearly every sample would \n",
+ "# be placed in its own group.\n",
+ "# In our training data you can see that there are many distinct values for weight, so\n",
+ "# it is usually not a good idea to categorise (group) data on such values.\n",
+ "print(\"Number of rows:\", len(surveys_df))\n",
+ "print(len(surveys_df['weight'].unique())) #includes nan\n",
+ "print(len(surveys_df.groupby(['weight']).groups)) #does not include nan"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "79ae54cf",
+ "metadata": {},
+ "source": [
+ "### Exercise 10\n",
+ "In the given example of vertical concatenation, you concatenated two DataFrames with the same columns. What would happen if the two DataFrames to concatenate have a different number of columns and different column names?\n",
+ "\n",
+ " 1. Create a new DataFrame using the last 10 rows of the species DataFrame (`species_df`);\n",
+ " 2. Concatenate vertically `surveys_df_sub_first10` and your just created DataFrame;\n",
+ " 3. Print the concatenated DataFrame info on the screen. How many rows does it have? What happened to the columns? Explain why you get this result."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "18b13126",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Unexpected exception formatting exception. Falling back to standard exception\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Traceback (most recent call last):\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 3460, in run_code\n",
+ " exec(code_obj, self.user_global_ns, self.user_ns)\n",
+ " File \"/tmp/ipykernel_16478/1143081575.py\", line 1, in \n",
+ " species_df = pd.read_csv(\"../data/species.csv\")\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/util/_decorators.py\", line 211, in wrapper\n",
+ " return func(*args, **kwargs)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/util/_decorators.py\", line 331, in wrapper\n",
+ " return func(*args, **kwargs)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py\", line 950, in read_csv\n",
+ " return _read(filepath_or_buffer, kwds)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py\", line 605, in _read\n",
+ " parser = TextFileReader(filepath_or_buffer, **kwds)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py\", line 1442, in __init__\n",
+ " self._engine = self._make_engine(f, self.engine)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py\", line 1735, in _make_engine\n",
+ " self.handles = get_handle(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/pandas/io/common.py\", line 856, in get_handle\n",
+ " handle = open(\n",
+ "FileNotFoundError: [Errno 2] No such file or directory: '../data/species.csv'\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 2057, in showtraceback\n",
+ " stb = self.InteractiveTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1288, in structured_traceback\n",
+ " return FormattedTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1177, in structured_traceback\n",
+ " return VerboseTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1030, in structured_traceback\n",
+ " formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 935, in format_exception_as_a_whole\n",
+ " self.get_records(etb, number_of_lines_of_context, tb_offset) if etb else []\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 987, in get_records\n",
+ " style = stack_data.style_with_executing_node(style, self._tb_highlight)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/stack_data/core.py\", line 455, in style_with_executing_node\n",
+ " class NewStyle(style):\n",
+ " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 91, in __new__\n",
+ " ndef[4] = colorformat(styledef[3:])\n",
+ " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 58, in colorformat\n",
+ " assert False, \"wrong color format %r\" % text\n",
+ "AssertionError: wrong color format 'ansiyellow'\n"
+ ]
+ }
+ ],
+ "source": [
+ "species_df = pd.read_csv(\"../../course_materials/data/species.csv\")\n",
+ "species_df_sub_last10 = species_df.tail(10)\n",
+ "\n",
+ "surveys_df_sub_first10 = surveys_df.head(10)\n",
+ "vert_concat = pd.concat([surveys_df_sub_first10, species_df_sub_last10], axis=0)\n",
+ "\n",
+ "vert_concat"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "aa6d4ee8",
+ "metadata": {},
+ "source": [
+ "We get a total of 20 rows and 12 columns. The original dataframes together had a total of 13 columns. As they both have a column `species_id`, this one is collapsed. All other columns are padded with `NaN` values.\n",
+ "We expect 20 rows, as we are putting two DataFrames of 10 rows one after the other. The padding of the columns happens because these two DataFrames do not have the same column names. To keep all the information that was in the original DataFrames, the padding of columns that occur in only one of the two is necessary."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9478026d",
+ "metadata": {},
+ "source": [
+ "### Exercise 11\n",
+ "In the given example of horizontal concatenation, you first concatenated two DataFrames with different indices, then reset the indices of the second one. Based on the outcome of these two cases, try to answer the following questions:\n",
+ " 1. What happens when you concatenate horizontally two DataFrames with different indexing?\n",
+ " 2. What happens when you concatenate horizontally two DataFrames with the same columns?\n",
+ " 3. What happens when you try to select a column of the `horizontal_stack` DataFrame we just created?\n",
+ " 4. How can you select a specific column, when multiple columns share a name?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b5b01c88",
+ "metadata": {},
+ "source": [
+ "1. The columns of both DataFrames are kept, duplicates are not merged, and the rows of both DataFrames are kept as well, no merging there either. This results in the two original DataFrames appearing in a checkerboard pattern in the resulting DataFrame, with the empty spaces padded out with `NaN` values.\n",
+ "2. Columns still are not merged, but now, rows with a common index are merged. This means that the information of both rows is put into a single row in the resulting DataFrame. If no corresponding row exists in the other DataFrame, the row is still padded with `NaN`'s in the result. But if a corresponding row (with the same index) *does* exist, this is no longer necessary."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0faee1fd",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Unexpected exception formatting exception. Falling back to standard exception\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Traceback (most recent call last):\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 3460, in run_code\n",
+ " exec(code_obj, self.user_global_ns, self.user_ns)\n",
+ " File \"/tmp/ipykernel_16478/3936511566.py\", line 4, in \n",
+ " horizontal_stack = pd.concat([surveys_df_sub_first10, surveys_df_sub_last10], axis=1)\n",
+ "NameError: name 'surveys_df_sub_first10' is not defined\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 2057, in showtraceback\n",
+ " stb = self.InteractiveTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1288, in structured_traceback\n",
+ " return FormattedTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1177, in structured_traceback\n",
+ " return VerboseTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1030, in structured_traceback\n",
+ " formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 935, in format_exception_as_a_whole\n",
+ " self.get_records(etb, number_of_lines_of_context, tb_offset) if etb else []\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 987, in get_records\n",
+ " style = stack_data.style_with_executing_node(style, self._tb_highlight)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/stack_data/core.py\", line 455, in style_with_executing_node\n",
+ " class NewStyle(style):\n",
+ " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 91, in __new__\n",
+ " ndef[4] = colorformat(styledef[3:])\n",
+ " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 58, in colorformat\n",
+ " assert False, \"wrong color format %r\" % text\n",
+ "AssertionError: wrong color format 'ansiyellow'\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 3.\n",
+ "surveys_df_sub_last10 = surveys_df.tail(10)\n",
+ "surveys_df_sub_last10 = surveys_df_sub_last10.reset_index(drop=True)\n",
+ "horizontal_stack = pd.concat([surveys_df_sub_first10, surveys_df_sub_last10], axis=1)\n",
+ "horizontal_stack['species_id']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4f860304",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Unexpected exception formatting exception. Falling back to standard exception\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Traceback (most recent call last):\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 3460, in run_code\n",
+ " exec(code_obj, self.user_global_ns, self.user_ns)\n",
+ " File \"/tmp/ipykernel_16478/4095435365.py\", line 2, in \n",
+ " horizontal_stack.iloc[:,5]\n",
+ "NameError: name 'horizontal_stack' is not defined\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py\", line 2057, in showtraceback\n",
+ " stb = self.InteractiveTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1288, in structured_traceback\n",
+ " return FormattedTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1177, in structured_traceback\n",
+ " return VerboseTB.structured_traceback(\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 1030, in structured_traceback\n",
+ " formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 935, in format_exception_as_a_whole\n",
+ " self.get_records(etb, number_of_lines_of_context, tb_offset) if etb else []\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/IPython/core/ultratb.py\", line 987, in get_records\n",
+ " style = stack_data.style_with_executing_node(style, self._tb_highlight)\n",
+ " File \"/home/jelle/.local/lib/python3.8/site-packages/stack_data/core.py\", line 455, in style_with_executing_node\n",
+ " class NewStyle(style):\n",
+ " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 91, in __new__\n",
+ " ndef[4] = colorformat(styledef[3:])\n",
+ " File \"/usr/lib/python3/dist-packages/pygments/style.py\", line 58, in colorformat\n",
+ " assert False, \"wrong color format %r\" % text\n",
+ "AssertionError: wrong color format 'ansiyellow'\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 4.\n",
+ "horizontal_stack.iloc[:,5]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d6382b5f-9c13-4ceb-88f6-f7ba13f797f7",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### Exercise 12\n",
+ "Time to play with plots! Look at the pandas.DataFrame.plot() documentation [here](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.html) and change your data visualization by swapping the DataFrame columns and changing the axis labels.\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "1896c89b-f2c2-41ae-9a18-9b9fb8a87648",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 0.98, 'Scatter plot of weight versus hindfoot length')"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "