cmsc320.tex

% Adapted from Alex Reustle's CMSC351 Course Notes

% This program is free software: you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation, either version 3 of the License, or
% (at your option) any later version.

% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
% GNU General Public License for more details.

% You should have received a copy of the GNU General Public License
% along with this program.  If not, see <http://www.gnu.org/licenses/>.
\documentclass[english, 10pt]{article}

\usepackage{notes}
\usepackage{gensymb}
\usepackage{inconsolata}
\usepackage[shellescape]{gmp}
\allowdisplaybreaks%
\newcommand{\thiscoursecode}{CMSC 320}
\newcommand{\thiscoursename}{Introduction to Data Science}
\newcommand{\thisprof}{Prof.\ John Dickerson}
\newcommand{\email}{apraveen@cs.umd.edu}
\newcommand{\me}{Akilesh Praveen}
\newcommand{\thisterm}{Fall 2020}
\newcommand{\website}{https://cmsc320.github.io}%chktex 8
\usepackage{ifpdf}
\ifpdf%
\DeclareGraphicsRule{*}{mps}{*}{}
\fi
% \listfiles

\usepackage[utf8]{inputenc}

\usepackage{amsmath}
\usepackage{titlesec}% http://ctan.org/pkg/titlesec

 
\usepackage{listings}
\lstloadlanguages{Python}
\usepackage{xcolor}
\usetikzlibrary{patterns}
\usepackage[many]{tcolorbox}
 
\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.95,0.95,0.94}
\definecolor{codered}{rgb}{0.5,0.15,0.15}
\definecolor{commentred}{rgb}{1,0.01,0.02}
 
\lstdefinestyle{mystyle}{
    backgroundcolor=\color{backcolour},   
    commentstyle=\color{codegreen},
    keywordstyle=\color{red},
    numberstyle=\tiny\color{codegray},
    stringstyle=\color{codered},
    basicstyle=\ttfamily\footnotesize,
    breakatwhitespace=false,         
    breaklines=true,                 
    captionpos=b,                    
    keepspaces=true,
    xleftmargin=.15\textwidth,
    xrightmargin=.15\textwidth,
    linewidth=\textwidth,                 
    numbers=left,                    
    numbersep=5pt,                  
    showspaces=false,                
    showstringspaces=false,
    showtabs=false,                  
    tabsize=2,
    belowskip=3em,
    aboveskip=3em,
}

\lstset{style=mystyle}


% \VerbEnvir{align tikzpicture algorithm}
%%%Headers
\chead{320 - Intro to Data Science}
\lhead{\thisterm}

%%%%% TITLE %%%%%
\graphicspath{{../}}
\newcommand{\notefront}{%
\pagenumbering{arabic}
\begin{center}
{\small}
\textbf{\Huge{\noun{\thiscoursecode}}}
{\Huge \par}
{\Large{\noun{\thiscoursename}}}\\
\vspace{0.1in}
\vspace{0in}\includegraphics[scale=0.3]{umd_cs.jpg} \\
\vspace{0.1in}{\noun\me} $\bullet$ {\ttfamily \url{\email}} \\
{\noun\thisprof} \ $\bullet$ \ {\noun\thisterm} \ $\bullet$ \ {\noun{University of Maryland}} \\
{\ttfamily \url{\website}} \\
\end{center}
}

 \tikzstyle{class}=[
    rectangle,
    draw=black,
    text centered,
    anchor=north,
    text=black,
    text width=2cm,
    shading=axis,
    bottom color={rgb:red,222;green,222;blue,222},
    top color=white,shading angle=45]


\newtcolorbox{myproof}[1][]{
   enhanced,arc=0pt, frame hidden, borderline west = {1pt}{0pt}{black}, #1,breakable
}

\begin{document}
% \renewcommand\familydefault{\sfdefault}
% \sffamily
  % Notes front
  \notefront%
  % Table of Contents and List of Figures
  \tocandfigures%
  
\section{Notes \& Preface}

Course notes for CMSC320, under Prof. John Dickerson. Notes collected from previous and current lectures.

\section{Introduction (L1)}

\subsection{What is Data Science?}

Data Science is the application of computation and statistical techniques to address or gain insight.
It's the intersection of statistics and Computer Science.
Based on what I've learned thus far, learning to do data science is like learning how to use a TI-84 in statistics class.
You're simply learning how to leverage programming tools in order to perform advanced, complex, and meaningful data-related operations.\newline

It's the use of statistics and computer science in order to find real-world insights.\newline\newline

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Circle [id:dp061140258122834745] 
\draw  [fill={rgb, 255:red, 255; green, 219; blue, 224 }  ,fill opacity=1 ] (254.67,119.5) .. controls (254.67,77.8) and (288.47,44) .. (330.17,44) .. controls (371.86,44) and (405.67,77.8) .. (405.67,119.5) .. controls (405.67,161.2) and (371.86,195) .. (330.17,195) .. controls (288.47,195) and (254.67,161.2) .. (254.67,119.5) -- cycle ;
%Straight Lines [id:da4450208385410075] 
\draw    (205.67,89.33) -- (246.48,105.91) ;
\draw [shift={(248.33,106.67)}, rotate = 202.11] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da055339322100053545] 
\draw    (457,84.67) -- (418.13,103.78) ;
\draw [shift={(416.33,104.67)}, rotate = 333.81] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da5012858238681945] 
\draw    (330.17,240) -- (330.17,211.33) ;
\draw [shift={(330.17,209.33)}, rotate = 450] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (274.67,112.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {DATA SCIENCE};
% Text Node
\draw (125.33,62) node [anchor=north west][inner sep=0.75pt]   [align=left] {CS Hacking Skills};
% Text Node
\draw (428.67,44.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Math + Statistics\\Knowledge};
% Text Node
\draw (158.17,248.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {Common sense (real world substantive experience)};


\end{tikzpicture}

}

\subsection{Topics}

Here are the general topics that this class will cover.

\begin{itemize}
	\item Processing data
	\item Visualizing data
	\item Understanding data
	\item Communicating data
	\item Extracting value from data
\end{itemize}

\subsection{Tools}

Here are some tools commonly employed by data scientists. We'll try to cover how to use most of them here.

\begin{itemize}
	\item Python
	\item Scikit-Learn
	\item Docker
	\item PANDAS
	\item Spark
	\item TensorFlow
\end{itemize}

\subsection{Conda}

Conda is a package and environment manager for python that we can use with the command line.
We can create multiple environments for us and install separate packages in each of them.
This will be highly useful to us, as we sometimes want to consolidate the tools we use into separate environments.

\section{Python, Jupyter, Literate Programming (L2)}

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Data Collection} $\rightarrow$ The process of measuring and gathering information on targeted variables.
\end{tcolorbox}

\subsection{Literate Programming}

The idea of \textbf{literate programming} is that you have the source code, an explanation of the source code, and the end result of running the code all in one file. Usually, this file is identified as a \textit{notebook}. In other words, the syntax is no different from regular code, you just get a more organized way to show off tables, plots, and other outputs generated from your code.

\subsection{Jupyter Notebook + Alternatives}

Jupyter Notebook is a service that started off as \texttt{iPython}, but it's basically a web-based platform that we use for literate programming. Specifically, it supports Python-based literate programming. Most data scientists prefer it, and it can also apparently leverage big data tools, such as Apache Spark.\newline

It saves files in \texttt{.ipynb} format, which most platforms (i.e. GitHub) have built in viewers for. Options to export in other readable formats are available. Basically, it's just Python with a bunch of bells and whistles on top to make the output of your code look pretty.\newline

\textit{Apache Zeppelin} is an alternative data analysis tool, but we will stick to Jupyter for our purposes. This is because Jupyter seems to be preferred in industry.\newline

\textit{RStudio} is the equivalent, for people who prefer to use the \texttt{R} programming language for data science.\newline

This course will be centered around Jupyter Notebook.

\subsection{List Comprehensions in Python}

To make lists in Python, you can use loops or the \texttt{map()} function, but a \textit{pythonic} way of doing this would be to use a list comprehension. Below is a simple example.\newline

\begin{myproof}
\textbf{Example:} Make a list of all the squares of \texttt{\{0,1,2,3,4,5,6,7,8,9\}}\newline\newline
\textbf{List Comprehension:}\newline
\texttt{squares = [i * i for i in range(10)]}
\end{myproof}

A good way of thinking about this is that it allows you to build sets like a mathematician. This is a common theme in data science, where we can find the intersection between a lot of math stuff and computer science stuff. It's good to know how lists are generated in a mathematical sense in Python for that reason. Here's an example where we translate mathematical notation into a Python list comprehension.\\

\begin{myproof}
\textbf{Example:} Make a list of all odd natural numbers from 0 to 999\\\\
\textbf{Math Notation:}\\ $E =$ \{$x$ | $x$ in $\mathbb{N} \land x$ is odd $\land x<1000$ \}\\\\
\textbf{List Comprehension:}\\
\texttt{E = [x for x in range(1000) if x \% 2 != 0]}
\end{myproof}

\subsection{Using Python3}

We will use Python3. Since I used Python2 during my internship, I'm going to note some big changes to keep track of.\\

\begin{itemize}
	\item Python3 is backwards incompatible. (Don't write in Python2!)
	\item Print has changed from a command to a function, so make sure to use proper function notation when invoking it.
	\item Division has changed. \texttt{1/2} no longer equals 0. \texttt{1/2 == 0.5} and floored division is now taken care of this way: \texttt{1//2 == 0}
\end{itemize}

\subsection{Python vs. R for Data Scientists}

Some arguments for both sides in terms of what to use.

\begin{itemize}
	\item Python is a 'full' programming langauge. Also, if you've got prior experience with Python paradigms or just programming in general, that's a big plus in terms of learning curve.
	\item R has more mature 'pure statistics' libraries, but Python is apparently catching up.
	\item In terms of \textbf{processing speed}, R is certainly faster. It was designed and optimized for statistics processing.
	\item Python is preferable for machine learning operations, which is pretty big right now.
\end{itemize}

My personal choice will be to use Python as much as I can when I'm studying this course. Since it's more prominent in the tech industry, I should be using it more anyway.

\subsection{The Classic Statistical View of Data}

There are \textbf{four} main types of data: Nominal, Ordinal, Interval, and Ratio data. They can each be classified under two main subgroups, Categorical and Numerical data. Here's a visualization.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp8149613399348021] 
\draw   (72,131) -- (182.33,131) -- (182.33,170.33) -- (72,170.33) -- cycle ;
%Shape: Rectangle [id:dp680618887900226] 
\draw   (239.67,91.67) -- (350,91.67) -- (350,131) -- (239.67,131) -- cycle ;
%Shape: Rectangle [id:dp777477779359951] 
\draw   (239.67,170.33) -- (350,170.33) -- (350,209.67) -- (239.67,209.67) -- cycle ;
%Shape: Rectangle [id:dp9580214955547155] 
\draw   (433.67,53.67) -- (544,53.67) -- (544,93) -- (433.67,93) -- cycle ;
%Shape: Rectangle [id:dp19525566557447283] 
\draw   (433.67,100.5) -- (544,100.5) -- (544,139.83) -- (433.67,139.83) -- cycle ;
%Shape: Rectangle [id:dp18958725273981114] 
\draw   (433.67,159.67) -- (544,159.67) -- (544,199) -- (433.67,199) -- cycle ;
%Shape: Rectangle [id:dp27616820331988945] 
\draw   (433.67,207.67) -- (544,207.67) -- (544,247) -- (433.67,247) -- cycle ;
%Straight Lines [id:da07504069427898741] 
\draw    (182.33,150.5) -- (238.04,110.5) ;
\draw [shift={(239.67,109.33)}, rotate = 504.32] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da18494381831895634] 
\draw    (182.33,150.5) -- (238.08,193.45) ;
\draw [shift={(239.67,194.67)}, rotate = 217.61] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da49166239871511697] 
\draw    (350.33,110.67) -- (431.84,74.15) ;
\draw [shift={(433.67,73.33)}, rotate = 515.87] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da6935259459415319] 
\draw    (350.33,110.67) -- (431.68,119.13) ;
\draw [shift={(433.67,119.33)}, rotate = 185.94] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da6050313931467277] 
\draw    (350.33,190.67) -- (431.85,227.84) ;
\draw [shift={(433.67,228.67)}, rotate = 204.51] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da8985137963347964] 
\draw    (350.33,188.67) -- (431.68,178.25) ;
\draw [shift={(433.67,178)}, rotate = 532.71] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (110,142.17) node [anchor=north west][inner sep=0.75pt]   [align=left] {Data};
% Text Node
\draw (254,102) node [anchor=north west][inner sep=0.75pt]   [align=left] {Categorical};
% Text Node
\draw (259.33,181.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Numerical};
% Text Node
\draw (461.33,65) node [anchor=north west][inner sep=0.75pt]   [align=left] {Nominal};
% Text Node
\draw (462.67,112) node [anchor=north west][inner sep=0.75pt]   [align=left] {Ordinal};
% Text Node
\draw (462.67,168.83) node [anchor=north west][inner sep=0.75pt]   [align=left] {Interval};
% Text Node
\draw (471.33,218.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Ratio};


\end{tikzpicture} \\
}

\subsubsection{Nominal Data}

A type of categorical data, nominal data value have names and describe the state of things. For example, your marriage status is nominal data because you can either be \textit{single}, \textit{married}, or \textit{separated}. Another example is the type of drink you're going to have. Will it be \textit{Milk}, \textit{Beer}, or \textit{Juice}?\\

The key here is that there can be no quantitative values assigned to each of these categories, as that would allow us to do math with them and would defeat the purpose of these labels. These values \textbf{cannot be easily compared}, so they have no material value. \textit{E.g. being single is not quantitatively better than being married (objectively), and vice versa}.\\

\begin{myproof}
\textbf{Example:} What is your marital status?
\begin{itemize}
	\item Married
	\item Divorced (separated)
	\item Single
\end{itemize}
\end{myproof}

\subsubsection{Ordinal Data}

Ordinal data represents values that have names that describe the state of things, but in this case, there \textbf{is} an ordering of those values. This is what sets it apart from nominal data. \\ 
\begin{myproof}
\textbf{Example:} What did you think of the movie?
\begin{itemize}
	\item Strongly liked
	\item Liked
	\item Indifferent
	\item Disliked
	\item Strongly Disliked
\end{itemize}
\end{myproof}

Given how subjective some of these things can be, the distinction between nominal and ordinal can be \textbf{blurry} at times. For example, going back to our nominal example, some people may think that being single is quantitatively better than being married.

\subsubsection{Interval and Ratio Data}

Interval and Ratio data are pretty similar, and both can be used to measure things that can be represented by either integers or real numbers. \\

\textbf{Interval} data scales with fixed but arbitrary values. That might sound silly, but a good example is \textbf{dates}. Below is an example of two data comparisons of interval data that seem arbitrary, but indeed hold integer value.\\

\begin{myproof}
\textbf{Example:} The following two operations are equal. \\\\
\texttt{10/1/2019 - 9/1/2019} \\
\texttt{10/1/2018 - 9/1/2018}
\end{myproof}

The measures don't look like integer values at first, but we can quantify them by marking them with days.\\

Here's what sets \textbf{Interval} data apart, however. You have \textbf{no method} of computing ratios or scales with it. For example, never mind that you can try computing \texttt{(9/1/2019 $\times$ 8/25/2015)}, the unit of the answer would be totally useless to us, and neither would the actual number, even if you went ahead with the operation.\\

\textbf{Ratio} data is, in essence, the same as interval data in that it is numerical, but the scale itself \textbf{has a true zero}. While dates don't necessarily have a true zero, we can say that money counts as ratio data. For example, having zero money means that you're at the absolute zero of that scale, whereas the absolute zero for dates is disputable. Are we saying we're starting at O A.D.? The Big Bang? Even earlier?\\

Differentiating between the two is usually a case-by-case basis thing, which is what I'm thinking is the best way to handle any conflicts I end up running into between ratio and interval data.\\

\begin{myproof}
\textbf{Example:} Interval data \\\\
Temperature on the scale of Celsius or Fahrenheit is interval-type data because $0\degree$ is set to an arbitrarily fixed point. Also, we can't scale it properly- $30\degree F$ isn't twice as hot as $15\degree F$.\\\\
\textbf{Example:} Ratio data \\\\
Temperature on the Kelvin scale is ratio data. $0K$ is set at legitimate absolute zero, and $50K$ is truly twice as cold as $100K$.
\end{myproof}

\subsection{Data Science at a Glance}

Data science is basically manipulating and computing using data. As such, we need to shift our thinking from writing \textbf{imperative} code to manipulate \textbf{data structures} to creating \textbf{sequences and pipelines} to conduct operations on \textbf{data}. That stuff is covered more in 420 and 424, for reference.\newline

More often than not, we have to take the data that we've found and make it easily understandable for humans. This is called Data Representation.\newline

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Data Representation} $\rightarrow$ The natural way to think about data, in a human way.
\end{tcolorbox}

Here are some ways that we think about data in this class:\newline

\begin{itemize}
	\item \textbf{One Dimensional Arrays} $\rightarrow$ E.g. \texttt{<'red', 'blue', 'green'>} or \texttt{<0,3,4>}. We can use functions like map, fold and filter to manipulate these.
	\item \textbf{N-Dimensional Arrays} $\rightarrow$ Also known as \textbf{tensors}.
	\begin{itemize}
		\item For example, a Tensor of dimensions \texttt{[6,4]} is just a $6\times 4$ matrix.
		\item Similarly, a Tensor of dimensions \texttt{[4,4,2]} is a 3D array.
		\item Here, we can start to make use of \textbf{Linear Algebra} for further data manipulation. Some example operations that we can use to mess with tensors:
		\begin{itemize}
			\item Matrix/Tensor Multiplications
			\item Transpose
			\item Vector Multiplication
			\item Matrix Factorization (we will explore this later)
		\end{itemize}
	\end{itemize}
	\item \textbf{Sets} of objects, or \textbf{Key/Value Pairs}
	\item \textbf{Tables/Relations} $\rightarrow$ This goes into relational databases, which is the basis of \texttt{SQL}. We'll go into this later.
	\item \textbf{Hierarchies/Trees/Graphs} $\rightarrow$ This sort of spills over into data structures, but they've got some additional nuances included with them.
	\begin{itemize}
		\item They tend to make use of 'path' queries
		\item Graph and Tree Algorithms will be useful here, efficiency is key
		\item Example: networks are represented this way, we'll cover that later in this class
	\end{itemize}
\end{itemize}

\section{Getting Data to Work With (L3)}

\subsection{Acquiring Data}

Here are some examples of how we can grab data from places. Pretty obvious, common sense stuff. We're going to explore all of these as we move forward.

\begin{itemize}
	\item Direct download from online or loading it from local storage
	\item Generate the data locally via a simulation or equivalent program
	\item Query data from a database 
	\item Query data from an API
	\item Scrape data from a website
\end{itemize}

When you pull from APIs, you're going to want to be using HTTP Requests.

\subsection{RESTful APIs}

This stands for REpresentational State Transfer APIs, and it's basically a standard that enforces that APIs do a few things. It says that they should support these basic operations:

\begin{itemize}
	\item \texttt{GET} $\rightarrow$ Query a data entry
	\item \texttt{POST} $\rightarrow$ Create a new data entry
	\item \texttt{PUT} $\rightarrow$ Update an existing data entry
	\item \texttt{DELETE} $\rightarrow$ Delete an existing data entry
\end{itemize}

RESTful APIs are also supposed to be stateless. That is, with every API request, you send a token of who you are, and you get a current capture of the data at that time/edit the data.\newline

A good example of a REST API is Github, where you can use REST API calls on your repositories.\newline

There are other guiding principles and miscellaneous guidelines for RESTful APIs, which can all be found at \texttt{https:/restfulapi.net}\newline

\begin{tcolorbox}[title=Aside: GRAPHQL,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]
\textbf{GraphQL} $\rightarrow$ REST has been adopted by many developers and is widely regarded as the traditional way to send data over HTTP. GraphQL, on the other hand, is a revolutionary new player that's presented as a way to \textit{replace} legacy REST APIs \textit{(back4apps)}
\end{tcolorbox}

\subsection{Oauth}

If you want to grant an app access to your identity without actually giving it your username and password, is there a way to do that? The answer is \textbf{yes}, because this is a common software engineering problem.\newline

\textbf{OAuth} is the standard for \textit{access delegation} used for internet users to grant websites access to their information on other sites. A pretty good example of this is Google's sign in page on other websites. How do you think other websites conduct sign in without knowing your password for your Google account?

\subsection{GET Requests}

Assume we used Python's \texttt{requests} module to query a server with a \texttt{GET} request.\newline

First, we'd get either a \texttt{CSV}, \texttt{JSON}, or \texttt{HTML/XML/XHTML} file back, in response. This is the data that we have to sift through. \textit{Note:} You might also get a domain-specific file, like an \textbf{rvt} file. You're always welcome to make your own filetype for storing data, but make sure it's actually documented somewhere.\newline

\begin{tcolorbox}[title=Aside: Parsing CSVs and JSON,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]
Never write your own \texttt{CSV} or \texttt{JSON} parsers. This is another example of reinventing the wheel. We'll use Python Libraries to do this more easily. \textit{E.g. PANDAS}
\end{tcolorbox}

\subsection{More on Data Storage Formats}

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Serialization} $\rightarrow$ The process of converting objects into strings.\newline
\textbf{Deserialization} $\rightarrow$ The process of converting strings back into objects.
\end{tcolorbox}

\textbf{\texttt{JSON}} is a pretty common format for serializing objects. Plus, serializing objects makes it easier for humans to read and perform sanity-checks on. In Python, \texttt{JSON} is built with Strings, Lists, Dictionaries, and sometimes mixes of a few of those together.\newline

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Document Object Model} $\rightarrow$ A tree-based data storage method. For example, HTML is structured this way.
\end{tcolorbox}

\subsubsection{SAX}

SAX is a lightweight way to process XML. It generates a stream of events as it parses an XML file. IT helps us pay attention to individual parts of an XML file without having to process through the rest of it.

\subsection{Parsing HTML}

Parsing HTML is the hardest to do in this case, as I've seen many times before in hackathon projects. Although HTML's specifications are pure, the real world examples of it are pretty nightmarish, thanks to how it interacts with JavaScript and loads dynamic content. All in all, it's fairly unreliable in terms of parsing it manually.\newline

In this case, we're best off using the Python library \texttt{BeautifulSoup}. We can also make use of Python's Regex, which is similar enough to Ruby regex that we worked with in 330. A website like Rubular-\texttt{https://pythex.org} will be useful in this case.\newline

By combining \texttt{BeautifulSoup}, Regular expressions, and \texttt{GET} requests, we can make the process fairly streamlined. This is usually what we'll be using to scrape websites. In order to scrape more dynamic websites, we'd probably have to make use of Selenium. Check my 320 folder to find an example of a simple webscraper with \texttt{BeautifulSoup}.

\section{NumPy, Best Practices, Ethics (L4)}

\subsection{Available Technologies}

Python has a bunch of 3rd party packages for scientific and numerical computation. Some examples are..

\begin{itemize}
	\item \textbf{\texttt{Numpy} and \texttt{Scipy}} $\rightarrow$ Numerical and scientific function libraries.
	\item \textbf{\texttt{NUMBA}} $\rightarrow$ A Python compiler supporting 'Just in Time' compilation. That is, it supports compilation of code while code is running.
	\item \textbf{\texttt{ALGLIB}} $\rightarrow$ A cross-platform numerical analysis library
	\item \textbf{\texttt{PANDAS}} $\rightarrow$ An extensive data analysis tool with some neat built-in data structures
	\item \textbf{\texttt{PyGSL}} $\rightarrow$ GNU Scientific Library in Python
	\item \textbf{Scientific Python} $\rightarrow$ A collection of scientific computing modules for Python
\end{itemize}

These are a bunch of examples of what's available to developers right now, but we won't focus on all of it. Particular emphasis will be placed on \texttt{Numpy} and \texttt{PANDAS}.

\subsection{NumPy Stack}

The \textbf{NumPy} stack is the most commonly used out of all of these packages. It includes the following:

\begin{itemize}
	\item \texttt{Numpy} - Works sort of like MatLab, just lets us handle a lot of number manipulation and mathematical operations
	\item \texttt{Matplotlib} - This is a plotting and graphing library
	\item \texttt{PANDAS} - This gives us a bunch of data structures and data analysis tools to play with/keep track of our data. (Usually, you'll want to import your data into a \texttt{PANDAS} \texttt{dataframe} or something.)
	\item \texttt{SciPy}
	\item \texttt{SymPy}
	\item \texttt{Jupyter} - This will be our medium for \textbf{literate programming}.
\end{itemize}

To see more about this stuff, search Google for the \textbf{NumPy Stack} and you'll find everything you need.

\subsubsection{Misc About NumPy}

Here are a few more notable things about \texttt{Numpy}:

\begin{itemize}
	\item It contains the \textbf{n-dimensional array} object
	\item It contains 'sophisticated' functions that we can use
	\item It provides us with excellent tools to integrate \texttt{C++}, \texttt{C}, and even \texttt{FORTRAN}
	\item It has math capabilities that are highly useful to us (e.g. Linear Algebra, Fourier Transform, etc)
	\item \texttt{Numpy} also comes with a bunch of new DataTypes for us to use.
\end{itemize}

\begin{tcolorbox}[title=Aside: Numpy Arrays,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]
Arrays in \texttt{Numpy} are different from regular lists in Python, so make sure your syntax is correct and you know the difference when you decide to use either one in practice.
\end{tcolorbox}

\subsubsection{Linear Algebra with NumPy}

One of \texttt{NumPy}'s most common uses lies within its \textbf{Linear Algebra} module. It allows us to do regular LA stuff, like \texttt{.transpose()} and \texttt{.inverse()} to matrices stored as n-dim arrays. Here's an example.

{\centering
\begin{lstlisting}[language=python]
# Note: remember, we have to use NumPy's n-dimensional array object here

array([[1.0, 2.0],
       [3.0, 4.0]]).transpose()
\end{lstlisting}
}

\subsubsection{SciPy}

\textbf{\texttt{SciPy}} includes various tools and functions for solving common problems in \textbf{scientific computing}.\\

We won't use it much for now, but it's supposed to be good to know. Often you'll be able to find higher-level \texttt{Scipy} functions that will work around the need to call lower-level \texttt{Scipy} functions. It's got a lot of functionality built in, so make sure not to overlook it.

\subsection{The Idea of Reproducibility}

Starting from the same dataset, can we reproduce your analysis and get the same results? \textbf{This is the goal that we're trying to fulfill with our analysis}- we want our stuff to be reproducible! (Otherwise, what exactly does it even mean?)

\subsection{Best Practices}

Honestly, most of this stuff should be common sense.

\begin{itemize}
	\item Use version control to keep track of code. (e.g. \texttt{git})
	\item Use unit testing. (e.g. \texttt{unittest} module in python)
	\item Use libraries when you can. (don't reinvent the wheel!)
\end{itemize}

\subsection{The Idea of Open Data}

Some data should be widely available for everyone to use as they want, without restrictions from copyright, etc.\\

This is probably where all of our free data comes from, so this idea is super helpful to us as data scientists.

\subsection{General Process}

Here's the general process for data science- just so we have an idea of what's going on.\\


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp9523245055647409] 
\draw   (34.67,19.33) -- (210.33,19.33) -- (210.33,257.33) -- (34.67,257.33) -- cycle ;
%Shape: Rectangle [id:dp26296611780141976] 
\draw   (262,19.33) -- (437.67,19.33) -- (437.67,257.33) -- (262,257.33) -- cycle ;
%Shape: Rectangle [id:dp4749898829673477] 
\draw   (491.33,19.33) -- (667,19.33) -- (667,257.33) -- (491.33,257.33) -- cycle ;
%Straight Lines [id:da3772898435405757] 
\draw    (65.67,70.33) -- (177.67,69.67) ;
%Straight Lines [id:da47086799536926727] 
\draw    (292.33,70.33) -- (404.33,69.67) ;
%Straight Lines [id:da16258158592667293] 
\draw    (525.67,69.67) -- (637.67,69) ;
%Straight Lines [id:da8259565971285442] 
\draw    (215,138.33) -- (252.33,138.33) ;
\draw [shift={(254.33,138.33)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da6335608206253658] 
\draw    (444.33,138.33) -- (481.67,138.33) ;
\draw [shift={(483.67,138.33)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (63.33,35) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textbf{Data Acquisition}};
% Text Node
\draw (292.67,27.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textbf{Algorithm / Tool}\\\textbf{Development}};
% Text Node
\draw (520.67,37) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textbf{Data Acquisition}};
% Text Node
\draw (42,131.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Acquire the data, then\\put it in a usable format.};
% Text Node
\draw (270,91.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {If new tools are required\\to analyze your data,\\create them.\\\\If new algorithms may\\be needed to analyze it,\\synthesize them.};
% Text Node
\draw (511.33,94.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {Use the tools you've\\built / setup and the \\algorithms you've \\created or \\implemented to \\analyze the data.};


\end{tikzpicture}

\hfill \break After we do that, we still technically have some programming left. In this new era of literate programming, there's one more step of processing we have to do with our results in order to make them publicly presentable and meaningful.\\


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp9523245055647409] 
\draw   (34.67,19.33) -- (667.67,19.33) -- (667.67,198.67) -- (34.67,198.67) -- cycle ;
%Straight Lines [id:da3772898435405757] 
\draw    (285.67,69.67) -- (397.67,69.67) ;

% Text Node
\draw (246,35.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textbf{Communication of Results}};
% Text Node
\draw (126,98.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Use the principles of \textbf{literate programming }to provide results, plots,\\and publication regarding your data science experiment. This gives\\you the ability to explain your choices and clearly interpret your\\results.};


\end{tikzpicture}


\hfill \break It's emphasized a lot here to think like an \textbf{algorithm developer}, as you're going to need efficiency in the data analysis that you perform. However, you also need to think like an experiment-conducting \textbf{data scientist}. We don't usually get enough training as the latter, so hopefully this course should be an introduction to that sort of stuff.

\subsection{Project Organization}

Make sure to organize your project in folders appropriately. Specifically, even if you have a lot of components, group with with a focus on experimental procedure.\\

You should certainly be isolating things like \textbf{data}, \textbf{tools}, and \textbf{experiments} into their own folders. Data could include your raw input data, along with data that you've done some processing on. Tools could include Python environment you're using, and experiments could include the meat of what your data science work will be- pipeline scripts, results, figures, plots, analysis scripts, etc.

\subsection{A Little on Bias, Ethics, Responsibility}

\begin{tcolorbox}[title=Aside: Fairness Through Blindness,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]
The concept of not letting an algorithm look at protected attributes in order to keep it from forming potentially harmful biases.
\end{tcolorbox}

\hfill \break A great example of fairness through blindness could be software that determines the outcome for a loan application. We want the results to be \textbf{independent} of an applicant's race, but they can be \textbf{dependent} on non-protected attributes, such as credit history and income.\\

\begin{tcolorbox}[title=Aside: FATML,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]
\textbf{FATML} stands for Fairness, Accountability, and Transparency in Machine Learning.
\end{tcolorbox}

\hfill \break Overall, here are some guiding principles for data ethics:

\begin{itemize}
	\item Start with clear user need, with a focus on public benefit. (Can't go wrong with this!)
	\item Use data and tools that emphasize \textbf{minimum} intrusion/invasion of privacy. (Sometimes, we have no choice but to handle sensitive data)
	\item Create robust data science models that minimize bias and focus on objective accuracy.
	\item Be alert to public perceptions.
	\item Be open and accountable for your actions.
	\item Security is key- especially if working with sensitive data.
\end{itemize}

\section{Tables, Relational DB, and Pandas (L5 + L6)}

\subsection{Tables}

Here's the idea- we can abstract data into our own little data structures just like computer scientists do, and a lot of the time, in data science, tables are the optimal way to do that. (This is why software like PANDAS and Numpy have excellent support for these structures.)\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp5949882099932227] 
\draw  [color={rgb, 255:red, 139; green, 87; blue, 42 }  ,draw opacity=1 ] (52.33,67.67) -- (118.67,67.67) -- (118.67,90.67) -- (52.33,90.67) -- cycle ;
%Shape: Rectangle [id:dp5180660849056042] 
\draw   (119,67.67) -- (185.33,67.67) -- (185.33,90.67) -- (119,90.67) -- cycle ;
%Shape: Rectangle [id:dp928756418037382] 
\draw   (185,67.67) -- (251.33,67.67) -- (251.33,90.67) -- (185,90.67) -- cycle ;
%Shape: Rectangle [id:dp982011584275448] 
\draw   (251.67,67.67) -- (318,67.67) -- (318,90.67) -- (251.67,90.67) -- cycle ;
%Shape: Rectangle [id:dp4812611593055496] 
\draw  [color={rgb, 255:red, 139; green, 87; blue, 42 }  ,draw opacity=1 ] (52.33,90.33) -- (118.67,90.33) -- (118.67,200.67) -- (52.33,200.67) -- cycle ;
%Shape: Rectangle [id:dp14867838676065692] 
\draw   (118.67,90.67) -- (185,90.67) -- (185,200.67) -- (118.67,200.67) -- cycle ;
%Shape: Rectangle [id:dp7944164207923816] 
\draw   (185.33,90.67) -- (251.67,90.67) -- (251.67,200.67) -- (185.33,200.67) -- cycle ;
%Shape: Rectangle [id:dp6914540924812163] 
\draw   (251.67,90.67) -- (318,90.67) -- (318,200.67) -- (251.67,200.67) -- cycle ;
%Straight Lines [id:da8659955223770235] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (325.67,26) -- (86.97,67.99) ;
\draw [shift={(85,68.33)}, rotate = 350.02] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da7649126412714919] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (325.67,26) -- (154.11,67.86) ;
\draw [shift={(152.17,68.33)}, rotate = 346.28999999999996] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da04591151144441219] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (325.67,26) -- (220.36,67.6) ;
\draw [shift={(218.5,68.33)}, rotate = 338.44] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da1253621546988095] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (325.67,26) -- (286.22,66.89) ;
\draw [shift={(284.83,68.33)}, rotate = 313.97] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da7853774637498458] 
\draw [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ]   (448.33,121.33) -- (317.65,106.89) ;
\draw [shift={(315.67,106.67)}, rotate = 366.31] [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da2813931715021274] 
\draw [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ]   (448.33,121.33) -- (317,125.27) ;
\draw [shift={(315,125.33)}, rotate = 358.28] [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da28293623689911906] 
\draw [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ]   (448.33,121.33) -- (316.3,146.3) ;
\draw [shift={(314.33,146.67)}, rotate = 349.28999999999996] [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da8467882522352468] 
\draw [color={rgb, 255:red, 139; green, 87; blue, 42 }  ,draw opacity=1 ]   (152,248.67) -- (86.75,212.63) ;
\draw [shift={(85,211.67)}, rotate = 388.90999999999997] [color={rgb, 255:red, 139; green, 87; blue, 42 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (78.67,71.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {id};
% Text Node
\draw (79.5,94.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 1}\\{\fontfamily{pcr}\selectfont 2}\\{\fontfamily{pcr}\selectfont 3}\\{\fontfamily{pcr}\selectfont 4}\\{\fontfamily{pcr}\selectfont 5}};
% Text Node
\draw (138.67,71.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {age};
% Text Node
\draw (196,71.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {weight};
% Text Node
\draw (263.33,71.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {height};
% Text Node
\draw (136.67,95.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 12}\\{\fontfamily{pcr}\selectfont 11}\\{\fontfamily{pcr}\selectfont 13.5}\\{\fontfamily{pcr}\selectfont 14}\\{\fontfamily{pcr}\selectfont 15}};
% Text Node
\draw (201.5,95.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 42.3}\\{\fontfamily{pcr}\selectfont 50.9}\\{\fontfamily{pcr}\selectfont 61.3}\\{\fontfamily{pcr}\selectfont 41.4}\\{\fontfamily{pcr}\selectfont 55}};
% Text Node
\draw (264.33,96.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 145.6}\\{\fontfamily{pcr}\selectfont 161.2}\\{\fontfamily{pcr}\selectfont 181.3}\\{\fontfamily{pcr}\selectfont 121.4}\\{\fontfamily{pcr}\selectfont 135.5}};
% Text Node
\draw (326.67,17.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{Variables (Attributes/Labels/Columns)}};
% Text Node
\draw (453.33,112.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{Observations (rows, tuples)}};
% Text Node
\draw (155.17,226.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.55,0.34,0.16}{Known as the Index (ID) column. Usually, there are no duplicates of these}\\\textcolor[rgb]{0.55,0.34,0.16}{allowed, and they're often ordered like this.}};


\end{tikzpicture}

}


\hfill \break Here's an example table. I've highlighted and color coded the important aspects of it. Remember, don't think of this as the data structure itself- this is just an abstraction to help us keep track of our vast amounts of data. However, most table implementations do a pretty good job of representing the stuff I've color coded.\\

\subsubsection{Selecting / Slicing}

Selecting one or more of the rows or columns in particular to analyze. Examples:

\begin{itemize}
	\item Select only columns ID + Age
	\item Select all rows with weight > 41
	\item We can also apply a combination of the above 2. (\textit{You can combine select rules!})
\end{itemize}

\subsubsection{Aggregating / Reducing}

Combining values across a column into a single value. (We don't do this across rows, because that obviously wouldn't make any sense. Think about it!). Examples:

\begin{itemize}
	\item Find the sum of all row's columns
	\item Find the max of the weight column
\end{itemize}

\textbf{Note:} It's usually never useful to aggregate/reduce the ID column, so for most cases, we ignore it when we perform such operations.

\subsubsection{Map}

Apply a function to every row, possibly creating fewer or more columns. This one's a little weird to think about without a clear example, so I'm including one below.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp4887463946097159] 
\draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ] (362.33,88.33) -- (428.67,88.33) -- (428.67,111.33) -- (362.33,111.33) -- cycle ;
%Shape: Rectangle [id:dp5770783881644757] 
\draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ] (429.21,88.33) -- (537,88.33) -- (537,111.33) -- (429.21,111.33) -- cycle ;
%Shape: Rectangle [id:dp6134551110117208] 
\draw   (537,88.33) -- (603.33,88.33) -- (603.33,111.33) -- (537,111.33) -- cycle ;
%Shape: Rectangle [id:dp5531077995700737] 
\draw   (603.67,88.33) -- (670,88.33) -- (670,111.33) -- (603.67,111.33) -- cycle ;
%Shape: Rectangle [id:dp3313757499092773] 
\draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ] (362.33,111) -- (428.67,111) -- (428.67,198.67) -- (362.33,198.67) -- cycle ;
%Shape: Rectangle [id:dp9339502110856537] 
\draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ] (428.67,111.26) -- (536.46,111.26) -- (536.46,198.67) -- (428.67,198.67) -- cycle ;
%Shape: Rectangle [id:dp07892533389980638] 
\draw   (537.33,111.26) -- (603.67,111.26) -- (603.67,198.67) -- (537.33,198.67) -- cycle ;
%Shape: Rectangle [id:dp979611278461765] 
\draw   (603.67,111.26) -- (670,111.26) -- (670,198.67) -- (603.67,198.67) -- cycle ;
%Shape: Rectangle [id:dp717440949324254] 
\draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ] (5.67,87.67) -- (72,87.67) -- (72,110.67) -- (5.67,110.67) -- cycle ;
%Shape: Rectangle [id:dp5659022269665408] 
\draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ] (72.54,87.67) -- (308.33,87.67) -- (308.33,110.67) -- (72.54,110.67) -- cycle ;
%Shape: Rectangle [id:dp9470998578275607] 
\draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ] (5.67,110.33) -- (72,110.33) -- (72,198) -- (5.67,198) -- cycle ;
%Shape: Rectangle [id:dp5421589353414771] 
\draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ] (72,110.6) -- (308.33,110.6) -- (308.33,198) -- (72,198) -- cycle ;
%Curve Lines [id:da1781160675260307] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (264.33,79.33) .. controls (294.18,20.96) and (362.31,12.75) .. (399.77,78.34) ;
\draw [shift={(400.33,79.33)}, rotate = 240.75] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Curve Lines [id:da028217104849822316] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (397.67,208.67) .. controls (363.84,267.04) and (314.17,266.01) .. (265.07,206.24) ;
\draw [shift={(264.33,205.33)}, rotate = 410.88] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (388.67,92.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {id};
% Text Node
\draw (390.17,122.06) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 1}\\{\fontfamily{pcr}\selectfont 2}\\{\fontfamily{pcr}\selectfont 3}};
% Text Node
\draw (446.67,91) node [anchor=north west][inner sep=0.75pt]   [align=left] {city};
% Text Node
\draw (550.67,91) node [anchor=north west][inner sep=0.75pt]   [align=left] {state};
% Text Node
\draw (628,91) node [anchor=north west][inner sep=0.75pt]   [align=left] {zip};
% Text Node
\draw (433.5,122.06) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont College Park}\\{\fontfamily{pcr}\selectfont D.C.}\\{\fontfamily{pcr}\selectfont Cupertino}};
% Text Node
\draw (555.5,122.06) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont MD}\\{\fontfamily{pcr}\selectfont D.C.}\\{\fontfamily{pcr}\selectfont CA}};
% Text Node
\draw (616.83,122.06) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 20742}\\{\fontfamily{pcr}\selectfont 2000}\\{\fontfamily{pcr}\selectfont 95014}};
% Text Node
\draw (32,91.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {id};
% Text Node
\draw (33.5,121.39) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 1}\\{\fontfamily{pcr}\selectfont 2}\\{\fontfamily{pcr}\selectfont 3}};
% Text Node
\draw (90,90.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {address};
% Text Node
\draw (76.83,121.39) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont College Park,}\\{\fontfamily{pcr}\selectfont D.C., }\\{\fontfamily{pcr}\selectfont Cupertino, }};
% Text Node
\draw (186.83,121.39) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont MD}\\{\fontfamily{pcr}\selectfont D.C.}\\{\fontfamily{pcr}\selectfont CA}};
% Text Node
\draw (225.5,121.39) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 20742}\\{\fontfamily{pcr}\selectfont 2000}\\{\fontfamily{pcr}\selectfont 95014}};
% Text Node
\draw (315.33,5.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{map}};
% Text Node
\draw (321.33,256.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{map}};


\end{tikzpicture}

}

\hfill \break Notice how applying map to either table is valid in this case- sometimes we want to break down columns into more specific values, and sometimes we want to combine them into singular columns. Each of these operations is totally valid, and has its uses. (This is evident in the projects for this class).\\

Again, this is mostly about what you need. There's no necessary better or worse in this case (more columns does not always equal better data).

\subsubsection{Group By}

Group By is an operation that allows you to group tuples together based on the values in columns/dimensions. Let's say we had the following table of house addresses like earlier. This time, we'll add the number of people in each house as a column as well.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp816510738764739] 
\draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ] (185,62.33) -- (251.33,62.33) -- (251.33,85.33) -- (185,85.33) -- cycle ;
%Shape: Rectangle [id:dp39744801489204895] 
\draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ] (251.87,62.33) -- (359.67,62.33) -- (359.67,85.33) -- (251.87,85.33) -- cycle ;
%Shape: Rectangle [id:dp16790194728351993] 
\draw   (359.67,62.33) -- (426,62.33) -- (426,85.33) -- (359.67,85.33) -- cycle ;
%Shape: Rectangle [id:dp5121628997613088] 
\draw   (426.33,62.33) -- (492.67,62.33) -- (492.67,85.33) -- (426.33,85.33) -- cycle ;
%Shape: Rectangle [id:dp38170728474734883] 
\draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ] (185,85) -- (251.33,85) -- (251.33,194.67) -- (185,194.67) -- cycle ;
%Shape: Rectangle [id:dp0783518509199912] 
\draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ] (251.33,85.33) -- (359.12,85.33) -- (359.12,194.67) -- (251.33,194.67) -- cycle ;
%Shape: Rectangle [id:dp8309347187193188] 
\draw   (360,85.33) -- (426.33,85.33) -- (426.33,194.67) -- (360,194.67) -- cycle ;
%Shape: Rectangle [id:dp0878616379496251] 
\draw   (426.33,85.33) -- (492.67,85.33) -- (492.67,194.67) -- (426.33,194.67) -- cycle ;
%Shape: Rectangle [id:dp9451653351898317] 
\draw   (493,62.33) -- (559.33,62.33) -- (559.33,85.33) -- (493,85.33) -- cycle ;
%Shape: Rectangle [id:dp11005059918565219] 
\draw   (493,85.33) -- (559.33,85.33) -- (559.33,194.67) -- (493,194.67) -- cycle ;

% Text Node
\draw (211.33,66.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {id};
% Text Node
\draw (212.83,96.06) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 1}\\{\fontfamily{pcr}\selectfont 2}\\{\fontfamily{pcr}\selectfont 3}\\{\fontfamily{pcr}\selectfont 4}\\};
% Text Node
\draw (269.33,65) node [anchor=north west][inner sep=0.75pt]   [align=left] {city};
% Text Node
\draw (373.33,65) node [anchor=north west][inner sep=0.75pt]   [align=left] {state};
% Text Node
\draw (450.67,65) node [anchor=north west][inner sep=0.75pt]   [align=left] {zip};
% Text Node
\draw (256.17,96.06) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont College Park}\\{\fontfamily{pcr}\selectfont Washington}\\{\fontfamily{pcr}\selectfont Cupertino}\\{\fontfamily{pcr}\selectfont College Park}};
% Text Node
\draw (378.17,96.06) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont MD}\\{\fontfamily{pcr}\selectfont D.C.}\\{\fontfamily{pcr}\selectfont CA}\\{\fontfamily{pcr}\selectfont MD}};
% Text Node
\draw (440.17,95.39) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 20742}\\{\fontfamily{pcr}\selectfont 2000}\\{\fontfamily{pcr}\selectfont 95014}\\{\fontfamily{pcr}\selectfont 20742}};
% Text Node
\draw (505.33,64.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {people};
% Text Node
\draw (520.83,94.06) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 3}\\{\fontfamily{pcr}\selectfont 4}\\{\fontfamily{pcr}\selectfont 3}\\{\fontfamily{pcr}\selectfont 2}};


\end{tikzpicture}

}

Let's say we only wanted to see the data from a \textbf{single city}. In this case, let's pick College Park.\newline

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp816510738764739] 
\draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ] (185,62.33) -- (251.33,62.33) -- (251.33,85.33) -- (185,85.33) -- cycle ;
%Shape: Rectangle [id:dp39744801489204895] 
\draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ] (251.87,62.33) -- (359.67,62.33) -- (359.67,85.33) -- (251.87,85.33) -- cycle ;
%Shape: Rectangle [id:dp16790194728351993] 
\draw   (359.67,62.33) -- (426,62.33) -- (426,85.33) -- (359.67,85.33) -- cycle ;
%Shape: Rectangle [id:dp5121628997613088] 
\draw   (426.33,62.33) -- (492.67,62.33) -- (492.67,85.33) -- (426.33,85.33) -- cycle ;
%Shape: Rectangle [id:dp38170728474734883] 
\draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ] (185,85) -- (251.33,85) -- (251.33,194.67) -- (185,194.67) -- cycle ;
%Shape: Rectangle [id:dp0783518509199912] 
\draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ] (251.33,85.33) -- (359.12,85.33) -- (359.12,194.67) -- (251.33,194.67) -- cycle ;
%Shape: Rectangle [id:dp8309347187193188] 
\draw   (360,85.33) -- (426.33,85.33) -- (426.33,194.67) -- (360,194.67) -- cycle ;
%Shape: Rectangle [id:dp0878616379496251] 
\draw   (426.33,85.33) -- (492.67,85.33) -- (492.67,194.67) -- (426.33,194.67) -- cycle ;
%Shape: Rectangle [id:dp9451653351898317] 
\draw   (493,62.33) -- (559.33,62.33) -- (559.33,85.33) -- (493,85.33) -- cycle ;
%Shape: Rectangle [id:dp11005059918565219] 
\draw   (493,85.33) -- (559.33,85.33) -- (559.33,194.67) -- (493,194.67) -- cycle ;
%Shape: Free Drawing [id:dp983490163154191] 
\draw  [color={rgb, 255:red, 255; green, 0; blue, 0 }  ,draw opacity=1 ][line width=3] [line join = round][line cap = round] (209,90.5) .. controls (198.12,90.5) and (168.09,89.91) .. (172,107.5) .. controls (173.05,112.21) and (201.95,116.35) .. (205,116.5) .. controls (211.67,116.83) and (218.33,117.44) .. (225,117.5) .. controls (265.32,117.85) and (345.68,117.33) .. (394,116.5) .. controls (416.57,116.11) and (439.52,112.83) .. (462,112.5) .. controls (480.28,112.24) and (514.78,111.08) .. (539,113.5) .. controls (546.8,114.28) and (556.96,114.91) .. (564,113.5) .. controls (566.72,112.96) and (577.71,112.02) .. (578,108.5) .. controls (581.35,68.27) and (524.35,93.19) .. (494,91.5) .. controls (427.64,87.81) and (413.09,87.87) .. (312,88.5) .. controls (284.83,88.67) and (256.23,89.85) .. (229,90.5) .. controls (222.28,90.66) and (216.95,92.5) .. (210,92.5) ;
%Curve Lines [id:da10861026576502275] 
\draw [color={rgb, 255:red, 255; green, 0; blue, 0 }  ,draw opacity=1 ][line width=3] [line join = round][line cap = round]   (218,139.5) .. controls (201.04,139.5) and (193.69,138.27) .. (181,142.5) .. controls (176.53,143.99) and (169.8,153.48) .. (171,159.5) .. controls (171.2,160.48) and (185.36,164.26) .. (189,164.5) .. controls (198.32,165.12) and (207.66,165.5) .. (217,165.5) .. controls (221.09,165.5) and (226.66,163.55) .. (231,163.5) .. controls (282.82,162.87) and (384.95,164.21) .. (430,164.5) .. controls (443.35,164.59) and (456.65,166.35) .. (470,166.5) .. controls (497.66,166.82) and (525.34,166.82) .. (553,166.5) .. controls (558.97,166.43) and (577.17,164.8) .. (579,157.5) .. controls (583.4,139.91) and (542.58,139.95) .. (534,137.5) .. controls (486.11,123.82) and (438.47,140.4) .. (391,141.5) .. controls (345.12,142.57) and (299.53,137.8) .. (254,136.5) .. controls (243.67,136.2) and (233.29,136.47) .. (223,137.5) .. controls (222.9,137.51) and (209,136.14) .. (209,139.5) ;

% Text Node
\draw (211.33,66.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {id};
% Text Node
\draw (212.83,96.06) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 1}\\{\fontfamily{pcr}\selectfont 2}\\{\fontfamily{pcr}\selectfont 3}\\{\fontfamily{pcr}\selectfont 4}\\};
% Text Node
\draw (269.33,65) node [anchor=north west][inner sep=0.75pt]   [align=left] {city};
% Text Node
\draw (373.33,65) node [anchor=north west][inner sep=0.75pt]   [align=left] {state};
% Text Node
\draw (450.67,65) node [anchor=north west][inner sep=0.75pt]   [align=left] {zip};
% Text Node
\draw (256.17,96.06) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont College Park}\\{\fontfamily{pcr}\selectfont Washington}\\{\fontfamily{pcr}\selectfont Cupertino}\\{\fontfamily{pcr}\selectfont College Park}};
% Text Node
\draw (378.17,96.06) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont MD}\\{\fontfamily{pcr}\selectfont D.C.}\\{\fontfamily{pcr}\selectfont CA}\\{\fontfamily{pcr}\selectfont MD}};
% Text Node
\draw (440.17,95.39) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 20742}\\{\fontfamily{pcr}\selectfont 2000}\\{\fontfamily{pcr}\selectfont 95014}\\{\fontfamily{pcr}\selectfont 20742}};
% Text Node
\draw (505.33,64.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {people};
% Text Node
\draw (520.83,94.06) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 3}\\{\fontfamily{pcr}\selectfont 4}\\{\fontfamily{pcr}\selectfont 3}\\{\fontfamily{pcr}\selectfont 2}};


\end{tikzpicture}

}

This is what a 'Group By' operation would be perfect for. It'll basically just get us the rows that are from the city that we want.

\subsubsection{Group By + Aggregate}

We can combined Group By and Aggregate in pretty cool ways to get results that we want. For example, let's say we wanted to leverage the above table and get the sum total of all people who live in College Park, D.C., and Cupertino, respectively. By using a combo of Group By and Aggregate, we can totally do that. (\textit{Group By} City, then perform summing \textit{aggregation} operation.)

\subsubsection{Union, Intersection, Difference}

These are your usual set operations from statistics. However, this only works if the tables have identical attributes (columns). If they have identical columns, they are called \textbf{compatible tables}.\\

Examples: (Table A) $\union$ (Table B) results in (Table C) where all three tables have the same attributes. Likewise, (Table A) $\cap$ (Table B) results in (Table D) where all three also have the same attributes.

\subsubsection{Merge or Join}

This is how you combine rows across tables, based on some distinguishing element (i.e. ID column). For example, you'd basically take the row with ID 1 in your first table and add all those elements to the row with ID 1 in your second table.\\

There isn't a graphical example here just because we'll be talking about this a lot more in depth in later lectures. For now, just remember it as a way to combine tables.

\subsubsection{Summary}

Overall, \textbf{Tables} are a simple and common abstraction. They're how we mainly keep track of data when we do most of these data science things, so it's worth learning how to employ them, and what basic operations we can use when manipulating them.\\

These \textbf{operations}, at a glance:

\begin{itemize}
	\item Select
	\item Map
	\item Aggregate/Reduce
	\item Join/Merge
	\item Union/Concat
	\item Group By
\end{itemize} 

Keep in mind that tables are an \textit{abstraction}, after all, so these operations may be named different things in the languages you use to manipulate them. That's why its important to not just memorize the names of these operations, but what they actually do. This will prepare you for work with any data table manipulation program.

\subsection{Pandas}

\texttt{PANDAS} is a data manipulation library for Python that's highly optimized for performance. It contains two key constructs, \textbf{\texttt{Dataframes}} and \textbf{\texttt{Series}}.

\subsubsection{Dataframes}

This is \texttt{PANDAS}'s way of representing the table abstraction we were looking at. Geeksforgeeks even calls this one a tabular data structure.\\

There are a lot of \texttt{PANDAS}-specific commands that you'll have to learn to be proficient with these, but Geeksforgeeks is a great reference for them. (You'll find it's pretty easy to conduct all the basic table operations)\\

{
\centering

\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp2777454383178828] 
\draw   (109.67,31.83) -- (296.33,31.83) -- (296.33,60.17) -- (109.67,60.17) -- cycle ;
%Shape: Rectangle [id:dp24695275768955183] 
\draw   (109.67,71.67) -- (143.67,71.67) -- (143.67,243.33) -- (109.67,243.33) -- cycle ;
%Shape: Rectangle [id:dp32304758523801747] 
\draw   (159.67,71.67) -- (193.67,71.67) -- (193.67,243.33) -- (159.67,243.33) -- cycle ;
%Shape: Rectangle [id:dp3982940173231615] 
\draw   (210,71.67) -- (246,71.67) -- (246,243.33) -- (210,243.33) -- cycle ;
%Shape: Rectangle [id:dp37628168334169354] 
\draw   (259,71.67) -- (295.67,71.67) -- (295.67,243.33) -- (259,243.33) -- cycle ;
%Straight Lines [id:da7379961166553421] 
\draw    (127,54.33) -- (127,76.33) ;
\draw [shift={(127,78.33)}, rotate = 270] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da8157307687769006] 
\draw    (177,53.33) -- (177,75.33) ;
\draw [shift={(177,77.33)}, rotate = 270] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da85290367528958] 
\draw    (227.67,53.33) -- (227.67,75.33) ;
\draw [shift={(227.67,77.33)}, rotate = 270] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da06750596708048917] 
\draw    (275.67,53.33) -- (275.67,75.33) ;
\draw [shift={(275.67,77.33)}, rotate = 270] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Rectangle [id:dp30009607301124297] 
\draw  [color={rgb, 255:red, 255; green, 0; blue, 0 }  ,draw opacity=1 ] (55,71.67) -- (89,71.67) -- (89,243.33) -- (55,243.33) -- cycle ;
%Curve Lines [id:da3683895080925861] 
\draw [color={rgb, 255:red, 255; green, 0; blue, 0 }  ,draw opacity=1 ]   (389.67,254) .. controls (321.69,302.84) and (121.72,266.74) .. (73.09,244.02) ;
\draw [shift={(71.67,243.33)}, rotate = 386.23] [color={rgb, 255:red, 255; green, 0; blue, 0 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da5179190869487518] 
\draw    (404.33,71.67) -- (307.6,46.5) ;
\draw [shift={(305.67,46)}, rotate = 374.58000000000004] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (118,37.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {foo};
% Text Node
\draw (162.67,37.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {bar};
% Text Node
\draw (213,37.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {baz};
% Text Node
\draw (262,37.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {qux};
% Text Node
\draw (66,75.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {A\\B\\C\\D\\E\\.\\.\\.};
% Text Node
\draw (119.33,75.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 0}\\{\fontfamily{pcr}\selectfont 4}\\{\fontfamily{pcr}\selectfont 8}\\{\fontfamily{pcr}\selectfont -1}\\{\fontfamily{pcr}\selectfont 16}\\{\fontfamily{pcr}\selectfont .}\\{\fontfamily{pcr}\selectfont .}\\{\fontfamily{pcr}\selectfont .}};
% Text Node
\draw (171.33,75.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {x\\y\\z\\a\\b\\.\\.\\.};
% Text Node
\draw (213,75.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {2.7\\6.9\\4.2\\5.4\\6.1\\.\\.\\.};
% Text Node
\draw (270,75.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {T\\F\\T\\F\\F\\.\\.\\.};
% Text Node
\draw (398,231.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {Non-unique index values are allowed,\\ but they may cause issues.};
% Text Node
\draw (456.67,206.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textbf{\textcolor[rgb]{0.82,0.01,0.11}{Index (ID)} Column}};
% Text Node
\draw (409.33,63.17) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textbf{Columns} - can be extracted as series};


\end{tikzpicture}

}


\subsubsection{Series}

Think of a \texttt{series} as a subclass of Numpy's \texttt{ndarray}.\\

Geeksforgeeks calls this a 'one dimensional labeled array capable of holding any data type'. Think of it as a column in an Excel spreadsheet. In fact, their most common use is when you pull a single column out of a \texttt{dataframe} and want to analyze it individually.\\

The object itself supports integer and label based indexing (like letters), and allows us to perform a bunch of operations involving the index.\\

To create one of these we can grab a column from a \texttt{dataframe}, or we can create one out of a regular Python \texttt{list} or a Numpy  \texttt{ndarray}.

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp043234274585036925] 
\draw   (352.67,87.33) -- (393,87.33) -- (393,180) -- (352.67,180) -- cycle ;
%Shape: Rectangle [id:dp5031953197422854] 
\draw   (288,86.67) -- (328.33,86.67) -- (328.33,181.33) -- (288,181.33) -- cycle ;
%Straight Lines [id:da8511139241388785] 
\draw    (323,106) -- (356.33,105.37) ;
\draw [shift={(358.33,105.33)}, rotate = 538.9200000000001] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da28107306227000595] 
\draw    (322.33,118.67) -- (355.67,118.04) ;
\draw [shift={(357.67,118)}, rotate = 538.9200000000001] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da8988638709435195] 
\draw    (321,132.67) -- (354.33,132.04) ;
\draw [shift={(356.33,132)}, rotate = 538.9200000000001] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da21706629371432573] 
\draw    (321,148) -- (354.33,147.37) ;
\draw [shift={(356.33,147.33)}, rotate = 538.9200000000001] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da8917362815148493] 
\draw    (323,164.67) -- (356.33,164.04) ;
\draw [shift={(358.33,164)}, rotate = 538.9200000000001] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (361.33,95) node [anchor=north west][inner sep=0.75pt]   [align=left] {5\\6\\12\\\mbox{-}5\\6.9};
% Text Node
\draw (301.33,96.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {A\\B\\C\\D\\E};
% Text Node
\draw (300.67,63.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {ID};
% Text Node
\draw (353.33,65) node [anchor=north west][inner sep=0.75pt]   [align=left] {Value};
\end{tikzpicture}


}

\hfill \break For this sort of stuff, you can probably look at \textbf{GeeksforGeeks} for more information. They have excellent documentation on \texttt{PANDAS} functionality.

\subsubsection{Creating a Dataframe}

To \textbf{create a \texttt{dataframe}}, you have a variety of options.

\begin{itemize}
	\item Get data directly from a Python dictionary, a bunch of series, or other data structures
	\item \texttt{Pandas.read\_csv()} $\rightarrow$ take in data from a \texttt{.csv} file
	\item \texttt{Pandas.read\_excel()} $\rightarrow$ take in data from an excel spreadsheet
	\item \texttt{Pandas.read\_html()} $\rightarrow$ take in data from a (static) website (e.g. a website with a big table of data on it)
	\item From a database by using SQL to make queries
	\item From clipboard, URL, many more options.
\end{itemize}

\subsection{Tidy Data}

There are \textbf{3} components of \textbf{tidy data}: \textbf{Labels},  \textbf{Variables} (values), and \textbf{Observations}.\\

{

\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp30513253458375056] 
\draw   (164,83.33) -- (337.67,83.33) -- (337.67,191.33) -- (164,191.33) -- cycle ;
%Straight Lines [id:da42703415044249615] 
\draw    (164,116) -- (337.67,116) ;
%Straight Lines [id:da4850478853225664] 
\draw    (211.67,83.33) -- (211.67,190.67) ;
%Straight Lines [id:da18466499361867394] 
\draw [color={rgb, 255:red, 244; green, 0; blue, 0 }  ,draw opacity=1 ]   (250.33,57) -- (193.56,76.68) ;
\draw [shift={(191.67,77.33)}, rotate = 340.88] [color={rgb, 255:red, 244; green, 0; blue, 0 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da9858889971902263] 
\draw [color={rgb, 255:red, 244; green, 0; blue, 0 }  ,draw opacity=1 ]   (250.33,57) -- (250.33,75.33) ;
\draw [shift={(250.33,77.33)}, rotate = 270] [color={rgb, 255:red, 244; green, 0; blue, 0 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da5731285595473614] 
\draw [color={rgb, 255:red, 244; green, 0; blue, 0 }  ,draw opacity=1 ]   (250.33,57) -- (303.8,77.29) ;
\draw [shift={(305.67,78)}, rotate = 200.78] [color={rgb, 255:red, 244; green, 0; blue, 0 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da9175469839720249] 
\draw [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ]   (409,92.67) -- (354.99,96.52) ;
\draw [shift={(353,96.67)}, rotate = 355.90999999999997] [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da6295025530716172] 
\draw [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ]   (447.67,126.67) -- (342.33,130.59) ;
\draw [shift={(340.33,130.67)}, rotate = 357.87] [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da28024077694656935] 
\draw [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ]   (447.67,126.67) -- (342.29,148.27) ;
\draw [shift={(340.33,148.67)}, rotate = 348.41999999999996] [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da7161677482922817] 
\draw [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ]   (447.67,126.67) -- (343.53,167.93) ;
\draw [shift={(341.67,168.67)}, rotate = 338.39] [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da23343105546492038] 
\draw    (279,84) -- (279,191.33) ;

% Text Node
\draw (174.33,93) node [anchor=north west][inner sep=0.75pt]   [align=left] {Age};
% Text Node
\draw (221.33,93) node [anchor=north west][inner sep=0.75pt]   [align=left] {Weight};
% Text Node
\draw (282,93) node [anchor=north west][inner sep=0.75pt]   [align=left] {Height};
% Text Node
\draw (172.33,120) node [anchor=north west][inner sep=0.75pt]   [align=left] {12.2\\11.0\\15.6\\35.1};
% Text Node
\draw (231,120) node [anchor=north west][inner sep=0.75pt]   [align=left] {42.3\\40.8\\65.3\\84.2};
% Text Node
\draw (289,120) node [anchor=north west][inner sep=0.75pt]   [align=left] {145.1\\143.8\\165.3\\185.5};
% Text Node
\draw (220.67,36) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{Variables}};
% Text Node
\draw (412,82.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{Labels (name of the whole row)}};
% Text Node
\draw (450,117.33) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,opacity=1 ] [align=left] {Obvservations};


\end{tikzpicture}

}

\hfill \break Here's some elaboration.

\begin{itemize}
	\item \textbf{Variables} $\rightarrow$ A measure or attribute, e.g. age, weight, height, sex.
	\item \textbf{Value} $\rightarrow$ Measurement of a \textit{singular} attribute, e.g. \texttt{12.2 lbs} or \texttt{5.9 inches}.
	\item \textbf{Observation} $\rightarrow$ All measurements for an object; a \textit{row} in the table. E.g. a single observation in the above table would be \texttt{[12.2, 42.3, 145.1]}.
\end{itemize}

Tidying and melting data basically just means that you mix data around until it's nice and usable. Usually, you are tidying in pursuit of a specific use-case, so less columns or more columns are never the 'better' option. This is one of those things that takes practice and application.\\

\textbf{TL;DR} Clean up and organize your data before you mess with it!

\subsection{SQL and Relational Databases}

\textbf{Big Question:} What is a relation?\\
\textbf{Answer:} In a databases context, they usually mean, "a tabular set of data either permanently stored in the database (a table) or derived from tables according to a mathematical description (a view or a query result)." (\textit{Larry Lustig, Stackoverflow})\\

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Relation} $\rightarrow$ A relation is a data structure which consists of a heading and an unordered set of tuples which share the same type.\\
\textbf{Relation Schema} $\rightarrow$ A list of all attribute names and their domains. E.g. 'The Schema for a Table'.
\end{tcolorbox}

\subsubsection{Indexing}

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Index} $\rightarrow$ An auxiliary data structure of a relation database designed to speed up the retrieval of rows.
\end{tcolorbox}


\hfill \break How can we leverage \textbf{indexes} to improve search times for our relational databases? Take a look at the example below. Let's say we wanted to find all people from Canada (with a nat\_id of 2).\\

{
\centering

\includegraphics[scale=0.35]{img/table_1.png} 

}

\hfill \break Unfortunately, the time it takes for us to build this list every time we want to leverage the result of this search is \texttt{\textbf{O(n)}}. This is not so great for us. However, if we decide to build an \textbf{index} on the 'nat\_id' attribute, things change. \\

{
\centering

\includegraphics[scale=0.35]{img/table_2.png} 

}


\hfill \break Now, after establishing this index, which acts like a hidden sorted map of references to a specific attribute in a table, we are allowed \texttt{\textbf{O(log n)}} lookup with the parameter nat\_id.\\

You can choose to build an index on a certain attribute to improve search times for it, but they aren't free. They're expensive- establish an index with caution. Not only do they take time to initially build, but now, whenever you add to the table or update it, you also need to update the index. In that sense, establishing too many indexes could lead to other operations, such as changing table values, taking a very long time. It's a delicate balance.\\


\begin{tcolorbox}[title=Aside: Indexes,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]
Indexes are actually implemented with data structures like \textbf{B-trees}, which is why we are able to perform data access in \texttt{O(log n)} time. The worst case height of a B-tree is \texttt{O(log n)}, and since a search is dependent on height, B-tree lookups run in \texttt{O(log n)} on average.
\end{tcolorbox}


\subsection{Relationships}

Primary keys and foreign keys determine interactions between different tables. These are formally known as relationships. First, let's establish definitions for primary and foreign keys.\\

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Primary Keys} $\rightarrow$ Columns whose data can be used to uniquely identify each row in the table. Highlighted in red below. (E.g. the ID column)\\
\textbf{Foreign Keys} $\rightarrow$ Columns that refer to the primary key in another table. Highlighted in blue below.
\end{tcolorbox}

\hfill \break 


{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Image [id:dp37561613857384235] 
\draw (346.83,176.27) node  {\includegraphics[width=372.75pt,height=150.4pt]{img/table_2.png}};
%Straight Lines [id:da26185218947017463] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (107,45) -- (159.26,74.52) ;
\draw [shift={(161,75.5)}, rotate = 209.46] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da2879956461468336] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (107,45) -- (475.67,77.82) ;
\draw [shift={(477.67,78)}, rotate = 185.09] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da19357327119760281] 
\draw [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ]   (413.67,32.67) -- (406.66,74.69) ;
\draw [shift={(406.33,76.67)}, rotate = 279.46] [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Ellipse [id:dp8641831687478103] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ][line width=3.75]  (375,186) .. controls (375,139.61) and (386.42,102) .. (400.5,102) .. controls (414.58,102) and (426,139.61) .. (426,186) .. controls (426,232.39) and (414.58,270) .. (400.5,270) .. controls (386.42,270) and (375,232.39) .. (375,186) -- cycle ;
%Shape: Ellipse [id:dp28836148924857286] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=3.75]  (456.33,154.67) .. controls (456.33,125.21) and (464.69,101.33) .. (475,101.33) .. controls (485.31,101.33) and (493.67,125.21) .. (493.67,154.67) .. controls (493.67,184.12) and (485.31,208) .. (475,208) .. controls (464.69,208) and (456.33,184.12) .. (456.33,154.67) -- cycle ;
%Shape: Ellipse [id:dp4025421876242352] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=3.75]  (142.33,186.67) .. controls (142.33,140.27) and (153.75,102.67) .. (167.83,102.67) .. controls (181.92,102.67) and (193.33,140.27) .. (193.33,186.67) .. controls (193.33,233.06) and (181.92,270.67) .. (167.83,270.67) .. controls (153.75,270.67) and (142.33,233.06) .. (142.33,186.67) -- cycle ;

% Text Node
\draw (58,20) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{Primary Keys}};
% Text Node
\draw (368,8.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{Foreign Keys}};


\end{tikzpicture}


}

There are four main types of relationships between keys. Here they are with some examples.

\begin{itemize}
	\item \textbf{One-to-many / Many-to-one}: A person can have one nationality, but a nationality can have many people associated with it.
	\item \textbf{One-to-one}: People each have one unique SSN- no conflicts.
	\item \textbf{One-to-one-or-none}: People can either have 1 car, or no car.
	\item \textbf{Many-to-many}: Cats and colors. Red can be on many cats, and many colors can be on a single cat.
\end{itemize}

Even though this system can sort of be replicated in \texttt{PANDAS}, \texttt{PANDAS} is not strictly a relational data system. Notably, it doesn't have notions of primary or foreign keys built in.

\begin{myproof}
Do heavy, rough lifting at the \textbf{relational database} level, (e.g. when you're deciding what sort of \texttt{SQL} queries to make) and then do the fine-grained slicing and dicing and visualization with \texttt{\textbf{PANDAS}}.
\end{myproof}

\subsection{SQL and SQLite}

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{SQL} $\rightarrow$ Stands for 'Structured Query Language', and is the ANSI-standardized way for us to communicate with relational databases. Standard SQL commands like "Select", "Insert", "Update", "Delete", "Create", and "Drop" can be used to accomplish almost everything that one needs to do with a database.
\end{tcolorbox}

\hfill \break We use \textbf{SQLite}, an on-disk relational database management system, in order to interact with our databases. Most \textbf{RDMS}s connect to a server, which provides support for better concurrency, that sort of stuff takes longer to setup.\\

On the other hand, \texttt{SQLite} is pretty simple to install via \texttt{conda}, so we're going to go ahead and use it.\\

\texttt{SQLite} provides us with two main ways to communicate with the \texttt{SQL} database that it maintains. First of all, it gives us a cool GUI-based environment where we can deal with manipulating data manually, but it also allows us to write \texttt{SQL} statements to interact with it, whether that be from the command line or from within Python.\\

Here's an example of how a relational database fits into our workflow.\\\\

{

\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Folded Corner [id:dp17966687822939686] 
\draw  [fill={rgb, 255:red, 245; green, 166; blue, 35 }  ,fill opacity=1 ] (138.6,85.33) -- (62,85.33) -- (62,17.33) -- (159,17.33) -- (159,64.93) -- cycle -- (138.6,85.33) ; \draw   (159,64.93) -- (142.68,69.01) -- (138.6,85.33) ;
%Straight Lines [id:da19188911599755476] 
\draw    (166.33,45.33) -- (261.15,84.57) ;
\draw [shift={(263,85.33)}, rotate = 202.48] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Flowchart: Document [id:dp5212307516849849] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ][fill={rgb, 255:red, 242; green, 220; blue, 105 }  ,fill opacity=1 ][line width=2.25]  (270.67,59.33) -- (381.5,59.33) -- (381.5,111.58) .. controls (312.23,111.58) and (326.08,130.42) .. (270.67,118.23) -- cycle ;
%Straight Lines [id:da2960662304041648] 
\draw    (390.67,99.67) -- (442.33,99.99) ;
\draw [shift={(444.33,100)}, rotate = 180.36] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da4888130737896952] 
\draw    (442.33,86.67) -- (390.33,86.02) ;
\draw [shift={(388.33,86)}, rotate = 360.71000000000004] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Can [id:dp9185662032628965] 
\draw  [fill={rgb, 255:red, 194; green, 219; blue, 249 }  ,fill opacity=1 ] (604.33,61.5) -- (604.33,110.5) .. controls (604.33,116.3) and (571.43,121) .. (530.83,121) .. controls (490.24,121) and (457.33,116.3) .. (457.33,110.5) -- (457.33,61.5) .. controls (457.33,55.7) and (490.24,51) .. (530.83,51) .. controls (571.43,51) and (604.33,55.7) .. (604.33,61.5) .. controls (604.33,67.3) and (571.43,72) .. (530.83,72) .. controls (490.24,72) and (457.33,67.3) .. (457.33,61.5) ;
%Straight Lines [id:da4630895722743661] 
\draw    (530.83,137.33) -- (530.83,188) ;
\draw [shift={(530.83,190)}, rotate = 270] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da7457072915555859] 
\draw    (539.67,186.67) -- (539.67,137.33) ;
\draw [shift={(539.67,135.33)}, rotate = 450] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Flowchart: Prodefined Process [id:dp6652571645774823] 
\draw  [fill={rgb, 255:red, 239; green, 255; blue, 221 }  ,fill opacity=1 ] (132.33,191) -- (319.67,191) -- (319.67,286) -- (132.33,286) -- cycle ; \draw   (155.75,191) -- (155.75,286) ; \draw   (296.25,191) -- (296.25,286) ;
%Straight Lines [id:da7038486863895331] 
\draw    (287.67,129.33) -- (240.98,183.16) ;
\draw [shift={(239.67,184.67)}, rotate = 310.94] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Rounded Rect [id:dp7535419443712124] 
\draw  [fill={rgb, 255:red, 155; green, 155; blue, 155 }  ,fill opacity=0.6 ] (485.5,206.27) .. controls (485.5,198.02) and (492.19,191.33) .. (500.43,191.33) -- (576.73,191.33) .. controls (584.98,191.33) and (591.67,198.02) .. (591.67,206.27) -- (591.67,251.07) .. controls (591.67,259.31) and (584.98,266) .. (576.73,266) -- (500.43,266) .. controls (492.19,266) and (485.5,259.31) .. (485.5,251.07) -- cycle ;
%Straight Lines [id:da07613681473682243] 
\draw    (251,184) -- (297.03,130.19) ;
\draw [shift={(298.33,128.67)}, rotate = 490.54] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (95.33,31.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Raw\\Input};
% Text Node
\draw (300.67,77) node [anchor=north west][inner sep=0.75pt]   [align=left] {Python};
% Text Node
\draw (494,84) node [anchor=north west][inner sep=0.75pt]   [align=left] {SQLite File};
% Text Node
\draw (400.67,53.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {SQL};
% Text Node
\draw (549.33,152.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {SQL};
% Text Node
\draw (505.17,207.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {SQLite\\CLI + GUI};
% Text Node
\draw (166,196) node [anchor=north west][inner sep=0.75pt]   [align=left] {Structured Output};
% Text Node
\draw (162.67,219) node [anchor=north west][inner sep=0.75pt]   [align=left] {(trained classifiers,\\JSON for D3,\\visualizations)};


\end{tikzpicture}


}

To work with \texttt{SQLite} in Python3, simply install and import the \texttt{SQLite3} package, and use that.

\subsection{Joining Data}

A \textbf{join} operations merges two or more tables into a single relation, based on their columns. There are four total types of join operations.\\ 

\begin{myproof}
Formally, the way we format join statements is the following:\\\\
\texttt{<type of join> join (<left table>, <right table>) on (<left table column>, <right table column>)}\\

Example: \texttt{Right join (cats, visits) on (id, cat\_id)}
\end{myproof}

\subsubsection{Types of Joins}

\begin{itemize}
	\item \textbf{Inner Join} $\rightarrow$ Returns merged rows that share the \textbf{same} value in the column that they are being joined on. Let's say we had the following two tables and wanted to join them.\\
	
	\includegraphics[scale=0.35]{img/table_3.png}\\
	
	In order to perform an inner join on these two tables, we need to pick a column to 'join them on'. Here, let's inner join these two tables on \texttt{id} and \texttt{cat\_id}. The result would look like this:\\
	
	\includegraphics[scale=0.35]{img/table_4.png}\\
	
	Inner joins are the most common type of joins; they get us the results that are shared by both tables.
	
	\item \textbf{Left Join} $\rightarrow$ A left join gets us \textbf{all} the results from the \textbf{left} table, but only \textbf{some} (the corresponding matching results) from the \textbf{right table}. So, what happens if we Left Joined \texttt{cats} and \texttt{visits} on \texttt{(id, cat\_id)}?\\
	
	\includegraphics[scale=0.35]{img/table_5.png}\\
	
	You'll notice that the fields that we couldn't fill out get populated with \textbf{\texttt{NULL}}.
	
	\item \textbf{Right Join} $\rightarrow$ A right join gets us \textbf{all} the results from the \textbf{right} table, but only \textbf{some} (the corresponding matching results) from the \textbf{left} table. Here's an example, with updated \texttt{cats} and \texttt{visits} tables.\\
	
	\includegraphics[scale=0.35]{img/table_6.png}\\
	
	If we were to perform a right join on these two tables, here's what would happen. It's basically just a flipped version of the left join.\\
	
	\includegraphics[scale=0.35]{img/table_7.png}\\
	
	Again, this time, notice how the row entries missing from the left table are now set to \texttt{\textbf{NULL}}
	
	\item \textbf{Full Outer Join} $\rightarrow$ The full outer join combines the left and right join. It's analogous to a \textbf{union} operation. Here's an example of a full outer join of \texttt{cats} and \texttt{visits} on \texttt{id} and \texttt{cat\_id}.\\
	
	\includegraphics[scale=0.35]{img/table_8.png}\\
	
\end{itemize}

\subsubsection{Syntax in PANDAS}

Here's how to write some basic join syntax in \texttt{PANDAS}. This should be easy once you learn how to phrase join statements- you're basically just translating it into code.\\

First, this is how you'd read from \texttt{SQLite} (or any other database you're hooked up to) using \texttt{Pandas} and generate the appropriate dataframes to work with.\\

{\centering
\begin{lstlisting}[language=python]
# establish dataframes from SQL
df_cats = pd.read_sql_query("SELECT * from cats", conn)
df_visits = pd.read_sql_query("SELECT * from visits", conn)
\end{lstlisting}
}

\hfill \break Now, here's how to do the joins.

{\centering
\begin{lstlisting}[language=python]
# inner join
df_cats.merge(df_visits, how = "inner",
              left_on = "id", right_on = "cat_id")
              
# left join
df_cats.merge(df_visits, how = "left",
              left_on = "id", right_on = "cat_id")
              
# right join
df_cats.merge(df_visits, how = "right",
              left_on = "id", right_on = "cat_id")
              
# full outer join
df_cats.merge(df_visits, how = "outer",
              left_on = "id", right_on = "cat_id")
\end{lstlisting}
}

There are also ways you can perform most of these joins via SQL (save for the right join), but I would prefer doing them from Python. As such, I won't include the \texttt{SQL} syntax here.

\hfill \break
\begin{tcolorbox}[title=Aside: Raw \texttt{SQL} with Pandas,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]
If you want to use raw \texttt{SQL} queries to interact with \texttt{PANDAS} dataframes, you are free to do so when you use the \texttt{PandaSQL} library.
\end{tcolorbox}

\subsubsection{Visual Example}

Here's a neat way to visualize joins using Venn Diagrams.\\\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Circle [id:dp6533662955757997] 
\draw   (116,75.43) .. controls (116,47.78) and (138.42,25.37) .. (166.07,25.37) .. controls (193.72,25.37) and (216.13,47.78) .. (216.13,75.43) .. controls (216.13,103.08) and (193.72,125.5) .. (166.07,125.5) .. controls (138.42,125.5) and (116,103.08) .. (116,75.43) -- cycle ;
%Shape: Circle [id:dp3081615610888653] 
\draw   (166.28,75.43) .. controls (166.28,47.78) and (188.69,25.37) .. (216.35,25.37) .. controls (244,25.37) and (266.41,47.78) .. (266.41,75.43) .. controls (266.41,103.08) and (244,125.5) .. (216.35,125.5) .. controls (188.69,125.5) and (166.28,103.08) .. (166.28,75.43) -- cycle ;
%Shape: Free Drawing [id:dp702537410998645] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=6] [line join = round][line cap = round] (192.9,36.98) .. controls (190.46,38.37) and (188.04,40.43) .. (186.56,42.9) .. controls (186.33,43.28) and (187.42,42.66) .. (187.83,42.48) .. controls (188.93,41.99) and (190.11,41.71) .. (191.21,41.21) .. controls (191.67,41) and (192.02,40.59) .. (192.47,40.36) .. controls (192.76,40.22) and (193.63,39.89) .. (193.32,39.94) .. controls (189.56,40.57) and (187.18,42.64) .. (184.45,45.01) .. controls (183.08,46.19) and (181.68,47.33) .. (180.22,48.39) .. controls (180.11,48.47) and (179.67,48.45) .. (179.8,48.39) .. controls (184.42,46.23) and (189.08,44.13) .. (193.74,42.05) .. controls (194.15,41.87) and (195.45,41.63) .. (195.01,41.63) .. controls (191.07,41.63) and (185.3,47.08) .. (183.18,48.81) .. controls (182.25,49.57) and (181.44,50.46) .. (180.64,51.35) .. controls (180.17,51.87) and (178.75,52.72) .. (179.38,53.04) .. controls (180.29,53.5) and (181.39,52.59) .. (182.33,52.19) .. controls (187.87,49.91) and (193.3,47.36) .. (198.81,45.01) .. controls (198.94,44.96) and (199.37,44.98) .. (199.23,45.01) .. controls (193.44,46.46) and (190.25,48.45) .. (184.45,51.35) .. controls (183.49,51.83) and (182.44,52.12) .. (181.49,52.62) .. controls (180.04,53.39) and (178.71,54.37) .. (177.26,55.15) .. controls (177.14,55.22) and (176.71,55.2) .. (176.84,55.15) .. controls (184.24,52.69) and (191.53,49.91) .. (198.81,47.12) .. controls (199.64,46.81) and (200.58,46.74) .. (201.35,46.28) .. controls (201.52,46.18) and (201.96,45.8) .. (201.77,45.86) .. controls (194.66,48.05) and (187.64,50.93) .. (181.49,55.15) .. controls (179.84,56.29) and (178.35,57.64) .. (176.84,58.95) .. controls (176.16,59.55) and (174.09,60.01) .. (174.73,60.64) .. controls (175.89,61.81) and (179.51,59.7) .. (180.64,59.38) .. controls (187.06,57.54) and (193.05,55.09) .. (199.23,52.62) .. controls (199.81,52.39) and (204.3,51.45) .. (204.3,50.5) .. controls (204.3,49.85) and (200.11,51.76) .. (200.08,51.77) .. controls (198.42,52.29) and (196.69,52.57) .. (195.01,53.04) .. controls (193.16,53.56) and (191.32,54.09) .. (189.52,54.73) .. controls (186.03,55.96) and (182.54,57.21) .. (178.95,58.11) .. controls (178.76,58.16) and (179.19,57.77) .. (179.38,57.69) .. controls (181.18,56.91) and (183.08,56.38) .. (184.87,55.57) .. controls (187.45,54.41) and (189.86,52.86) .. (192.47,51.77) .. controls (194,51.14) and (195.96,51.25) .. (197.12,50.08) .. controls (197.75,49.45) and (198.86,49.64) .. (199.66,49.24) .. controls (199.83,49.15) and (200.27,48.78) .. (200.08,48.81) .. controls (198.41,49.15) and (196.95,50.17) .. (195.43,50.93) .. controls (188.39,54.45) and (180.52,56.66) .. (174.31,61.49) .. controls (173.86,61.84) and (175.44,61.57) .. (176,61.49) .. controls (178.4,61.15) and (180.83,60.86) .. (183.18,60.22) .. controls (190.7,58.18) and (198.12,55.74) .. (205.57,53.46) .. controls (206.13,53.29) and (204.42,53.66) .. (203.88,53.88) .. controls (203.01,54.26) and (202.19,54.73) .. (201.35,55.15) .. controls (198.64,56.5) and (195.95,57.89) .. (193.32,59.38) .. controls (187.76,62.5) and (181.69,67.1) .. (176,69.94) .. controls (175.74,70.07) and (176.58,70.05) .. (176.84,69.94) .. controls (182.66,67.5) and (188.41,64.92) .. (194.16,62.33) .. controls (195.31,61.82) and (196.44,61.26) .. (197.54,60.64) .. controls (198.09,60.34) and (199.85,59.68) .. (199.23,59.8) .. controls (193.97,60.81) and (188.71,61.98) .. (183.6,63.6) .. controls (181.94,64.13) and (180.21,64.39) .. (178.53,64.87) .. controls (178.23,64.96) and (177.42,65.47) .. (177.69,65.29) .. controls (183.06,61.71) and (191.12,59.19) .. (197.12,57.69) .. controls (197.82,57.51) and (203.24,56.18) .. (203.46,56.84) .. controls (203.76,57.74) and (201.7,57.57) .. (200.92,58.11) .. controls (199.86,58.85) and (199.09,59.99) .. (197.97,60.64) .. controls (191.6,64.37) and (185.04,67.75) .. (178.53,71.21) .. controls (178.14,71.42) and (176.82,71.63) .. (177.26,71.63) .. controls (178.78,71.63) and (180.08,70.5) .. (181.49,69.94) .. controls (188.6,67.09) and (196.2,65.64) .. (203.46,63.18) .. controls (204.9,62.69) and (209.04,60.81) .. (207.68,61.49) .. controls (198.98,65.84) and (191.74,68.69) .. (183.18,73.74) .. controls (181.98,74.45) and (180.49,74.6) .. (179.38,75.43) .. controls (178.8,75.86) and (180.8,75.2) .. (181.49,75.01) .. controls (183.31,74.51) and (184.84,73.25) .. (186.56,72.48) .. controls (190.54,70.68) and (194.71,69.34) .. (198.81,67.83) .. controls (200.95,67.04) and (203.06,66.2) .. (205.15,65.29) .. controls (205.33,65.21) and (205.75,64.79) .. (205.57,64.87) .. controls (200.2,67.29) and (194.7,70.07) .. (189.52,72.9) .. controls (187.63,73.93) and (185.88,75.19) .. (184.02,76.28) .. controls (183.42,76.63) and (181.63,77.55) .. (182.33,77.55) .. controls (186.09,77.55) and (202.6,70.12) .. (203.04,69.94) .. controls (204.62,69.31) and (206.08,68.4) .. (207.68,67.83) .. controls (208.08,67.69) and (209.37,67.83) .. (208.95,67.83) .. controls (207.75,67.83) and (206.65,68.56) .. (205.57,69.1) .. controls (199.71,72.02) and (193.71,74.67) .. (187.83,77.55) .. controls (185.67,78.6) and (178.98,81.36) .. (179.8,81.77) .. controls (180.96,82.35) and (182.37,81.34) .. (183.6,80.93) .. controls (186.86,79.84) and (190.1,78.73) .. (193.32,77.55) .. controls (197.88,75.87) and (202.61,74.72) .. (207.26,73.32) .. controls (209.75,72.57) and (212.19,70.79) .. (214.87,70.79) .. controls (215.07,70.79) and (214.64,71.15) .. (214.44,71.21) .. controls (213.62,71.44) and (212.7,71.3) .. (211.91,71.63) .. controls (210.28,72.3) and (208.88,73.47) .. (207.26,74.17) .. controls (200.92,76.9) and (194.36,79.1) .. (187.83,81.35) .. controls (185.5,82.15) and (181.07,84.31) .. (181.07,84.31) .. controls (181.07,84.31) and (182.17,83.43) .. (182.76,83.04) .. controls (185.73,81.05) and (188.73,79.56) .. (192.05,77.97) .. controls (193.87,77.1) and (198.97,76.86) .. (197.54,75.43) .. controls (196.23,74.12) and (193.83,75.77) .. (192.05,76.28) .. controls (187.93,77.45) and (181.35,80.5) .. (176.84,80.5) .. controls (176.7,80.5) and (177.14,80.57) .. (177.26,80.5) .. controls (179.13,79.44) and (180.89,78.19) .. (182.76,77.12) .. controls (183.85,76.5) and (185.03,76.04) .. (186.14,75.43) .. controls (188.13,74.35) and (190.05,73.12) .. (192.05,72.05) .. controls (192.18,71.99) and (192.62,72.05) .. (192.47,72.05) .. controls (186.99,72.05) and (181.92,74.16) .. (176.84,75.43) .. controls (176.39,75.54) and (173.04,76.28) .. (173.04,76.28) .. controls (173.04,76.28) and (176.8,75.08) .. (178.53,74.17) .. controls (181.59,72.54) and (184.69,70.98) .. (187.83,69.52) .. controls (188.23,69.33) and (187,69.88) .. (186.56,69.94) .. controls (185.72,70.04) and (184.86,69.86) .. (184.02,69.94) .. controls (180.95,70.25) and (176.33,71.07) .. (173.88,71.63) .. controls (173.02,71.83) and (170.62,72.98) .. (171.35,72.48) .. controls (174.68,70.16) and (178.53,68.49) .. (181.49,65.71) .. controls (181.82,65.4) and (176.65,66.76) .. (176,66.98) .. controls (175.32,67.21) and (173.17,67.35) .. (173.88,67.4) .. controls (176.27,67.6) and (178.68,67.28) .. (181.07,67.4) .. controls (181.38,67.42) and (181.95,67.51) .. (181.91,67.83) .. controls (181.84,68.5) and (179.98,72.96) .. (179.38,73.74) .. controls (177.55,76.11) and (175.51,78.32) .. (173.46,80.5) .. controls (173.36,80.61) and (172.9,80.5) .. (173.04,80.5) .. controls (176.8,80.5) and (180.01,78.57) .. (183.6,77.97) .. controls (183.74,77.94) and (183.3,77.89) .. (183.18,77.97) .. controls (181.59,79.03) and (180.13,80.29) .. (178.53,81.35) .. controls (177.16,82.26) and (175.62,82.9) .. (174.31,83.88) .. controls (174.08,84.05) and (174.88,83.94) .. (175.15,83.88) .. controls (176.15,83.66) and (177.16,83.42) .. (178.11,83.04) .. controls (179.98,82.29) and (181.72,81.23) .. (183.6,80.5) .. controls (184.08,80.32) and (182.74,81.04) .. (182.33,81.35) .. controls (179.85,83.21) and (177.6,85.36) .. (175.15,87.26) .. controls (175.04,87.35) and (174.59,87.3) .. (174.73,87.26) .. controls (181.95,85.46) and (188.64,82.47) .. (195.85,80.5) .. controls (197.03,80.18) and (191.9,82.87) .. (185.29,86.84) .. controls (182.79,88.34) and (178.78,90.37) .. (176.42,92.33) .. controls (176.31,92.42) and (176.71,92.38) .. (176.84,92.33) .. controls (183.9,89.56) and (190.98,86.83) .. (197.97,83.88) .. controls (198.63,83.6) and (196.51,84.02) .. (195.85,84.31) .. controls (193.26,85.44) and (190.81,86.89) .. (188.25,88.11) .. controls (187.56,88.44) and (186.81,88.61) .. (186.14,88.95) .. controls (185.96,89.04) and (185.54,89.46) .. (185.71,89.38) .. controls (191.02,86.72) and (195.95,83.01) .. (201.35,80.5) .. controls (203.41,79.54) and (206.67,80) .. (207.68,77.97) .. controls (208.01,77.33) and (206.22,78.09) .. (205.57,78.39) .. controls (203.46,79.37) and (201.27,80.22) .. (199.23,81.35) .. controls (198.52,81.75) and (196.39,82.98) .. (197.12,82.62) .. controls (198.65,81.85) and (206.83,76.7) .. (208.11,77.12) .. controls (209.07,77.44) and (206.45,78.3) .. (205.57,78.81) .. controls (204.76,79.29) and (203.83,79.57) .. (203.04,80.08) .. controls (199.91,82.07) and (196.97,83.98) .. (193.74,86) .. controls (193.1,86.4) and (190.91,87.07) .. (191.63,86.84) .. controls (194.01,86.1) and (196.4,85.36) .. (198.81,84.73) .. controls (209.07,82.05) and (192.6,87.75) .. (189.52,90.22) .. controls (188.92,90.69) and (190.9,89.59) .. (191.63,89.38) .. controls (192.45,89.13) and (193.36,89.26) .. (194.16,88.95) .. controls (196.34,88.12) and (198.36,86.91) .. (200.5,86) .. controls (201.57,85.54) and (203.88,85.15) .. (203.88,85.15) .. controls (203.88,85.15) and (203.29,85.02) .. (203.04,85.15) .. controls (198.52,87.41) and (194.22,90.08) .. (189.94,92.76) .. controls (188.91,93.4) and (185.82,95.2) .. (186.98,94.87) .. controls (191.19,93.67) and (203.36,88.5) .. (205.15,87.69) .. controls (205.72,87.43) and (207.44,86.64) .. (206.84,86.84) .. controls (200.15,89.07) and (194.05,92.81) .. (187.83,96.14) .. controls (187.27,96.43) and (185.53,97.13) .. (186.14,96.98) .. controls (190.69,95.84) and (199.62,90.64) .. (203.04,90.64) .. controls (203.62,90.64) and (201.86,90.79) .. (201.35,91.07) .. controls (196.67,93.6) and (192.38,96.78) .. (187.83,99.52) .. controls (187.12,99.94) and (184.98,100.42) .. (185.71,100.78) .. controls (186.33,101.09) and (188.71,99.66) .. (189.09,99.52) .. controls (192.17,98.33) and (195.32,97.33) .. (198.39,96.14) .. controls (200.39,95.36) and (202.36,94.51) .. (204.3,93.6) .. controls (204.48,93.52) and (204.93,93.18) .. (204.73,93.18) .. controls (200.95,93.18) and (197.28,97.33) .. (194.59,99.09) .. controls (192.32,100.58) and (189.71,101.48) .. (187.4,102.9) .. controls (186.49,103.46) and (189.35,101.99) .. (190.36,101.63) .. controls (193,100.67) and (195.8,100.17) .. (198.39,99.09) .. controls (199.22,98.75) and (199.68,97.76) .. (200.5,97.4) .. controls (202.01,96.74) and (206.73,95.26) .. (205.15,95.71) .. controls (201.62,96.72) and (189.19,103.4) .. (187.4,105.01) .. controls (186.44,105.88) and (189.98,104.59) .. (191.21,104.16) .. controls (195.97,102.52) and (196.97,101.54) .. (200.5,100.36) .. controls (200.64,100.32) and (200.21,100.3) .. (200.08,100.36) .. controls (198.78,101.01) and (197.48,101.66) .. (196.28,102.47) .. controls (193.95,104.06) and (191.69,105.75) .. (189.52,107.54) .. controls (188.97,107.99) and (187.16,109.03) .. (187.83,108.81) .. controls (192.71,107.18) and (198.47,105.16) .. (202.61,102.05) .. controls (203.22,101.6) and (201.12,102.46) .. (200.5,102.9) .. controls (199.6,103.53) and (198.89,104.41) .. (197.97,105.01) .. controls (197.3,105.44) and (189.52,109.5) .. (189.52,110.08) .. controls (189.52,111.24) and (191.77,109.5) .. (192.9,109.23) .. controls (194.92,108.76) and (197.24,109.11) .. (198.81,107.54) .. controls (199.03,107.32) and (198.21,107.77) .. (197.97,107.97) .. controls (197.5,108.34) and (197.16,108.85) .. (196.7,109.23) .. controls (195.3,110.4) and (193.63,111.82) .. (192.05,112.61) .. controls (191.32,112.98) and (193.45,111.75) .. (194.16,111.35) .. controls (195.32,110.69) and (196.46,110) .. (197.54,109.23) .. controls (200.01,107.51) and (196.39,108.58) .. (196.7,107.97) .. controls (197.84,105.68) and (202.71,101.76) .. (203.04,100.78) .. controls (203.4,99.68) and (197.78,101.56) .. (197.54,101.63) .. controls (197.24,101.71) and (196.7,102.37) .. (196.7,102.05) .. controls (196.7,100.79) and (198.99,100.99) .. (200.08,100.36) .. controls (201.3,99.66) and (202.29,98.61) .. (203.46,97.83) .. controls (204.26,97.29) and (205.19,96.67) .. (205.99,96.14) .. controls (206.11,96.06) and (206.56,96.14) .. (206.42,96.14) .. controls (205.37,96.14) and (203.56,96.2) .. (202.61,96.56) .. controls (200.73,97.27) and (198.98,98.3) .. (197.12,99.09) .. controls (196.99,99.15) and (196.57,99.16) .. (196.7,99.09) .. controls (201.07,96.91) and (207.21,94.92) .. (210.64,91.49) .. controls (211.24,90.89) and (208.93,91.31) .. (208.11,91.49) .. controls (206.23,91.88) and (204.43,92.57) .. (202.61,93.18) .. controls (201.74,93.47) and (201.38,93.31) .. (200.5,93.6) .. controls (200.31,93.66) and (199.94,94.16) .. (200.08,94.02) .. controls (202.65,91.45) and (206.22,90.08) .. (208.95,87.69) .. controls (209.91,86.85) and (206.32,87.6) .. (205.15,88.11) .. controls (203.53,88.82) and (202.1,89.9) .. (200.5,90.64) .. controls (199.98,90.89) and (199.36,90.88) .. (198.81,91.07) .. controls (198.51,91.17) and (197.72,91.69) .. (197.97,91.49) .. controls (202.08,88.06) and (203.07,88.14) .. (208.53,84.73) .. controls (208.91,84.49) and (207.69,85.04) .. (207.26,85.15) .. controls (205.97,85.47) and (204.65,85.82) .. (203.46,86.42) .. controls (203.28,86.51) and (202.9,86.98) .. (203.04,86.84) .. controls (203.89,85.98) and (204.98,85.38) .. (205.99,84.73) .. controls (207.44,83.8) and (208.88,82.84) .. (210.22,81.77) .. controls (210.38,81.65) and (210.84,81.35) .. (210.64,81.35) .. controls (208.75,81.35) and (206.42,82.74) .. (204.73,83.46) .. controls (203.51,83.98) and (200.24,86.31) .. (201.35,85.57) .. controls (203.96,83.83) and (206.76,81.82) .. (209.37,80.08) .. controls (209.49,80) and (209.08,80.02) .. (208.95,80.08) .. controls (206.11,81.43) and (203.26,82.78) .. (200.5,84.31) .. controls (194.96,87.37) and (190.26,91.21) .. (184.87,94.45) .. controls (183.39,95.33) and (180.11,95.94) .. (179.38,97.4) .. controls (179.31,97.53) and (179.67,97.45) .. (179.8,97.4) .. controls (180.77,97.08) and (181.86,96.21) .. (182.76,95.71) .. controls (185.19,94.37) and (188.2,93.66) .. (189.94,91.49) .. controls (190.73,90.5) and (187.39,91.28) .. (186.14,91.49) .. controls (183.81,91.88) and (179.38,91.24) .. (179.38,93.6) .. controls (179.38,93.62) and (187.94,89.2) .. (188.25,88.95) .. controls (188.6,88.67) and (187.33,89.1) .. (186.98,89.38) .. controls (185.89,90.25) and (185.05,91.38) .. (184.02,92.33) .. controls (183.07,93.22) and (180.15,93.95) .. (181.07,94.87) .. controls (182.37,96.17) and (186.37,94.02) .. (188.25,94.02) .. controls (190.25,94.02) and (185.74,97.14) .. (184.45,98.67) .. controls (184.06,99.13) and (182.6,100.08) .. (183.18,99.94) .. controls (184.44,99.62) and (185.73,99.43) .. (186.98,99.09) .. controls (187.29,99.01) and (187.83,98.36) .. (187.83,98.67) .. controls (187.83,100.08) and (185.68,100.52) .. (184.45,101.21) .. controls (183.83,101.55) and (182.07,102.61) .. (182.76,102.47) .. controls (184.2,102.18) and (185.6,101.71) .. (186.98,101.21) .. controls (187.17,101.14) and (187.54,100.64) .. (187.4,100.78) .. controls (185.95,102.24) and (183.28,103.95) .. (182.33,105.85) .. controls (182.07,106.37) and (183.47,105.61) .. (184.02,105.43) .. controls (184.07,105.41) and (187.4,104.31) .. (187.4,104.16) .. controls (187.4,103.06) and (183.86,106.72) .. (184.87,106.28) .. controls (186.26,105.66) and (187.66,105.07) .. (189.09,104.59) .. controls (189.52,104.44) and (188.67,105.41) .. (188.67,105.85) .. controls (188.67,105.99) and (188.58,105.54) .. (188.67,105.43) .. controls (189.45,104.52) and (190.47,103.84) .. (191.21,102.9) .. controls (194.81,98.26) and (194.45,97.77) .. (197.12,92.76) .. controls (198.13,90.86) and (200.6,89.15) .. (199.23,86.42) .. controls (198.66,85.28) and (194.96,87.18) .. (195.43,86) .. controls (196.49,83.35) and (199.24,81.54) .. (200.08,78.81) .. controls (200.29,78.14) and (198.58,78.47) .. (197.97,78.81) .. controls (195.95,79.93) and (194.26,81.58) .. (192.47,83.04) .. controls (192.23,83.24) and (193.06,82.79) .. (193.32,82.62) .. controls (194.89,81.52) and (198.49,81.08) .. (197.97,79.24) .. controls (197.46,77.47) and (194.3,79.43) .. (192.47,79.66) .. controls (189.92,79.98) and (187.43,81.1) .. (184.87,80.93) .. controls (184.45,80.9) and (184.74,80.06) .. (184.87,79.66) .. controls (185.48,77.84) and (187.68,77.81) .. (186.98,75.01) .. controls (186.81,74.33) and (185.54,74.8) .. (184.87,75.01) .. controls (181.18,76.16) and (180.2,77.55) .. (177.26,77.55) .. controls (176.76,77.55) and (178.13,77) .. (178.53,76.7) .. controls (179.34,76.09) and (180.16,75.46) .. (181.07,75.01) .. controls (181.07,75.01) and (195.18,70.02) .. (197.12,66.14) .. controls (197.82,64.75) and (194.01,65.93) .. (192.47,65.71) .. controls (191.66,65.6) and (193.88,64.86) .. (194.59,64.45) .. controls (197.09,62.98) and (199.44,61.14) .. (202.19,60.22) .. controls (202.31,60.18) and (206.08,59.04) .. (205.57,58.53) .. controls (205.42,58.39) and (204.8,58.89) .. (204.73,58.95) .. controls (203.4,60.01) and (201.52,60.75) .. (200.92,62.33) .. controls (200.63,63.13) and (202.62,62.4) .. (203.46,62.33) .. controls (204.59,62.25) and (205.73,62.16) .. (206.84,61.91) .. controls (207.27,61.82) and (208.31,61.09) .. (208.11,61.49) .. controls (207.39,62.92) and (205.61,63.54) .. (204.73,64.87) .. controls (204.31,65.5) and (206.09,64.15) .. (206.84,64.02) .. controls (208.02,63.83) and (208.64,63.89) .. (209.8,63.6) .. controls (210.1,63.53) and (210.88,62.97) .. (210.64,63.18) .. controls (208.82,64.74) and (206.85,66.13) .. (205.15,67.83) .. controls (204.61,68.36) and (206.54,67.22) .. (207.26,66.98) .. controls (208.11,66.7) and (209,66.54) .. (209.8,66.14) .. controls (209.92,66.07) and (210.22,66.14) .. (210.22,66.14) .. controls (210.22,66.14) and (204.3,70.36) .. (204.3,70.36) .. controls (204.86,70.92) and (209.09,69.8) .. (209.8,69.1) .. controls (210.74,68.16) and (207.56,70.52) .. (206.42,71.21) .. controls (205.34,71.86) and (204.02,72.11) .. (203.04,72.9) .. controls (202.71,73.16) and (203.89,72.98) .. (204.3,72.9) .. controls (205.18,72.72) and (206.01,72.38) .. (206.84,72.05) .. controls (207.02,71.98) and (207.44,71.54) .. (207.26,71.63) .. controls (199.15,75.69) and (191,78.57) .. (183.18,83.04) .. controls (180.96,84.31) and (177.34,85.24) .. (175.57,86.42) .. controls (175.2,86.66) and (174.7,86.64) .. (174.31,86.84) .. controls (173.91,87.04) and (172.59,87.26) .. (173.04,87.26) .. controls (176.14,87.26) and (178.69,85.15) .. (181.91,85.15) .. controls (182.19,85.15) and (181.32,85.02) .. (181.07,85.15) .. controls (178.7,86.33) and (176.15,87.17) .. (173.88,88.53) .. controls (173.64,88.68) and (174.08,89.18) .. (173.88,89.38) .. controls (173.68,89.57) and (172.77,89.45) .. (173.04,89.38) .. controls (175.27,88.74) and (177.57,88.32) .. (179.8,87.69) .. controls (180.53,87.48) and (181.19,87.08) .. (181.91,86.84) .. controls (188.25,84.73) and (176.71,88.4) .. (176.42,88.53) .. controls (175.28,89.06) and (174.3,90.22) .. (173.04,90.22) .. controls (172.9,90.22) and (173.34,90.28) .. (173.46,90.22) .. controls (174.48,89.71) and (175.48,89.17) .. (176.42,88.53) .. controls (177.61,87.71) and (186.24,81.92) .. (186.56,80.08) .. controls (187.13,76.75) and (186.69,73.32) .. (186.56,69.94) .. controls (186.53,69.08) and (186.95,67.68) .. (186.14,67.4) .. controls (182.44,66.17) and (180.96,67.84) .. (177.69,68.25) .. controls (176.79,68.36) and (178.81,66.84) .. (179.38,66.14) .. controls (180.57,64.64) and (181.81,63.15) .. (182.76,61.49) .. controls (182.97,61.12) and (183.13,60.41) .. (182.76,60.22) .. controls (181.74,59.71) and (180.47,60.33) .. (179.38,60.64) .. controls (176.29,61.54) and (173.14,62.2) .. (170.08,63.18) .. controls (169.48,63.37) and (167.76,64.02) .. (168.39,64.02) .. controls (171.07,64.02) and (172.96,61.49) .. (175.57,61.49) ;
%Shape: Circle [id:dp26651215999910927] 
\draw   (360,74.43) .. controls (360,46.78) and (382.42,24.37) .. (410.07,24.37) .. controls (437.72,24.37) and (460.13,46.78) .. (460.13,74.43) .. controls (460.13,102.08) and (437.72,124.5) .. (410.07,124.5) .. controls (382.42,124.5) and (360,102.08) .. (360,74.43) -- cycle ;
%Shape: Circle [id:dp6536384159124671] 
\draw   (410.28,74.43) .. controls (410.28,46.78) and (432.69,24.37) .. (460.35,24.37) .. controls (488,24.37) and (510.41,46.78) .. (510.41,74.43) .. controls (510.41,102.08) and (488,124.5) .. (460.35,124.5) .. controls (432.69,124.5) and (410.28,102.08) .. (410.28,74.43) -- cycle ;
%Shape: Free Drawing [id:dp6023952838342782] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=6] [line join = round][line cap = round] (436.9,35.98) .. controls (434.46,37.37) and (432.04,39.43) .. (430.56,41.9) .. controls (430.33,42.28) and (431.42,41.66) .. (431.83,41.48) .. controls (432.93,40.99) and (434.11,40.71) .. (435.21,40.21) .. controls (435.67,40) and (436.02,39.59) .. (436.47,39.36) .. controls (436.76,39.22) and (437.63,38.89) .. (437.32,38.94) .. controls (433.56,39.57) and (431.18,41.64) .. (428.45,44.01) .. controls (427.08,45.19) and (425.68,46.33) .. (424.22,47.39) .. controls (424.11,47.47) and (423.67,47.45) .. (423.8,47.39) .. controls (428.42,45.23) and (433.08,43.13) .. (437.74,41.05) .. controls (438.15,40.87) and (439.45,40.63) .. (439.01,40.63) .. controls (435.07,40.63) and (429.3,46.08) .. (427.18,47.81) .. controls (426.25,48.57) and (425.44,49.46) .. (424.64,50.35) .. controls (424.17,50.87) and (422.75,51.72) .. (423.38,52.04) .. controls (424.29,52.5) and (425.39,51.59) .. (426.33,51.19) .. controls (431.87,48.91) and (437.3,46.36) .. (442.81,44.01) .. controls (442.94,43.96) and (443.37,43.98) .. (443.23,44.01) .. controls (437.44,45.46) and (434.25,47.45) .. (428.45,50.35) .. controls (427.49,50.83) and (426.44,51.12) .. (425.49,51.62) .. controls (424.04,52.39) and (422.71,53.37) .. (421.26,54.15) .. controls (421.14,54.22) and (420.71,54.2) .. (420.84,54.15) .. controls (428.24,51.69) and (435.53,48.91) .. (442.81,46.12) .. controls (443.64,45.81) and (444.58,45.74) .. (445.35,45.28) .. controls (445.52,45.18) and (445.96,44.8) .. (445.77,44.86) .. controls (438.66,47.05) and (431.64,49.93) .. (425.49,54.15) .. controls (423.84,55.29) and (422.35,56.64) .. (420.84,57.95) .. controls (420.16,58.55) and (418.09,59.01) .. (418.73,59.64) .. controls (419.89,60.81) and (423.51,58.7) .. (424.64,58.38) .. controls (431.06,56.54) and (437.05,54.09) .. (443.23,51.62) .. controls (443.81,51.39) and (448.3,50.45) .. (448.3,49.5) .. controls (448.3,48.85) and (444.11,50.76) .. (444.08,50.77) .. controls (442.42,51.29) and (440.69,51.57) .. (439.01,52.04) .. controls (437.16,52.56) and (435.32,53.09) .. (433.52,53.73) .. controls (430.03,54.96) and (426.54,56.21) .. (422.95,57.11) .. controls (422.76,57.16) and (423.19,56.77) .. (423.38,56.69) .. controls (425.18,55.91) and (427.08,55.38) .. (428.87,54.57) .. controls (431.45,53.41) and (433.86,51.86) .. (436.47,50.77) .. controls (438,50.14) and (439.96,50.25) .. (441.12,49.08) .. controls (441.75,48.45) and (442.86,48.64) .. (443.66,48.24) .. controls (443.83,48.15) and (444.27,47.78) .. (444.08,47.81) .. controls (442.41,48.15) and (440.95,49.17) .. (439.43,49.93) .. controls (432.39,53.45) and (424.52,55.66) .. (418.31,60.49) .. controls (417.86,60.84) and (419.44,60.57) .. (420,60.49) .. controls (422.4,60.15) and (424.83,59.86) .. (427.18,59.22) .. controls (434.7,57.18) and (442.12,54.74) .. (449.57,52.46) .. controls (450.13,52.29) and (448.42,52.66) .. (447.88,52.88) .. controls (447.01,53.26) and (446.19,53.73) .. (445.35,54.15) .. controls (442.64,55.5) and (439.95,56.89) .. (437.32,58.38) .. controls (431.76,61.5) and (425.69,66.1) .. (420,68.94) .. controls (419.74,69.07) and (420.58,69.05) .. (420.84,68.94) .. controls (426.66,66.5) and (432.41,63.92) .. (438.16,61.33) .. controls (439.31,60.82) and (440.44,60.26) .. (441.54,59.64) .. controls (442.09,59.34) and (443.85,58.68) .. (443.23,58.8) .. controls (437.97,59.81) and (432.71,60.98) .. (427.6,62.6) .. controls (425.94,63.13) and (424.21,63.39) .. (422.53,63.87) .. controls (422.23,63.96) and (421.42,64.47) .. (421.69,64.29) .. controls (427.06,60.71) and (435.12,58.19) .. (441.12,56.69) .. controls (441.82,56.51) and (447.24,55.18) .. (447.46,55.84) .. controls (447.76,56.74) and (445.7,56.57) .. (444.92,57.11) .. controls (443.86,57.85) and (443.09,58.99) .. (441.97,59.64) .. controls (435.6,63.37) and (429.04,66.75) .. (422.53,70.21) .. controls (422.14,70.42) and (420.82,70.63) .. (421.26,70.63) .. controls (422.78,70.63) and (424.08,69.5) .. (425.49,68.94) .. controls (432.6,66.09) and (440.2,64.64) .. (447.46,62.18) .. controls (448.9,61.69) and (453.04,59.81) .. (451.68,60.49) .. controls (442.98,64.84) and (435.74,67.69) .. (427.18,72.74) .. controls (425.98,73.45) and (424.49,73.6) .. (423.38,74.43) .. controls (422.8,74.86) and (424.8,74.2) .. (425.49,74.01) .. controls (427.31,73.51) and (428.84,72.25) .. (430.56,71.48) .. controls (434.54,69.68) and (438.71,68.34) .. (442.81,66.83) .. controls (444.95,66.04) and (447.06,65.2) .. (449.15,64.29) .. controls (449.33,64.21) and (449.75,63.79) .. (449.57,63.87) .. controls (444.2,66.29) and (438.7,69.07) .. (433.52,71.9) .. controls (431.63,72.93) and (429.88,74.19) .. (428.02,75.28) .. controls (427.42,75.63) and (425.63,76.55) .. (426.33,76.55) .. controls (430.09,76.55) and (446.6,69.12) .. (447.04,68.94) .. controls (448.62,68.31) and (450.08,67.4) .. (451.68,66.83) .. controls (452.08,66.69) and (453.37,66.83) .. (452.95,66.83) .. controls (451.75,66.83) and (450.65,67.56) .. (449.57,68.1) .. controls (443.71,71.02) and (437.71,73.67) .. (431.83,76.55) .. controls (429.67,77.6) and (422.98,80.36) .. (423.8,80.77) .. controls (424.96,81.35) and (426.37,80.34) .. (427.6,79.93) .. controls (430.86,78.84) and (434.1,77.73) .. (437.32,76.55) .. controls (441.88,74.87) and (446.61,73.72) .. (451.26,72.32) .. controls (453.75,71.57) and (456.19,69.79) .. (458.87,69.79) .. controls (459.07,69.79) and (458.64,70.15) .. (458.44,70.21) .. controls (457.62,70.44) and (456.7,70.3) .. (455.91,70.63) .. controls (454.28,71.3) and (452.88,72.47) .. (451.26,73.17) .. controls (444.92,75.9) and (438.36,78.1) .. (431.83,80.35) .. controls (429.5,81.15) and (425.07,83.31) .. (425.07,83.31) .. controls (425.07,83.31) and (426.17,82.43) .. (426.76,82.04) .. controls (429.73,80.05) and (432.73,78.56) .. (436.05,76.97) .. controls (437.87,76.1) and (442.97,75.86) .. (441.54,74.43) .. controls (440.23,73.12) and (437.83,74.77) .. (436.05,75.28) .. controls (431.93,76.45) and (425.35,79.5) .. (420.84,79.5) .. controls (420.7,79.5) and (421.14,79.57) .. (421.26,79.5) .. controls (423.13,78.44) and (424.89,77.19) .. (426.76,76.12) .. controls (427.85,75.5) and (429.03,75.04) .. (430.14,74.43) .. controls (432.13,73.35) and (434.05,72.12) .. (436.05,71.05) .. controls (436.18,70.99) and (436.62,71.05) .. (436.47,71.05) .. controls (430.99,71.05) and (425.92,73.16) .. (420.84,74.43) .. controls (420.39,74.54) and (417.04,75.28) .. (417.04,75.28) .. controls (417.04,75.28) and (420.8,74.08) .. (422.53,73.17) .. controls (425.59,71.54) and (428.69,69.98) .. (431.83,68.52) .. controls (432.23,68.33) and (431,68.88) .. (430.56,68.94) .. controls (429.72,69.04) and (428.86,68.86) .. (428.02,68.94) .. controls (424.95,69.25) and (420.33,70.07) .. (417.88,70.63) .. controls (417.02,70.83) and (414.62,71.98) .. (415.35,71.48) .. controls (418.68,69.16) and (422.53,67.49) .. (425.49,64.71) .. controls (425.82,64.4) and (420.65,65.76) .. (420,65.98) .. controls (419.32,66.21) and (417.17,66.35) .. (417.88,66.4) .. controls (420.27,66.6) and (422.68,66.28) .. (425.07,66.4) .. controls (425.38,66.42) and (425.95,66.51) .. (425.91,66.83) .. controls (425.84,67.5) and (423.98,71.96) .. (423.38,72.74) .. controls (421.55,75.11) and (419.51,77.32) .. (417.46,79.5) .. controls (417.36,79.61) and (416.9,79.5) .. (417.04,79.5) .. controls (420.8,79.5) and (424.01,77.57) .. (427.6,76.97) .. controls (427.74,76.94) and (427.3,76.89) .. (427.18,76.97) .. controls (425.59,78.03) and (424.13,79.29) .. (422.53,80.35) .. controls (421.16,81.26) and (419.62,81.9) .. (418.31,82.88) .. controls (418.08,83.05) and (418.88,82.94) .. (419.15,82.88) .. controls (420.15,82.66) and (421.16,82.42) .. (422.11,82.04) .. controls (423.98,81.29) and (425.72,80.23) .. (427.6,79.5) .. controls (428.08,79.32) and (426.74,80.04) .. (426.33,80.35) .. controls (423.85,82.21) and (421.6,84.36) .. (419.15,86.26) .. controls (419.04,86.35) and (418.59,86.3) .. (418.73,86.26) .. controls (425.95,84.46) and (432.64,81.47) .. (439.85,79.5) .. controls (441.03,79.18) and (435.9,81.87) .. (429.29,85.84) .. controls (426.79,87.34) and (422.78,89.37) .. (420.42,91.33) .. controls (420.31,91.42) and (420.71,91.38) .. (420.84,91.33) .. controls (427.9,88.56) and (434.98,85.83) .. (441.97,82.88) .. controls (442.63,82.6) and (440.51,83.02) .. (439.85,83.31) .. controls (437.26,84.44) and (434.81,85.89) .. (432.25,87.11) .. controls (431.56,87.44) and (430.81,87.61) .. (430.14,87.95) .. controls (429.96,88.04) and (429.54,88.46) .. (429.71,88.38) .. controls (435.02,85.72) and (439.95,82.01) .. (445.35,79.5) .. controls (447.41,78.54) and (450.67,79) .. (451.68,76.97) .. controls (452.01,76.33) and (450.22,77.09) .. (449.57,77.39) .. controls (447.46,78.37) and (445.27,79.22) .. (443.23,80.35) .. controls (442.52,80.75) and (440.39,81.98) .. (441.12,81.62) .. controls (442.65,80.85) and (450.83,75.7) .. (452.11,76.12) .. controls (453.07,76.44) and (450.45,77.3) .. (449.57,77.81) .. controls (448.76,78.29) and (447.83,78.57) .. (447.04,79.08) .. controls (443.91,81.07) and (440.97,82.98) .. (437.74,85) .. controls (437.1,85.4) and (434.91,86.07) .. (435.63,85.84) .. controls (438.01,85.1) and (440.4,84.36) .. (442.81,83.73) .. controls (453.07,81.05) and (436.6,86.75) .. (433.52,89.22) .. controls (432.92,89.69) and (434.9,88.59) .. (435.63,88.38) .. controls (436.45,88.13) and (437.36,88.26) .. (438.16,87.95) .. controls (440.34,87.12) and (442.36,85.91) .. (444.5,85) .. controls (445.57,84.54) and (447.88,84.15) .. (447.88,84.15) .. controls (447.88,84.15) and (447.29,84.02) .. (447.04,84.15) .. controls (442.52,86.41) and (438.22,89.08) .. (433.94,91.76) .. controls (432.91,92.4) and (429.82,94.2) .. (430.98,93.87) .. controls (435.19,92.67) and (447.36,87.5) .. (449.15,86.69) .. controls (449.72,86.43) and (451.44,85.64) .. (450.84,85.84) .. controls (444.15,88.07) and (438.05,91.81) .. (431.83,95.14) .. controls (431.27,95.43) and (429.53,96.13) .. (430.14,95.98) .. controls (434.69,94.84) and (443.62,89.64) .. (447.04,89.64) .. controls (447.62,89.64) and (445.86,89.79) .. (445.35,90.07) .. controls (440.67,92.6) and (436.38,95.78) .. (431.83,98.52) .. controls (431.12,98.94) and (428.98,99.42) .. (429.71,99.78) .. controls (430.33,100.09) and (432.71,98.66) .. (433.09,98.52) .. controls (436.17,97.33) and (439.32,96.33) .. (442.39,95.14) .. controls (444.39,94.36) and (446.36,93.51) .. (448.3,92.6) .. controls (448.48,92.52) and (448.93,92.18) .. (448.73,92.18) .. controls (444.95,92.18) and (441.28,96.33) .. (438.59,98.09) .. controls (436.32,99.58) and (433.71,100.48) .. (431.4,101.9) .. controls (430.49,102.46) and (433.35,100.99) .. (434.36,100.63) .. controls (437,99.67) and (439.8,99.17) .. (442.39,98.09) .. controls (443.22,97.75) and (443.68,96.76) .. (444.5,96.4) .. controls (446.01,95.74) and (450.73,94.26) .. (449.15,94.71) .. controls (445.62,95.72) and (433.19,102.4) .. (431.4,104.01) .. controls (430.44,104.88) and (433.98,103.59) .. (435.21,103.16) .. controls (439.97,101.52) and (440.97,100.54) .. (444.5,99.36) .. controls (444.64,99.32) and (444.21,99.3) .. (444.08,99.36) .. controls (442.78,100.01) and (441.48,100.66) .. (440.28,101.47) .. controls (437.95,103.06) and (435.69,104.75) .. (433.52,106.54) .. controls (432.97,106.99) and (431.16,108.03) .. (431.83,107.81) .. controls (436.71,106.18) and (442.47,104.16) .. (446.61,101.05) .. controls (447.22,100.6) and (445.12,101.46) .. (444.5,101.9) .. controls (443.6,102.53) and (442.89,103.41) .. (441.97,104.01) .. controls (441.3,104.44) and (433.52,108.5) .. (433.52,109.08) .. controls (433.52,110.24) and (435.77,108.5) .. (436.9,108.23) .. controls (438.92,107.76) and (441.24,108.11) .. (442.81,106.54) .. controls (443.03,106.32) and (442.21,106.77) .. (441.97,106.97) .. controls (441.5,107.34) and (441.16,107.85) .. (440.7,108.23) .. controls (439.3,109.4) and (437.63,110.82) .. (436.05,111.61) .. controls (435.32,111.98) and (437.45,110.75) .. (438.16,110.35) .. controls (439.32,109.69) and (440.46,109) .. (441.54,108.23) .. controls (444.01,106.51) and (440.39,107.58) .. (440.7,106.97) .. controls (441.84,104.68) and (446.71,100.76) .. (447.04,99.78) .. controls (447.4,98.68) and (441.78,100.56) .. (441.54,100.63) .. controls (441.24,100.71) and (440.7,101.37) .. (440.7,101.05) .. controls (440.7,99.79) and (442.99,99.99) .. (444.08,99.36) .. controls (445.3,98.66) and (446.29,97.61) .. (447.46,96.83) .. controls (448.26,96.29) and (449.19,95.67) .. (449.99,95.14) .. controls (450.11,95.06) and (450.56,95.14) .. (450.42,95.14) .. controls (449.37,95.14) and (447.56,95.2) .. (446.61,95.56) .. controls (444.73,96.27) and (442.98,97.3) .. (441.12,98.09) .. controls (440.99,98.15) and (440.57,98.16) .. (440.7,98.09) .. controls (445.07,95.91) and (451.21,93.92) .. (454.64,90.49) .. controls (455.24,89.89) and (452.93,90.31) .. (452.11,90.49) .. controls (450.23,90.88) and (448.43,91.57) .. (446.61,92.18) .. controls (445.74,92.47) and (445.38,92.31) .. (444.5,92.6) .. controls (444.31,92.66) and (443.94,93.16) .. (444.08,93.02) .. controls (446.65,90.45) and (450.22,89.08) .. (452.95,86.69) .. controls (453.91,85.85) and (450.32,86.6) .. (449.15,87.11) .. controls (447.53,87.82) and (446.1,88.9) .. (444.5,89.64) .. controls (443.98,89.89) and (443.36,89.88) .. (442.81,90.07) .. controls (442.51,90.17) and (441.72,90.69) .. (441.97,90.49) .. controls (446.08,87.06) and (447.07,87.14) .. (452.53,83.73) .. controls (452.91,83.49) and (451.69,84.04) .. (451.26,84.15) .. controls (449.97,84.47) and (448.65,84.82) .. (447.46,85.42) .. controls (447.28,85.51) and (446.9,85.98) .. (447.04,85.84) .. controls (447.89,84.98) and (448.98,84.38) .. (449.99,83.73) .. controls (451.44,82.8) and (452.88,81.84) .. (454.22,80.77) .. controls (454.38,80.65) and (454.84,80.35) .. (454.64,80.35) .. controls (452.75,80.35) and (450.42,81.74) .. (448.73,82.46) .. controls (447.51,82.98) and (444.24,85.31) .. (445.35,84.57) .. controls (447.96,82.83) and (450.76,80.82) .. (453.37,79.08) .. controls (453.49,79) and (453.08,79.02) .. (452.95,79.08) .. controls (450.11,80.43) and (447.26,81.78) .. (444.5,83.31) .. controls (438.96,86.37) and (434.26,90.21) .. (428.87,93.45) .. controls (427.39,94.33) and (424.11,94.94) .. (423.38,96.4) .. controls (423.31,96.53) and (423.67,96.45) .. (423.8,96.4) .. controls (424.77,96.08) and (425.86,95.21) .. (426.76,94.71) .. controls (429.19,93.37) and (432.2,92.66) .. (433.94,90.49) .. controls (434.73,89.5) and (431.39,90.28) .. (430.14,90.49) .. controls (427.81,90.88) and (423.38,90.24) .. (423.38,92.6) .. controls (423.38,92.62) and (431.94,88.2) .. (432.25,87.95) .. controls (432.6,87.67) and (431.33,88.1) .. (430.98,88.38) .. controls (429.89,89.25) and (429.05,90.38) .. (428.02,91.33) .. controls (427.07,92.22) and (424.15,92.95) .. (425.07,93.87) .. controls (426.37,95.17) and (430.37,93.02) .. (432.25,93.02) .. controls (434.25,93.02) and (429.74,96.14) .. (428.45,97.67) .. controls (428.06,98.13) and (426.6,99.08) .. (427.18,98.94) .. controls (428.44,98.62) and (429.73,98.43) .. (430.98,98.09) .. controls (431.29,98.01) and (431.83,97.36) .. (431.83,97.67) .. controls (431.83,99.08) and (429.68,99.52) .. (428.45,100.21) .. controls (427.83,100.55) and (426.07,101.61) .. (426.76,101.47) .. controls (428.2,101.18) and (429.6,100.71) .. (430.98,100.21) .. controls (431.17,100.14) and (431.54,99.64) .. (431.4,99.78) .. controls (429.95,101.24) and (427.28,102.95) .. (426.33,104.85) .. controls (426.07,105.37) and (427.47,104.61) .. (428.02,104.43) .. controls (428.07,104.41) and (431.4,103.31) .. (431.4,103.16) .. controls (431.4,102.06) and (427.86,105.72) .. (428.87,105.28) .. controls (430.26,104.66) and (431.66,104.07) .. (433.09,103.59) .. controls (433.52,103.44) and (432.67,104.41) .. (432.67,104.85) .. controls (432.67,104.99) and (432.58,104.54) .. (432.67,104.43) .. controls (433.45,103.52) and (434.47,102.84) .. (435.21,101.9) .. controls (438.81,97.26) and (438.45,96.77) .. (441.12,91.76) .. controls (442.13,89.86) and (444.6,88.15) .. (443.23,85.42) .. controls (442.66,84.28) and (438.96,86.18) .. (439.43,85) .. controls (440.49,82.35) and (443.24,80.54) .. (444.08,77.81) .. controls (444.29,77.14) and (442.58,77.47) .. (441.97,77.81) .. controls (439.95,78.93) and (438.26,80.58) .. (436.47,82.04) .. controls (436.23,82.24) and (437.06,81.79) .. (437.32,81.62) .. controls (438.89,80.52) and (442.49,80.08) .. (441.97,78.24) .. controls (441.46,76.47) and (438.3,78.43) .. (436.47,78.66) .. controls (433.92,78.98) and (431.43,80.1) .. (428.87,79.93) .. controls (428.45,79.9) and (428.74,79.06) .. (428.87,78.66) .. controls (429.48,76.84) and (431.68,76.81) .. (430.98,74.01) .. controls (430.81,73.33) and (429.54,73.8) .. (428.87,74.01) .. controls (425.18,75.16) and (424.2,76.55) .. (421.26,76.55) .. controls (420.76,76.55) and (422.13,76) .. (422.53,75.7) .. controls (423.34,75.09) and (424.16,74.46) .. (425.07,74.01) .. controls (425.07,74.01) and (439.18,69.02) .. (441.12,65.14) .. controls (441.82,63.75) and (438.01,64.93) .. (436.47,64.71) .. controls (435.66,64.6) and (437.88,63.86) .. (438.59,63.45) .. controls (441.09,61.98) and (443.44,60.14) .. (446.19,59.22) .. controls (446.31,59.18) and (450.08,58.04) .. (449.57,57.53) .. controls (449.42,57.39) and (448.8,57.89) .. (448.73,57.95) .. controls (447.4,59.01) and (445.52,59.75) .. (444.92,61.33) .. controls (444.63,62.13) and (446.62,61.4) .. (447.46,61.33) .. controls (448.59,61.25) and (449.73,61.16) .. (450.84,60.91) .. controls (451.27,60.82) and (452.31,60.09) .. (452.11,60.49) .. controls (451.39,61.92) and (449.61,62.54) .. (448.73,63.87) .. controls (448.31,64.5) and (450.09,63.15) .. (450.84,63.02) .. controls (452.02,62.83) and (452.64,62.89) .. (453.8,62.6) .. controls (454.1,62.53) and (454.88,61.97) .. (454.64,62.18) .. controls (452.82,63.74) and (450.85,65.13) .. (449.15,66.83) .. controls (448.61,67.36) and (450.54,66.22) .. (451.26,65.98) .. controls (452.11,65.7) and (453,65.54) .. (453.8,65.14) .. controls (453.92,65.07) and (454.22,65.14) .. (454.22,65.14) .. controls (454.22,65.14) and (448.3,69.36) .. (448.3,69.36) .. controls (448.86,69.92) and (453.09,68.8) .. (453.8,68.1) .. controls (454.74,67.16) and (451.56,69.52) .. (450.42,70.21) .. controls (449.34,70.86) and (448.02,71.11) .. (447.04,71.9) .. controls (446.71,72.16) and (447.89,71.98) .. (448.3,71.9) .. controls (449.18,71.72) and (450.01,71.38) .. (450.84,71.05) .. controls (451.02,70.98) and (451.44,70.54) .. (451.26,70.63) .. controls (443.15,74.69) and (435,77.57) .. (427.18,82.04) .. controls (424.96,83.31) and (421.34,84.24) .. (419.57,85.42) .. controls (419.2,85.66) and (418.7,85.64) .. (418.31,85.84) .. controls (417.91,86.04) and (416.59,86.26) .. (417.04,86.26) .. controls (420.14,86.26) and (422.69,84.15) .. (425.91,84.15) .. controls (426.19,84.15) and (425.32,84.02) .. (425.07,84.15) .. controls (422.7,85.33) and (420.15,86.17) .. (417.88,87.53) .. controls (417.64,87.68) and (418.08,88.18) .. (417.88,88.38) .. controls (417.68,88.57) and (416.77,88.45) .. (417.04,88.38) .. controls (419.27,87.74) and (421.57,87.32) .. (423.8,86.69) .. controls (424.53,86.48) and (425.19,86.08) .. (425.91,85.84) .. controls (432.25,83.73) and (420.71,87.4) .. (420.42,87.53) .. controls (419.28,88.06) and (418.3,89.22) .. (417.04,89.22) .. controls (416.9,89.22) and (417.34,89.28) .. (417.46,89.22) .. controls (418.48,88.71) and (419.48,88.17) .. (420.42,87.53) .. controls (421.61,86.71) and (430.24,80.92) .. (430.56,79.08) .. controls (431.13,75.75) and (430.69,72.32) .. (430.56,68.94) .. controls (430.53,68.08) and (430.95,66.68) .. (430.14,66.4) .. controls (426.44,65.17) and (424.96,66.84) .. (421.69,67.25) .. controls (420.79,67.36) and (422.81,65.84) .. (423.38,65.14) .. controls (424.57,63.64) and (425.81,62.15) .. (426.76,60.49) .. controls (426.97,60.12) and (427.13,59.41) .. (426.76,59.22) .. controls (425.74,58.71) and (424.47,59.33) .. (423.38,59.64) .. controls (420.29,60.54) and (417.14,61.2) .. (414.08,62.18) .. controls (413.48,62.37) and (411.76,63.02) .. (412.39,63.02) .. controls (415.07,63.02) and (416.96,60.49) .. (419.57,60.49) ;
%Shape: Free Drawing [id:dp10829018737391016] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=6] [line join = round][line cap = round] (411,34.5) .. controls (408.21,35.06) and (405.55,36.23) .. (403,37.5) .. controls (401.39,38.3) and (396.2,39.5) .. (398,39.5) .. controls (400.85,39.5) and (407.27,33.95) .. (406,36.5) .. controls (403.19,42.13) and (393.77,41.47) .. (390,46.5) .. controls (343.94,107.91) and (402,36.9) .. (402,42.5) .. controls (402,47.59) and (392.6,54.1) .. (389,50.5) .. controls (385.79,47.29) and (398.75,46.86) .. (400,42.5) .. controls (400.83,39.6) and (391,46.52) .. (391,43.5) .. controls (391,39.89) and (403.42,38.64) .. (400,37.5) .. controls (397,36.5) and (389.37,43.21) .. (391,40.5) .. controls (393.07,37.06) and (396.87,35.01) .. (400,32.5) .. controls (400.26,32.29) and (401.32,32.39) .. (401,32.5) .. controls (395.19,34.44) and (389.6,37.01) .. (384,39.5) .. controls (381.11,40.78) and (394.59,33.77) .. (393,36.5) .. controls (390.58,40.65) and (387.79,44.59) .. (385,48.5) .. controls (384.45,49.27) and (382.58,49.66) .. (383,50.5) .. controls (384.3,53.09) and (398.03,43.47) .. (394,47.5) .. controls (388.77,52.73) and (381.23,55.27) .. (376,60.5) .. controls (373.36,63.14) and (382.62,57.08) .. (386,55.5) .. controls (387.63,54.74) and (392.56,52.61) .. (391,53.5) .. controls (387.12,55.72) and (382.83,57.2) .. (379,59.5) .. controls (377.79,60.23) and (374.69,63.03) .. (376,62.5) .. controls (376.06,62.48) and (390,54.36) .. (390,52.5) .. controls (390,50.56) and (383.26,54.63) .. (385,55.5) .. controls (385.88,55.94) and (396.5,51.25) .. (398,50.5) .. controls (400.08,49.46) and (394.55,52.65) .. (395,52.5) .. controls (399.4,51.03) and (408,47.5) .. (408,47.5) .. controls (408,47.5) and (402,52.17) .. (399,54.5) .. controls (394.26,58.19) and (389.1,61.32) .. (384,64.5) .. controls (383.11,65.06) and (379.98,65.76) .. (381,65.5) .. controls (382.65,65.09) and (384.42,65.13) .. (386,64.5) .. controls (387.8,63.78) and (389.22,62.28) .. (391,61.5) .. controls (394.57,59.94) and (398.38,58.95) .. (402,57.5) .. controls (402.31,57.38) and (401.29,57.34) .. (401,57.5) .. controls (394.41,61.05) and (379.8,67.65) .. (372,73.5) .. controls (371.85,73.61) and (370,75.5) .. (370,75.5) .. controls (370,75.5) and (370.98,72.59) .. (372,71.5) .. controls (375.86,67.36) and (381.82,64.72) .. (384,59.5) .. controls (384.9,57.35) and (379.32,59.29) .. (377,59.5) .. controls (373.91,59.78) and (367.03,63.49) .. (365,64.5) .. controls (364.7,64.65) and (365.71,64.67) .. (366,64.5) .. controls (369.09,62.64) and (372.12,60.66) .. (375,58.5) .. controls (376.53,57.35) and (384.58,50.08) .. (385,50.5) .. controls (385.24,50.74) and (385.21,51.24) .. (385,51.5) .. controls (381.09,56.39) and (373.95,62.34) .. (371,67.5) .. controls (370.5,68.37) and (373.05,67.82) .. (374,67.5) .. controls (378.11,66.13) and (386,58.17) .. (386,62.5) .. controls (386,68.16) and (378.13,70.63) .. (374,74.5) .. controls (372.78,75.64) and (371.18,76.32) .. (370,77.5) .. controls (369.33,78.17) and (367.11,79.8) .. (368,79.5) .. controls (368.14,79.45) and (386,75.2) .. (386,72.5) .. controls (386,66.03) and (375.09,79.5) .. (370,83.5) .. controls (368.83,84.42) and (372.67,82.17) .. (374,81.5) .. controls (375.37,80.81) and (388.7,75.57) .. (391,72.5) .. controls (391.6,71.7) and (388.89,72.05) .. (388,72.5) .. controls (384.6,74.2) and (381.11,76.34) .. (379,79.5) .. controls (378.24,80.64) and (381.82,79.21) .. (383,78.5) .. controls (386.89,76.17) and (390.23,73.01) .. (394,70.5) .. controls (395.86,69.26) and (406,63.5) .. (406,63.5) .. controls (406,63.5) and (404.59,63.19) .. (404,63.5) .. controls (397.52,66.9) and (391.09,70.44) .. (385,74.5) .. controls (383.04,75.81) and (381.81,77.99) .. (380,79.5) .. controls (379.74,79.71) and (378.67,79.5) .. (379,79.5) .. controls (384.66,79.5) and (392.74,74) .. (398,72.5) .. controls (398.91,72.24) and (396.75,73.93) .. (396,74.5) .. controls (394.08,75.94) and (391.92,77.06) .. (390,78.5) .. controls (386.59,81.06) and (383.45,83.99) .. (380,86.5) .. controls (379.03,87.21) and (375.8,88.5) .. (377,88.5) .. controls (379.43,88.5) and (381.81,87.54) .. (384,86.5) .. controls (388.19,84.52) and (391.92,81.7) .. (396,79.5) .. controls (396.29,79.34) and (397.33,79.5) .. (397,79.5) .. controls (395.06,79.5) and (393.59,81.38) .. (392,82.5) .. controls (387.47,85.7) and (382.57,88.36) .. (378,91.5) .. controls (377.22,92.03) and (375.12,93.83) .. (376,93.5) .. controls (383.15,90.87) and (390.07,87.65) .. (397,84.5) .. controls (397.43,84.3) and (398.33,83.17) .. (398,83.5) .. controls (392.21,89.29) and (387.55,92.77) .. (381,98.5) .. controls (379.72,99.62) and (384.48,98.26) .. (386,97.5) .. controls (391.33,94.83) and (402,89.5) .. (402,89.5) .. controls (402,89.5) and (380.43,105.53) .. (389,101.5) .. controls (394.27,99.02) and (399.72,96.96) .. (405,94.5) .. controls (405.96,94.05) and (402.81,94.83) .. (402,95.5) .. controls (398.55,98.37) and (395.17,101.33) .. (392,104.5) .. controls (389.24,107.26) and (399.33,101.83) .. (403,100.5) .. controls (403.7,100.25) and (405.61,99.07) .. (405,99.5) .. controls (401.04,102.33) and (397.18,105.32) .. (394,108.5) .. controls (390.63,111.87) and (409.86,98.68) .. (407,102.5) .. controls (405.44,104.58) and (403.84,106.66) .. (402,108.5) .. controls (401.67,108.83) and (400.53,109.5) .. (401,109.5) .. controls (402.68,109.5) and (409.31,105.4) .. (412,104.5) .. controls (412.32,104.39) and (411.24,104.26) .. (411,104.5) .. controls (408.49,107.01) and (405.58,109.33) .. (404,112.5) .. controls (403.91,112.69) and (415.29,108.5) .. (415,108.5) .. controls (410.76,108.5) and (409,114.5) .. (406,117.5) .. controls (405.76,117.74) and (404.67,117.5) .. (405,117.5) .. controls (407.84,117.5) and (415.84,112.34) .. (417,113.5) .. controls (417.61,114.11) and (409.78,113.81) .. (409,112.5) .. controls (406.93,109.06) and (410.44,104.49) .. (410,100.5) .. controls (409.87,99.31) and (407.47,99.6) .. (407,98.5) .. controls (405.64,95.32) and (408.57,90.9) .. (408,87.5) .. controls (407.75,86.03) and (405.71,84.96) .. (406,83.5) .. controls (406.21,82.47) and (407.75,81.25) .. (407,80.5) .. controls (405.25,78.75) and (403.58,77.67) .. (399,78.5) .. controls (396.61,78.93) and (392.59,82.85) .. (392,80.5) .. controls (390.73,75.4) and (397.26,69.86) .. (399,65.5) .. controls (400.51,61.72) and (398.48,54.81) .. (402,50.5) .. controls (403.8,48.3) and (409.08,42.42) .. (411,40.5) .. controls (412.37,39.13) and (417.37,38.87) .. (416,37.5) .. controls (414.26,35.76) and (412.89,37.61) .. (412,38.5) ;
%Shape: Free Drawing [id:dp6685682904471533] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=6] [line join = round][line cap = round] (461,33.5) .. controls (458.27,33.5) and (461.27,33.5) .. (464,33.5) .. controls (464.47,33.5) and (463.33,34.17) .. (463,34.5) .. controls (462.76,34.74) and (463.76,34.26) .. (464,34.5) .. controls (465.18,35.68) and (459.33,38.5) .. (461,38.5) .. controls (464.67,38.5) and (469.54,37.65) .. (473,36.5) .. controls (474.41,36.03) and (475.59,34.97) .. (477,34.5) .. controls (477.63,34.29) and (479.6,34.2) .. (479,34.5) .. controls (471.4,38.3) and (462.05,42.82) .. (478,37.5) .. controls (479.14,37.12) and (475.98,38.81) .. (475,39.5) .. controls (472.27,41.41) and (469.36,43.14) .. (467,45.5) .. controls (466.25,46.25) and (462.95,46.5) .. (464,46.5) .. controls (469.5,46.5) and (475.6,39.2) .. (480,42.5) .. controls (483.26,44.94) and (473.43,47.32) .. (470,49.5) .. controls (469.6,49.75) and (468.55,50.65) .. (469,50.5) .. controls (471.27,49.74) and (482,45.5) .. (482,45.5) .. controls (482,45.5) and (470.45,51.05) .. (466,55.5) .. controls (464.28,57.22) and (470.75,54.4) .. (473,53.5) .. controls (478.29,51.38) and (483.6,49.3) .. (489,47.5) .. controls (489.71,47.26) and (491.63,46.1) .. (491,46.5) .. controls (486.71,49.23) and (482.51,52.14) .. (478,54.5) .. controls (475.48,55.82) and (472.55,56.23) .. (470,57.5) .. controls (463.62,60.69) and (479.81,55.76) .. (484,54.5) .. controls (491.05,52.39) and (493.87,48.24) .. (495,50.5) .. controls (495.42,51.34) and (493.79,51.99) .. (493,52.5) .. controls (489.1,55) and (485.18,57.5) .. (481,59.5) .. controls (477.48,61.18) and (473.7,62.27) .. (470,63.5) .. controls (469.55,63.65) and (468.53,64.5) .. (469,64.5) .. controls (476.06,64.5) and (482.3,59.73) .. (489,57.5) .. controls (491.07,56.81) and (491.93,57.19) .. (494,56.5) .. controls (494.71,56.26) and (496.62,55.09) .. (496,55.5) .. controls (489.02,60.15) and (484.86,64.88) .. (477,69.5) .. controls (476.98,69.51) and (467.32,72.82) .. (468,73.5) .. controls (469.91,75.41) and (475.2,70.15) .. (477,69.5) .. controls (482.8,67.39) and (491.06,63.5) .. (497,63.5) .. controls (498.94,63.5) and (493.68,65.51) .. (492,66.5) .. controls (487.12,69.37) and (482.07,71.97) .. (477,74.5) .. controls (474.35,75.83) and (469.26,74.71) .. (468,78.5) .. controls (467.33,80.5) and (472,77.17) .. (474,76.5) .. controls (474.95,76.18) and (476.05,76.82) .. (477,76.5) .. controls (484.11,74.13) and (490.51,68.5) .. (498,68.5) .. controls (502.53,68.5) and (490.9,74.19) .. (487,76.5) .. controls (483.53,78.55) and (479.6,79.7) .. (476,81.5) .. controls (475.58,81.71) and (474.53,82.5) .. (475,82.5) .. controls (480.85,82.5) and (496.62,73.31) .. (499,74.5) .. controls (503.3,76.65) and (491.15,80.07) .. (487,82.5) .. controls (480.44,86.35) and (473.81,90.1) .. (467,93.5) .. controls (464.73,94.64) and (471.64,91.43) .. (474,90.5) .. controls (479.6,88.31) and (485.27,86.31) .. (491,84.5) .. controls (493.01,83.87) and (498.75,81.33) .. (497,82.5) .. controls (490.29,86.98) and (482.08,91.96) .. (475,95.5) .. controls (473.11,96.44) and (469.94,95.61) .. (469,97.5) .. controls (468.4,98.69) and (471.71,97.84) .. (473,97.5) .. controls (481.76,95.16) and (490.4,92.37) .. (499,89.5) .. controls (499.71,89.26) and (501,88.5) .. (501,88.5) .. controls (501,88.5) and (495.22,92.38) .. (495,92.5) .. controls (492.38,93.93) and (489.76,95.38) .. (487,96.5) .. controls (480.46,99.16) and (459.94,103.5) .. (467,103.5) .. controls (474.07,103.5) and (490,96.5) .. (490,96.5) .. controls (490,96.5) and (468.07,106.82) .. (464,107.5) .. controls (461.73,107.88) and (459.95,108.85) .. (458,109.5) .. controls (457,109.83) and (453.98,110.78) .. (455,110.5) .. controls (465.67,107.51) and (477.09,106.46) .. (487,101.5) .. controls (489,100.5) and (483,103.5) .. (481,104.5) .. controls (477,106.5) and (473.19,108.93) .. (469,110.5) .. controls (467.03,111.24) and (460.89,112.5) .. (463,112.5) .. controls (467.94,112.5) and (477.46,109.56) .. (481,108.5) .. controls (485.86,107.04) and (461.92,114.5) .. (467,114.5) ;
%Shape: Circle [id:dp638166009053535] 
\draw   (117,204.43) .. controls (117,176.78) and (139.42,154.37) .. (167.07,154.37) .. controls (194.72,154.37) and (217.13,176.78) .. (217.13,204.43) .. controls (217.13,232.08) and (194.72,254.5) .. (167.07,254.5) .. controls (139.42,254.5) and (117,232.08) .. (117,204.43) -- cycle ;
%Shape: Circle [id:dp3716131532538812] 
\draw   (167.28,204.43) .. controls (167.28,176.78) and (189.69,154.37) .. (217.35,154.37) .. controls (245,154.37) and (267.41,176.78) .. (267.41,204.43) .. controls (267.41,232.08) and (245,254.5) .. (217.35,254.5) .. controls (189.69,254.5) and (167.28,232.08) .. (167.28,204.43) -- cycle ;
%Shape: Free Drawing [id:dp23015248724760407] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=6] [line join = round][line cap = round] (193.9,165.98) .. controls (191.46,167.37) and (189.04,169.43) .. (187.56,171.9) .. controls (187.33,172.28) and (188.42,171.66) .. (188.83,171.48) .. controls (189.93,170.99) and (191.11,170.71) .. (192.21,170.21) .. controls (192.67,170) and (193.02,169.59) .. (193.47,169.36) .. controls (193.76,169.22) and (194.63,168.89) .. (194.32,168.94) .. controls (190.56,169.57) and (188.18,171.64) .. (185.45,174.01) .. controls (184.08,175.19) and (182.68,176.33) .. (181.22,177.39) .. controls (181.11,177.47) and (180.67,177.45) .. (180.8,177.39) .. controls (185.42,175.23) and (190.08,173.13) .. (194.74,171.05) .. controls (195.15,170.87) and (196.45,170.63) .. (196.01,170.63) .. controls (192.07,170.63) and (186.3,176.08) .. (184.18,177.81) .. controls (183.25,178.57) and (182.44,179.46) .. (181.64,180.35) .. controls (181.17,180.87) and (179.75,181.72) .. (180.38,182.04) .. controls (181.29,182.5) and (182.39,181.59) .. (183.33,181.19) .. controls (188.87,178.91) and (194.3,176.36) .. (199.81,174.01) .. controls (199.94,173.96) and (200.37,173.98) .. (200.23,174.01) .. controls (194.44,175.46) and (191.25,177.45) .. (185.45,180.35) .. controls (184.49,180.83) and (183.44,181.12) .. (182.49,181.62) .. controls (181.04,182.39) and (179.71,183.37) .. (178.26,184.15) .. controls (178.14,184.22) and (177.71,184.2) .. (177.84,184.15) .. controls (185.24,181.69) and (192.53,178.91) .. (199.81,176.12) .. controls (200.64,175.81) and (201.58,175.74) .. (202.35,175.28) .. controls (202.52,175.18) and (202.96,174.8) .. (202.77,174.86) .. controls (195.66,177.05) and (188.64,179.93) .. (182.49,184.15) .. controls (180.84,185.29) and (179.35,186.64) .. (177.84,187.95) .. controls (177.16,188.55) and (175.09,189.01) .. (175.73,189.64) .. controls (176.89,190.81) and (180.51,188.7) .. (181.64,188.38) .. controls (188.06,186.54) and (194.05,184.09) .. (200.23,181.62) .. controls (200.81,181.39) and (205.3,180.45) .. (205.3,179.5) .. controls (205.3,178.85) and (201.11,180.76) .. (201.08,180.77) .. controls (199.42,181.29) and (197.69,181.57) .. (196.01,182.04) .. controls (194.16,182.56) and (192.32,183.09) .. (190.52,183.73) .. controls (187.03,184.96) and (183.54,186.21) .. (179.95,187.11) .. controls (179.76,187.16) and (180.19,186.77) .. (180.38,186.69) .. controls (182.18,185.91) and (184.08,185.38) .. (185.87,184.57) .. controls (188.45,183.41) and (190.86,181.86) .. (193.47,180.77) .. controls (195,180.14) and (196.96,180.25) .. (198.12,179.08) .. controls (198.75,178.45) and (199.86,178.64) .. (200.66,178.24) .. controls (200.83,178.15) and (201.27,177.78) .. (201.08,177.81) .. controls (199.41,178.15) and (197.95,179.17) .. (196.43,179.93) .. controls (189.39,183.45) and (181.52,185.66) .. (175.31,190.49) .. controls (174.86,190.84) and (176.44,190.57) .. (177,190.49) .. controls (179.4,190.15) and (181.83,189.86) .. (184.18,189.22) .. controls (191.7,187.18) and (199.12,184.74) .. (206.57,182.46) .. controls (207.13,182.29) and (205.42,182.66) .. (204.88,182.88) .. controls (204.01,183.26) and (203.19,183.73) .. (202.35,184.15) .. controls (199.64,185.5) and (196.95,186.89) .. (194.32,188.38) .. controls (188.76,191.5) and (182.69,196.1) .. (177,198.94) .. controls (176.74,199.07) and (177.58,199.05) .. (177.84,198.94) .. controls (183.66,196.5) and (189.41,193.92) .. (195.16,191.33) .. controls (196.31,190.82) and (197.44,190.26) .. (198.54,189.64) .. controls (199.09,189.34) and (200.85,188.68) .. (200.23,188.8) .. controls (194.97,189.81) and (189.71,190.98) .. (184.6,192.6) .. controls (182.94,193.13) and (181.21,193.39) .. (179.53,193.87) .. controls (179.23,193.96) and (178.42,194.47) .. (178.69,194.29) .. controls (184.06,190.71) and (192.12,188.19) .. (198.12,186.69) .. controls (198.82,186.51) and (204.24,185.18) .. (204.46,185.84) .. controls (204.76,186.74) and (202.7,186.57) .. (201.92,187.11) .. controls (200.86,187.85) and (200.09,188.99) .. (198.97,189.64) .. controls (192.6,193.37) and (186.04,196.75) .. (179.53,200.21) .. controls (179.14,200.42) and (177.82,200.63) .. (178.26,200.63) .. controls (179.78,200.63) and (181.08,199.5) .. (182.49,198.94) .. controls (189.6,196.09) and (197.2,194.64) .. (204.46,192.18) .. controls (205.9,191.69) and (210.04,189.81) .. (208.68,190.49) .. controls (199.98,194.84) and (192.74,197.69) .. (184.18,202.74) .. controls (182.98,203.45) and (181.49,203.6) .. (180.38,204.43) .. controls (179.8,204.86) and (181.8,204.2) .. (182.49,204.01) .. controls (184.31,203.51) and (185.84,202.25) .. (187.56,201.48) .. controls (191.54,199.68) and (195.71,198.34) .. (199.81,196.83) .. controls (201.95,196.04) and (204.06,195.2) .. (206.15,194.29) .. controls (206.33,194.21) and (206.75,193.79) .. (206.57,193.87) .. controls (201.2,196.29) and (195.7,199.07) .. (190.52,201.9) .. controls (188.63,202.93) and (186.88,204.19) .. (185.02,205.28) .. controls (184.42,205.63) and (182.63,206.55) .. (183.33,206.55) .. controls (187.09,206.55) and (203.6,199.12) .. (204.04,198.94) .. controls (205.62,198.31) and (207.08,197.4) .. (208.68,196.83) .. controls (209.08,196.69) and (210.37,196.83) .. (209.95,196.83) .. controls (208.75,196.83) and (207.65,197.56) .. (206.57,198.1) .. controls (200.71,201.02) and (194.71,203.67) .. (188.83,206.55) .. controls (186.67,207.6) and (179.98,210.36) .. (180.8,210.77) .. controls (181.96,211.35) and (183.37,210.34) .. (184.6,209.93) .. controls (187.86,208.84) and (191.1,207.73) .. (194.32,206.55) .. controls (198.88,204.87) and (203.61,203.72) .. (208.26,202.32) .. controls (210.75,201.57) and (213.19,199.79) .. (215.87,199.79) .. controls (216.07,199.79) and (215.64,200.15) .. (215.44,200.21) .. controls (214.62,200.44) and (213.7,200.3) .. (212.91,200.63) .. controls (211.28,201.3) and (209.88,202.47) .. (208.26,203.17) .. controls (201.92,205.9) and (195.36,208.1) .. (188.83,210.35) .. controls (186.5,211.15) and (182.07,213.31) .. (182.07,213.31) .. controls (182.07,213.31) and (183.17,212.43) .. (183.76,212.04) .. controls (186.73,210.05) and (189.73,208.56) .. (193.05,206.97) .. controls (194.87,206.1) and (199.97,205.86) .. (198.54,204.43) .. controls (197.23,203.12) and (194.83,204.77) .. (193.05,205.28) .. controls (188.93,206.45) and (182.35,209.5) .. (177.84,209.5) .. controls (177.7,209.5) and (178.14,209.57) .. (178.26,209.5) .. controls (180.13,208.44) and (181.89,207.19) .. (183.76,206.12) .. controls (184.85,205.5) and (186.03,205.04) .. (187.14,204.43) .. controls (189.13,203.35) and (191.05,202.12) .. (193.05,201.05) .. controls (193.18,200.99) and (193.62,201.05) .. (193.47,201.05) .. controls (187.99,201.05) and (182.92,203.16) .. (177.84,204.43) .. controls (177.39,204.54) and (174.04,205.28) .. (174.04,205.28) .. controls (174.04,205.28) and (177.8,204.08) .. (179.53,203.17) .. controls (182.59,201.54) and (185.69,199.98) .. (188.83,198.52) .. controls (189.23,198.33) and (188,198.88) .. (187.56,198.94) .. controls (186.72,199.04) and (185.86,198.86) .. (185.02,198.94) .. controls (181.95,199.25) and (177.33,200.07) .. (174.88,200.63) .. controls (174.02,200.83) and (171.62,201.98) .. (172.35,201.48) .. controls (175.68,199.16) and (179.53,197.49) .. (182.49,194.71) .. controls (182.82,194.4) and (177.65,195.76) .. (177,195.98) .. controls (176.32,196.21) and (174.17,196.35) .. (174.88,196.4) .. controls (177.27,196.6) and (179.68,196.28) .. (182.07,196.4) .. controls (182.38,196.42) and (182.95,196.51) .. (182.91,196.83) .. controls (182.84,197.5) and (180.98,201.96) .. (180.38,202.74) .. controls (178.55,205.11) and (176.51,207.32) .. (174.46,209.5) .. controls (174.36,209.61) and (173.9,209.5) .. (174.04,209.5) .. controls (177.8,209.5) and (181.01,207.57) .. (184.6,206.97) .. controls (184.74,206.94) and (184.3,206.89) .. (184.18,206.97) .. controls (182.59,208.03) and (181.13,209.29) .. (179.53,210.35) .. controls (178.16,211.26) and (176.62,211.9) .. (175.31,212.88) .. controls (175.08,213.05) and (175.88,212.94) .. (176.15,212.88) .. controls (177.15,212.66) and (178.16,212.42) .. (179.11,212.04) .. controls (180.98,211.29) and (182.72,210.23) .. (184.6,209.5) .. controls (185.08,209.32) and (183.74,210.04) .. (183.33,210.35) .. controls (180.85,212.21) and (178.6,214.36) .. (176.15,216.26) .. controls (176.04,216.35) and (175.59,216.3) .. (175.73,216.26) .. controls (182.95,214.46) and (189.64,211.47) .. (196.85,209.5) .. controls (198.03,209.18) and (192.9,211.87) .. (186.29,215.84) .. controls (183.79,217.34) and (179.78,219.37) .. (177.42,221.33) .. controls (177.31,221.42) and (177.71,221.38) .. (177.84,221.33) .. controls (184.9,218.56) and (191.98,215.83) .. (198.97,212.88) .. controls (199.63,212.6) and (197.51,213.02) .. (196.85,213.31) .. controls (194.26,214.44) and (191.81,215.89) .. (189.25,217.11) .. controls (188.56,217.44) and (187.81,217.61) .. (187.14,217.95) .. controls (186.96,218.04) and (186.54,218.46) .. (186.71,218.38) .. controls (192.02,215.72) and (196.95,212.01) .. (202.35,209.5) .. controls (204.41,208.54) and (207.67,209) .. (208.68,206.97) .. controls (209.01,206.33) and (207.22,207.09) .. (206.57,207.39) .. controls (204.46,208.37) and (202.27,209.22) .. (200.23,210.35) .. controls (199.52,210.75) and (197.39,211.98) .. (198.12,211.62) .. controls (199.65,210.85) and (207.83,205.7) .. (209.11,206.12) .. controls (210.07,206.44) and (207.45,207.3) .. (206.57,207.81) .. controls (205.76,208.29) and (204.83,208.57) .. (204.04,209.08) .. controls (200.91,211.07) and (197.97,212.98) .. (194.74,215) .. controls (194.1,215.4) and (191.91,216.07) .. (192.63,215.84) .. controls (195.01,215.1) and (197.4,214.36) .. (199.81,213.73) .. controls (210.07,211.05) and (193.6,216.75) .. (190.52,219.22) .. controls (189.92,219.69) and (191.9,218.59) .. (192.63,218.38) .. controls (193.45,218.13) and (194.36,218.26) .. (195.16,217.95) .. controls (197.34,217.12) and (199.36,215.91) .. (201.5,215) .. controls (202.57,214.54) and (204.88,214.15) .. (204.88,214.15) .. controls (204.88,214.15) and (204.29,214.02) .. (204.04,214.15) .. controls (199.52,216.41) and (195.22,219.08) .. (190.94,221.76) .. controls (189.91,222.4) and (186.82,224.2) .. (187.98,223.87) .. controls (192.19,222.67) and (204.36,217.5) .. (206.15,216.69) .. controls (206.72,216.43) and (208.44,215.64) .. (207.84,215.84) .. controls (201.15,218.07) and (195.05,221.81) .. (188.83,225.14) .. controls (188.27,225.43) and (186.53,226.13) .. (187.14,225.98) .. controls (191.69,224.84) and (200.62,219.64) .. (204.04,219.64) .. controls (204.62,219.64) and (202.86,219.79) .. (202.35,220.07) .. controls (197.67,222.6) and (193.38,225.78) .. (188.83,228.52) .. controls (188.12,228.94) and (185.98,229.42) .. (186.71,229.78) .. controls (187.33,230.09) and (189.71,228.66) .. (190.09,228.52) .. controls (193.17,227.33) and (196.32,226.33) .. (199.39,225.14) .. controls (201.39,224.36) and (203.36,223.51) .. (205.3,222.6) .. controls (205.48,222.52) and (205.93,222.18) .. (205.73,222.18) .. controls (201.95,222.18) and (198.28,226.33) .. (195.59,228.09) .. controls (193.32,229.58) and (190.71,230.48) .. (188.4,231.9) .. controls (187.49,232.46) and (190.35,230.99) .. (191.36,230.63) .. controls (194,229.67) and (196.8,229.17) .. (199.39,228.09) .. controls (200.22,227.75) and (200.68,226.76) .. (201.5,226.4) .. controls (203.01,225.74) and (207.73,224.26) .. (206.15,224.71) .. controls (202.62,225.72) and (190.19,232.4) .. (188.4,234.01) .. controls (187.44,234.88) and (190.98,233.59) .. (192.21,233.16) .. controls (196.97,231.52) and (197.97,230.54) .. (201.5,229.36) .. controls (201.64,229.32) and (201.21,229.3) .. (201.08,229.36) .. controls (199.78,230.01) and (198.48,230.66) .. (197.28,231.47) .. controls (194.95,233.06) and (192.69,234.75) .. (190.52,236.54) .. controls (189.97,236.99) and (188.16,238.03) .. (188.83,237.81) .. controls (193.71,236.18) and (199.47,234.16) .. (203.61,231.05) .. controls (204.22,230.6) and (202.12,231.46) .. (201.5,231.9) .. controls (200.6,232.53) and (199.89,233.41) .. (198.97,234.01) .. controls (198.3,234.44) and (190.52,238.5) .. (190.52,239.08) .. controls (190.52,240.24) and (192.77,238.5) .. (193.9,238.23) .. controls (195.92,237.76) and (198.24,238.11) .. (199.81,236.54) .. controls (200.03,236.32) and (199.21,236.77) .. (198.97,236.97) .. controls (198.5,237.34) and (198.16,237.85) .. (197.7,238.23) .. controls (196.3,239.4) and (194.63,240.82) .. (193.05,241.61) .. controls (192.32,241.98) and (194.45,240.75) .. (195.16,240.35) .. controls (196.32,239.69) and (197.46,239) .. (198.54,238.23) .. controls (201.01,236.51) and (197.39,237.58) .. (197.7,236.97) .. controls (198.84,234.68) and (203.71,230.76) .. (204.04,229.78) .. controls (204.4,228.68) and (198.78,230.56) .. (198.54,230.63) .. controls (198.24,230.71) and (197.7,231.37) .. (197.7,231.05) .. controls (197.7,229.79) and (199.99,229.99) .. (201.08,229.36) .. controls (202.3,228.66) and (203.29,227.61) .. (204.46,226.83) .. controls (205.26,226.29) and (206.19,225.67) .. (206.99,225.14) .. controls (207.11,225.06) and (207.56,225.14) .. (207.42,225.14) .. controls (206.37,225.14) and (204.56,225.2) .. (203.61,225.56) .. controls (201.73,226.27) and (199.98,227.3) .. (198.12,228.09) .. controls (197.99,228.15) and (197.57,228.16) .. (197.7,228.09) .. controls (202.07,225.91) and (208.21,223.92) .. (211.64,220.49) .. controls (212.24,219.89) and (209.93,220.31) .. (209.11,220.49) .. controls (207.23,220.88) and (205.43,221.57) .. (203.61,222.18) .. controls (202.74,222.47) and (202.38,222.31) .. (201.5,222.6) .. controls (201.31,222.66) and (200.94,223.16) .. (201.08,223.02) .. controls (203.65,220.45) and (207.22,219.08) .. (209.95,216.69) .. controls (210.91,215.85) and (207.32,216.6) .. (206.15,217.11) .. controls (204.53,217.82) and (203.1,218.9) .. (201.5,219.64) .. controls (200.98,219.89) and (200.36,219.88) .. (199.81,220.07) .. controls (199.51,220.17) and (198.72,220.69) .. (198.97,220.49) .. controls (203.08,217.06) and (204.07,217.14) .. (209.53,213.73) .. controls (209.91,213.49) and (208.69,214.04) .. (208.26,214.15) .. controls (206.97,214.47) and (205.65,214.82) .. (204.46,215.42) .. controls (204.28,215.51) and (203.9,215.98) .. (204.04,215.84) .. controls (204.89,214.98) and (205.98,214.38) .. (206.99,213.73) .. controls (208.44,212.8) and (209.88,211.84) .. (211.22,210.77) .. controls (211.38,210.65) and (211.84,210.35) .. (211.64,210.35) .. controls (209.75,210.35) and (207.42,211.74) .. (205.73,212.46) .. controls (204.51,212.98) and (201.24,215.31) .. (202.35,214.57) .. controls (204.96,212.83) and (207.76,210.82) .. (210.37,209.08) .. controls (210.49,209) and (210.08,209.02) .. (209.95,209.08) .. controls (207.11,210.43) and (204.26,211.78) .. (201.5,213.31) .. controls (195.96,216.37) and (191.26,220.21) .. (185.87,223.45) .. controls (184.39,224.33) and (181.11,224.94) .. (180.38,226.4) .. controls (180.31,226.53) and (180.67,226.45) .. (180.8,226.4) .. controls (181.77,226.08) and (182.86,225.21) .. (183.76,224.71) .. controls (186.19,223.37) and (189.2,222.66) .. (190.94,220.49) .. controls (191.73,219.5) and (188.39,220.28) .. (187.14,220.49) .. controls (184.81,220.88) and (180.38,220.24) .. (180.38,222.6) .. controls (180.38,222.62) and (188.94,218.2) .. (189.25,217.95) .. controls (189.6,217.67) and (188.33,218.1) .. (187.98,218.38) .. controls (186.89,219.25) and (186.05,220.38) .. (185.02,221.33) .. controls (184.07,222.22) and (181.15,222.95) .. (182.07,223.87) .. controls (183.37,225.17) and (187.37,223.02) .. (189.25,223.02) .. controls (191.25,223.02) and (186.74,226.14) .. (185.45,227.67) .. controls (185.06,228.13) and (183.6,229.08) .. (184.18,228.94) .. controls (185.44,228.62) and (186.73,228.43) .. (187.98,228.09) .. controls (188.29,228.01) and (188.83,227.36) .. (188.83,227.67) .. controls (188.83,229.08) and (186.68,229.52) .. (185.45,230.21) .. controls (184.83,230.55) and (183.07,231.61) .. (183.76,231.47) .. controls (185.2,231.18) and (186.6,230.71) .. (187.98,230.21) .. controls (188.17,230.14) and (188.54,229.64) .. (188.4,229.78) .. controls (186.95,231.24) and (184.28,232.95) .. (183.33,234.85) .. controls (183.07,235.37) and (184.47,234.61) .. (185.02,234.43) .. controls (185.07,234.41) and (188.4,233.31) .. (188.4,233.16) .. controls (188.4,232.06) and (184.86,235.72) .. (185.87,235.28) .. controls (187.26,234.66) and (188.66,234.07) .. (190.09,233.59) .. controls (190.52,233.44) and (189.67,234.41) .. (189.67,234.85) .. controls (189.67,234.99) and (189.58,234.54) .. (189.67,234.43) .. controls (190.45,233.52) and (191.47,232.84) .. (192.21,231.9) .. controls (195.81,227.26) and (195.45,226.77) .. (198.12,221.76) .. controls (199.13,219.86) and (201.6,218.15) .. (200.23,215.42) .. controls (199.66,214.28) and (195.96,216.18) .. (196.43,215) .. controls (197.49,212.35) and (200.24,210.54) .. (201.08,207.81) .. controls (201.29,207.14) and (199.58,207.47) .. (198.97,207.81) .. controls (196.95,208.93) and (195.26,210.58) .. (193.47,212.04) .. controls (193.23,212.24) and (194.06,211.79) .. (194.32,211.62) .. controls (195.89,210.52) and (199.49,210.08) .. (198.97,208.24) .. controls (198.46,206.47) and (195.3,208.43) .. (193.47,208.66) .. controls (190.92,208.98) and (188.43,210.1) .. (185.87,209.93) .. controls (185.45,209.9) and (185.74,209.06) .. (185.87,208.66) .. controls (186.48,206.84) and (188.68,206.81) .. (187.98,204.01) .. controls (187.81,203.33) and (186.54,203.8) .. (185.87,204.01) .. controls (182.18,205.16) and (181.2,206.55) .. (178.26,206.55) .. controls (177.76,206.55) and (179.13,206) .. (179.53,205.7) .. controls (180.34,205.09) and (181.16,204.46) .. (182.07,204.01) .. controls (182.07,204.01) and (196.18,199.02) .. (198.12,195.14) .. controls (198.82,193.75) and (195.01,194.93) .. (193.47,194.71) .. controls (192.66,194.6) and (194.88,193.86) .. (195.59,193.45) .. controls (198.09,191.98) and (200.44,190.14) .. (203.19,189.22) .. controls (203.31,189.18) and (207.08,188.04) .. (206.57,187.53) .. controls (206.42,187.39) and (205.8,187.89) .. (205.73,187.95) .. controls (204.4,189.01) and (202.52,189.75) .. (201.92,191.33) .. controls (201.63,192.13) and (203.62,191.4) .. (204.46,191.33) .. controls (205.59,191.25) and (206.73,191.16) .. (207.84,190.91) .. controls (208.27,190.82) and (209.31,190.09) .. (209.11,190.49) .. controls (208.39,191.92) and (206.61,192.54) .. (205.73,193.87) .. controls (205.31,194.5) and (207.09,193.15) .. (207.84,193.02) .. controls (209.02,192.83) and (209.64,192.89) .. (210.8,192.6) .. controls (211.1,192.53) and (211.88,191.97) .. (211.64,192.18) .. controls (209.82,193.74) and (207.85,195.13) .. (206.15,196.83) .. controls (205.61,197.36) and (207.54,196.22) .. (208.26,195.98) .. controls (209.11,195.7) and (210,195.54) .. (210.8,195.14) .. controls (210.92,195.07) and (211.22,195.14) .. (211.22,195.14) .. controls (211.22,195.14) and (205.3,199.36) .. (205.3,199.36) .. controls (205.86,199.92) and (210.09,198.8) .. (210.8,198.1) .. controls (211.74,197.16) and (208.56,199.52) .. (207.42,200.21) .. controls (206.34,200.86) and (205.02,201.11) .. (204.04,201.9) .. controls (203.71,202.16) and (204.89,201.98) .. (205.3,201.9) .. controls (206.18,201.72) and (207.01,201.38) .. (207.84,201.05) .. controls (208.02,200.98) and (208.44,200.54) .. (208.26,200.63) .. controls (200.15,204.69) and (192,207.57) .. (184.18,212.04) .. controls (181.96,213.31) and (178.34,214.24) .. (176.57,215.42) .. controls (176.2,215.66) and (175.7,215.64) .. (175.31,215.84) .. controls (174.91,216.04) and (173.59,216.26) .. (174.04,216.26) .. controls (177.14,216.26) and (179.69,214.15) .. (182.91,214.15) .. controls (183.19,214.15) and (182.32,214.02) .. (182.07,214.15) .. controls (179.7,215.33) and (177.15,216.17) .. (174.88,217.53) .. controls (174.64,217.68) and (175.08,218.18) .. (174.88,218.38) .. controls (174.68,218.57) and (173.77,218.45) .. (174.04,218.38) .. controls (176.27,217.74) and (178.57,217.32) .. (180.8,216.69) .. controls (181.53,216.48) and (182.19,216.08) .. (182.91,215.84) .. controls (189.25,213.73) and (177.71,217.4) .. (177.42,217.53) .. controls (176.28,218.06) and (175.3,219.22) .. (174.04,219.22) .. controls (173.9,219.22) and (174.34,219.28) .. (174.46,219.22) .. controls (175.48,218.71) and (176.48,218.17) .. (177.42,217.53) .. controls (178.61,216.71) and (187.24,210.92) .. (187.56,209.08) .. controls (188.13,205.75) and (187.69,202.32) .. (187.56,198.94) .. controls (187.53,198.08) and (187.95,196.68) .. (187.14,196.4) .. controls (183.44,195.17) and (181.96,196.84) .. (178.69,197.25) .. controls (177.79,197.36) and (179.81,195.84) .. (180.38,195.14) .. controls (181.57,193.64) and (182.81,192.15) .. (183.76,190.49) .. controls (183.97,190.12) and (184.13,189.41) .. (183.76,189.22) .. controls (182.74,188.71) and (181.47,189.33) .. (180.38,189.64) .. controls (177.29,190.54) and (174.14,191.2) .. (171.08,192.18) .. controls (170.48,192.37) and (168.76,193.02) .. (169.39,193.02) .. controls (172.07,193.02) and (173.96,190.49) .. (176.57,190.49) ;
%Shape: Free Drawing [id:dp8157206335475502] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=6] [line join = round][line cap = round] (168,164.5) .. controls (165.21,165.06) and (162.55,166.23) .. (160,167.5) .. controls (158.39,168.3) and (153.2,169.5) .. (155,169.5) .. controls (157.85,169.5) and (164.27,163.95) .. (163,166.5) .. controls (160.19,172.13) and (150.77,171.47) .. (147,176.5) .. controls (100.94,237.91) and (159,166.9) .. (159,172.5) .. controls (159,177.59) and (149.6,184.1) .. (146,180.5) .. controls (142.79,177.29) and (155.75,176.86) .. (157,172.5) .. controls (157.83,169.6) and (148,176.52) .. (148,173.5) .. controls (148,169.89) and (160.42,168.64) .. (157,167.5) .. controls (154,166.5) and (146.37,173.21) .. (148,170.5) .. controls (150.07,167.06) and (153.87,165.01) .. (157,162.5) .. controls (157.26,162.29) and (158.32,162.39) .. (158,162.5) .. controls (152.19,164.44) and (146.6,167.01) .. (141,169.5) .. controls (138.11,170.78) and (151.59,163.77) .. (150,166.5) .. controls (147.58,170.65) and (144.79,174.59) .. (142,178.5) .. controls (141.45,179.27) and (139.58,179.66) .. (140,180.5) .. controls (141.3,183.09) and (155.03,173.47) .. (151,177.5) .. controls (145.77,182.73) and (138.23,185.27) .. (133,190.5) .. controls (130.36,193.14) and (139.62,187.08) .. (143,185.5) .. controls (144.63,184.74) and (149.56,182.61) .. (148,183.5) .. controls (144.12,185.72) and (139.83,187.2) .. (136,189.5) .. controls (134.79,190.23) and (131.69,193.03) .. (133,192.5) .. controls (133.06,192.48) and (147,184.36) .. (147,182.5) .. controls (147,180.56) and (140.26,184.63) .. (142,185.5) .. controls (142.88,185.94) and (153.5,181.25) .. (155,180.5) .. controls (157.08,179.46) and (151.55,182.65) .. (152,182.5) .. controls (156.4,181.03) and (165,177.5) .. (165,177.5) .. controls (165,177.5) and (159,182.17) .. (156,184.5) .. controls (151.26,188.19) and (146.1,191.32) .. (141,194.5) .. controls (140.11,195.06) and (136.98,195.76) .. (138,195.5) .. controls (139.65,195.09) and (141.42,195.13) .. (143,194.5) .. controls (144.8,193.78) and (146.22,192.28) .. (148,191.5) .. controls (151.57,189.94) and (155.38,188.95) .. (159,187.5) .. controls (159.31,187.38) and (158.29,187.34) .. (158,187.5) .. controls (151.41,191.05) and (136.8,197.65) .. (129,203.5) .. controls (128.85,203.61) and (127,205.5) .. (127,205.5) .. controls (127,205.5) and (127.98,202.59) .. (129,201.5) .. controls (132.86,197.36) and (138.82,194.72) .. (141,189.5) .. controls (141.9,187.35) and (136.32,189.29) .. (134,189.5) .. controls (130.91,189.78) and (124.03,193.49) .. (122,194.5) .. controls (121.7,194.65) and (122.71,194.67) .. (123,194.5) .. controls (126.09,192.64) and (129.12,190.66) .. (132,188.5) .. controls (133.53,187.35) and (141.58,180.08) .. (142,180.5) .. controls (142.24,180.74) and (142.21,181.24) .. (142,181.5) .. controls (138.09,186.39) and (130.95,192.34) .. (128,197.5) .. controls (127.5,198.37) and (130.05,197.82) .. (131,197.5) .. controls (135.11,196.13) and (143,188.17) .. (143,192.5) .. controls (143,198.16) and (135.13,200.63) .. (131,204.5) .. controls (129.78,205.64) and (128.18,206.32) .. (127,207.5) .. controls (126.33,208.17) and (124.11,209.8) .. (125,209.5) .. controls (125.14,209.45) and (143,205.2) .. (143,202.5) .. controls (143,196.03) and (132.09,209.5) .. (127,213.5) .. controls (125.83,214.42) and (129.67,212.17) .. (131,211.5) .. controls (132.37,210.81) and (145.7,205.57) .. (148,202.5) .. controls (148.6,201.7) and (145.89,202.05) .. (145,202.5) .. controls (141.6,204.2) and (138.11,206.34) .. (136,209.5) .. controls (135.24,210.64) and (138.82,209.21) .. (140,208.5) .. controls (143.89,206.17) and (147.23,203.01) .. (151,200.5) .. controls (152.86,199.26) and (163,193.5) .. (163,193.5) .. controls (163,193.5) and (161.59,193.19) .. (161,193.5) .. controls (154.52,196.9) and (148.09,200.44) .. (142,204.5) .. controls (140.04,205.81) and (138.81,207.99) .. (137,209.5) .. controls (136.74,209.71) and (135.67,209.5) .. (136,209.5) .. controls (141.66,209.5) and (149.74,204) .. (155,202.5) .. controls (155.91,202.24) and (153.75,203.93) .. (153,204.5) .. controls (151.08,205.94) and (148.92,207.06) .. (147,208.5) .. controls (143.59,211.06) and (140.45,213.99) .. (137,216.5) .. controls (136.03,217.21) and (132.8,218.5) .. (134,218.5) .. controls (136.43,218.5) and (138.81,217.54) .. (141,216.5) .. controls (145.19,214.52) and (148.92,211.7) .. (153,209.5) .. controls (153.29,209.34) and (154.33,209.5) .. (154,209.5) .. controls (152.06,209.5) and (150.59,211.38) .. (149,212.5) .. controls (144.47,215.7) and (139.57,218.36) .. (135,221.5) .. controls (134.22,222.03) and (132.12,223.83) .. (133,223.5) .. controls (140.15,220.87) and (147.07,217.65) .. (154,214.5) .. controls (154.43,214.3) and (155.33,213.17) .. (155,213.5) .. controls (149.21,219.29) and (144.55,222.77) .. (138,228.5) .. controls (136.72,229.62) and (141.48,228.26) .. (143,227.5) .. controls (148.33,224.83) and (159,219.5) .. (159,219.5) .. controls (159,219.5) and (137.43,235.53) .. (146,231.5) .. controls (151.27,229.02) and (156.72,226.96) .. (162,224.5) .. controls (162.96,224.05) and (159.81,224.83) .. (159,225.5) .. controls (155.55,228.37) and (152.17,231.33) .. (149,234.5) .. controls (146.24,237.26) and (156.33,231.83) .. (160,230.5) .. controls (160.7,230.25) and (162.61,229.07) .. (162,229.5) .. controls (158.04,232.33) and (154.18,235.32) .. (151,238.5) .. controls (147.63,241.87) and (166.86,228.68) .. (164,232.5) .. controls (162.44,234.58) and (160.84,236.66) .. (159,238.5) .. controls (158.67,238.83) and (157.53,239.5) .. (158,239.5) .. controls (159.68,239.5) and (166.31,235.4) .. (169,234.5) .. controls (169.32,234.39) and (168.24,234.26) .. (168,234.5) .. controls (165.49,237.01) and (162.58,239.33) .. (161,242.5) .. controls (160.91,242.69) and (172.29,238.5) .. (172,238.5) .. controls (167.76,238.5) and (166,244.5) .. (163,247.5) .. controls (162.76,247.74) and (161.67,247.5) .. (162,247.5) .. controls (164.84,247.5) and (172.84,242.34) .. (174,243.5) .. controls (174.61,244.11) and (166.78,243.81) .. (166,242.5) .. controls (163.93,239.06) and (167.44,234.49) .. (167,230.5) .. controls (166.87,229.31) and (164.47,229.6) .. (164,228.5) .. controls (162.64,225.32) and (165.57,220.9) .. (165,217.5) .. controls (164.75,216.03) and (162.71,214.96) .. (163,213.5) .. controls (163.21,212.47) and (164.75,211.25) .. (164,210.5) .. controls (162.25,208.75) and (160.58,207.67) .. (156,208.5) .. controls (153.61,208.93) and (149.59,212.85) .. (149,210.5) .. controls (147.73,205.4) and (154.26,199.86) .. (156,195.5) .. controls (157.51,191.72) and (155.48,184.81) .. (159,180.5) .. controls (160.8,178.3) and (166.08,172.42) .. (168,170.5) .. controls (169.37,169.13) and (174.37,168.87) .. (173,167.5) .. controls (171.26,165.76) and (169.89,167.61) .. (169,168.5) ;
%Shape: Circle [id:dp12065054841945821] 
\draw   (362,206.43) .. controls (362,178.78) and (384.42,156.37) .. (412.07,156.37) .. controls (439.72,156.37) and (462.13,178.78) .. (462.13,206.43) .. controls (462.13,234.08) and (439.72,256.5) .. (412.07,256.5) .. controls (384.42,256.5) and (362,234.08) .. (362,206.43) -- cycle ;
%Shape: Circle [id:dp6182840523741042] 
\draw   (412.28,206.43) .. controls (412.28,178.78) and (434.69,156.37) .. (462.35,156.37) .. controls (490,156.37) and (512.41,178.78) .. (512.41,206.43) .. controls (512.41,234.08) and (490,256.5) .. (462.35,256.5) .. controls (434.69,256.5) and (412.28,234.08) .. (412.28,206.43) -- cycle ;
%Shape: Free Drawing [id:dp8450866967457007] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=6] [line join = round][line cap = round] (438.9,167.98) .. controls (436.46,169.37) and (434.04,171.43) .. (432.56,173.9) .. controls (432.33,174.28) and (433.42,173.66) .. (433.83,173.48) .. controls (434.93,172.99) and (436.11,172.71) .. (437.21,172.21) .. controls (437.67,172) and (438.02,171.59) .. (438.47,171.36) .. controls (438.76,171.22) and (439.63,170.89) .. (439.32,170.94) .. controls (435.56,171.57) and (433.18,173.64) .. (430.45,176.01) .. controls (429.08,177.19) and (427.68,178.33) .. (426.22,179.39) .. controls (426.11,179.47) and (425.67,179.45) .. (425.8,179.39) .. controls (430.42,177.23) and (435.08,175.13) .. (439.74,173.05) .. controls (440.15,172.87) and (441.45,172.63) .. (441.01,172.63) .. controls (437.07,172.63) and (431.3,178.08) .. (429.18,179.81) .. controls (428.25,180.57) and (427.44,181.46) .. (426.64,182.35) .. controls (426.17,182.87) and (424.75,183.72) .. (425.38,184.04) .. controls (426.29,184.5) and (427.39,183.59) .. (428.33,183.19) .. controls (433.87,180.91) and (439.3,178.36) .. (444.81,176.01) .. controls (444.94,175.96) and (445.37,175.98) .. (445.23,176.01) .. controls (439.44,177.46) and (436.25,179.45) .. (430.45,182.35) .. controls (429.49,182.83) and (428.44,183.12) .. (427.49,183.62) .. controls (426.04,184.39) and (424.71,185.37) .. (423.26,186.15) .. controls (423.14,186.22) and (422.71,186.2) .. (422.84,186.15) .. controls (430.24,183.69) and (437.53,180.91) .. (444.81,178.12) .. controls (445.64,177.81) and (446.58,177.74) .. (447.35,177.28) .. controls (447.52,177.18) and (447.96,176.8) .. (447.77,176.86) .. controls (440.66,179.05) and (433.64,181.93) .. (427.49,186.15) .. controls (425.84,187.29) and (424.35,188.64) .. (422.84,189.95) .. controls (422.16,190.55) and (420.09,191.01) .. (420.73,191.64) .. controls (421.89,192.81) and (425.51,190.7) .. (426.64,190.38) .. controls (433.06,188.54) and (439.05,186.09) .. (445.23,183.62) .. controls (445.81,183.39) and (450.3,182.45) .. (450.3,181.5) .. controls (450.3,180.85) and (446.11,182.76) .. (446.08,182.77) .. controls (444.42,183.29) and (442.69,183.57) .. (441.01,184.04) .. controls (439.16,184.56) and (437.32,185.09) .. (435.52,185.73) .. controls (432.03,186.96) and (428.54,188.21) .. (424.95,189.11) .. controls (424.76,189.16) and (425.19,188.77) .. (425.38,188.69) .. controls (427.18,187.91) and (429.08,187.38) .. (430.87,186.57) .. controls (433.45,185.41) and (435.86,183.86) .. (438.47,182.77) .. controls (440,182.14) and (441.96,182.25) .. (443.12,181.08) .. controls (443.75,180.45) and (444.86,180.64) .. (445.66,180.24) .. controls (445.83,180.15) and (446.27,179.78) .. (446.08,179.81) .. controls (444.41,180.15) and (442.95,181.17) .. (441.43,181.93) .. controls (434.39,185.45) and (426.52,187.66) .. (420.31,192.49) .. controls (419.86,192.84) and (421.44,192.57) .. (422,192.49) .. controls (424.4,192.15) and (426.83,191.86) .. (429.18,191.22) .. controls (436.7,189.18) and (444.12,186.74) .. (451.57,184.46) .. controls (452.13,184.29) and (450.42,184.66) .. (449.88,184.88) .. controls (449.01,185.26) and (448.19,185.73) .. (447.35,186.15) .. controls (444.64,187.5) and (441.95,188.89) .. (439.32,190.38) .. controls (433.76,193.5) and (427.69,198.1) .. (422,200.94) .. controls (421.74,201.07) and (422.58,201.05) .. (422.84,200.94) .. controls (428.66,198.5) and (434.41,195.92) .. (440.16,193.33) .. controls (441.31,192.82) and (442.44,192.26) .. (443.54,191.64) .. controls (444.09,191.34) and (445.85,190.68) .. (445.23,190.8) .. controls (439.97,191.81) and (434.71,192.98) .. (429.6,194.6) .. controls (427.94,195.13) and (426.21,195.39) .. (424.53,195.87) .. controls (424.23,195.96) and (423.42,196.47) .. (423.69,196.29) .. controls (429.06,192.71) and (437.12,190.19) .. (443.12,188.69) .. controls (443.82,188.51) and (449.24,187.18) .. (449.46,187.84) .. controls (449.76,188.74) and (447.7,188.57) .. (446.92,189.11) .. controls (445.86,189.85) and (445.09,190.99) .. (443.97,191.64) .. controls (437.6,195.37) and (431.04,198.75) .. (424.53,202.21) .. controls (424.14,202.42) and (422.82,202.63) .. (423.26,202.63) .. controls (424.78,202.63) and (426.08,201.5) .. (427.49,200.94) .. controls (434.6,198.09) and (442.2,196.64) .. (449.46,194.18) .. controls (450.9,193.69) and (455.04,191.81) .. (453.68,192.49) .. controls (444.98,196.84) and (437.74,199.69) .. (429.18,204.74) .. controls (427.98,205.45) and (426.49,205.6) .. (425.38,206.43) .. controls (424.8,206.86) and (426.8,206.2) .. (427.49,206.01) .. controls (429.31,205.51) and (430.84,204.25) .. (432.56,203.48) .. controls (436.54,201.68) and (440.71,200.34) .. (444.81,198.83) .. controls (446.95,198.04) and (449.06,197.2) .. (451.15,196.29) .. controls (451.33,196.21) and (451.75,195.79) .. (451.57,195.87) .. controls (446.2,198.29) and (440.7,201.07) .. (435.52,203.9) .. controls (433.63,204.93) and (431.88,206.19) .. (430.02,207.28) .. controls (429.42,207.63) and (427.63,208.55) .. (428.33,208.55) .. controls (432.09,208.55) and (448.6,201.12) .. (449.04,200.94) .. controls (450.62,200.31) and (452.08,199.4) .. (453.68,198.83) .. controls (454.08,198.69) and (455.37,198.83) .. (454.95,198.83) .. controls (453.75,198.83) and (452.65,199.56) .. (451.57,200.1) .. controls (445.71,203.02) and (439.71,205.67) .. (433.83,208.55) .. controls (431.67,209.6) and (424.98,212.36) .. (425.8,212.77) .. controls (426.96,213.35) and (428.37,212.34) .. (429.6,211.93) .. controls (432.86,210.84) and (436.1,209.73) .. (439.32,208.55) .. controls (443.88,206.87) and (448.61,205.72) .. (453.26,204.32) .. controls (455.75,203.57) and (458.19,201.79) .. (460.87,201.79) .. controls (461.07,201.79) and (460.64,202.15) .. (460.44,202.21) .. controls (459.62,202.44) and (458.7,202.3) .. (457.91,202.63) .. controls (456.28,203.3) and (454.88,204.47) .. (453.26,205.17) .. controls (446.92,207.9) and (440.36,210.1) .. (433.83,212.35) .. controls (431.5,213.15) and (427.07,215.31) .. (427.07,215.31) .. controls (427.07,215.31) and (428.17,214.43) .. (428.76,214.04) .. controls (431.73,212.05) and (434.73,210.56) .. (438.05,208.97) .. controls (439.87,208.1) and (444.97,207.86) .. (443.54,206.43) .. controls (442.23,205.12) and (439.83,206.77) .. (438.05,207.28) .. controls (433.93,208.45) and (427.35,211.5) .. (422.84,211.5) .. controls (422.7,211.5) and (423.14,211.57) .. (423.26,211.5) .. controls (425.13,210.44) and (426.89,209.19) .. (428.76,208.12) .. controls (429.85,207.5) and (431.03,207.04) .. (432.14,206.43) .. controls (434.13,205.35) and (436.05,204.12) .. (438.05,203.05) .. controls (438.18,202.99) and (438.62,203.05) .. (438.47,203.05) .. controls (432.99,203.05) and (427.92,205.16) .. (422.84,206.43) .. controls (422.39,206.54) and (419.04,207.28) .. (419.04,207.28) .. controls (419.04,207.28) and (422.8,206.08) .. (424.53,205.17) .. controls (427.59,203.54) and (430.69,201.98) .. (433.83,200.52) .. controls (434.23,200.33) and (433,200.88) .. (432.56,200.94) .. controls (431.72,201.04) and (430.86,200.86) .. (430.02,200.94) .. controls (426.95,201.25) and (422.33,202.07) .. (419.88,202.63) .. controls (419.02,202.83) and (416.62,203.98) .. (417.35,203.48) .. controls (420.68,201.16) and (424.53,199.49) .. (427.49,196.71) .. controls (427.82,196.4) and (422.65,197.76) .. (422,197.98) .. controls (421.32,198.21) and (419.17,198.35) .. (419.88,198.4) .. controls (422.27,198.6) and (424.68,198.28) .. (427.07,198.4) .. controls (427.38,198.42) and (427.95,198.51) .. (427.91,198.83) .. controls (427.84,199.5) and (425.98,203.96) .. (425.38,204.74) .. controls (423.55,207.11) and (421.51,209.32) .. (419.46,211.5) .. controls (419.36,211.61) and (418.9,211.5) .. (419.04,211.5) .. controls (422.8,211.5) and (426.01,209.57) .. (429.6,208.97) .. controls (429.74,208.94) and (429.3,208.89) .. (429.18,208.97) .. controls (427.59,210.03) and (426.13,211.29) .. (424.53,212.35) .. controls (423.16,213.26) and (421.62,213.9) .. (420.31,214.88) .. controls (420.08,215.05) and (420.88,214.94) .. (421.15,214.88) .. controls (422.15,214.66) and (423.16,214.42) .. (424.11,214.04) .. controls (425.98,213.29) and (427.72,212.23) .. (429.6,211.5) .. controls (430.08,211.32) and (428.74,212.04) .. (428.33,212.35) .. controls (425.85,214.21) and (423.6,216.36) .. (421.15,218.26) .. controls (421.04,218.35) and (420.59,218.3) .. (420.73,218.26) .. controls (427.95,216.46) and (434.64,213.47) .. (441.85,211.5) .. controls (443.03,211.18) and (437.9,213.87) .. (431.29,217.84) .. controls (428.79,219.34) and (424.78,221.37) .. (422.42,223.33) .. controls (422.31,223.42) and (422.71,223.38) .. (422.84,223.33) .. controls (429.9,220.56) and (436.98,217.83) .. (443.97,214.88) .. controls (444.63,214.6) and (442.51,215.02) .. (441.85,215.31) .. controls (439.26,216.44) and (436.81,217.89) .. (434.25,219.11) .. controls (433.56,219.44) and (432.81,219.61) .. (432.14,219.95) .. controls (431.96,220.04) and (431.54,220.46) .. (431.71,220.38) .. controls (437.02,217.72) and (441.95,214.01) .. (447.35,211.5) .. controls (449.41,210.54) and (452.67,211) .. (453.68,208.97) .. controls (454.01,208.33) and (452.22,209.09) .. (451.57,209.39) .. controls (449.46,210.37) and (447.27,211.22) .. (445.23,212.35) .. controls (444.52,212.75) and (442.39,213.98) .. (443.12,213.62) .. controls (444.65,212.85) and (452.83,207.7) .. (454.11,208.12) .. controls (455.07,208.44) and (452.45,209.3) .. (451.57,209.81) .. controls (450.76,210.29) and (449.83,210.57) .. (449.04,211.08) .. controls (445.91,213.07) and (442.97,214.98) .. (439.74,217) .. controls (439.1,217.4) and (436.91,218.07) .. (437.63,217.84) .. controls (440.01,217.1) and (442.4,216.36) .. (444.81,215.73) .. controls (455.07,213.05) and (438.6,218.75) .. (435.52,221.22) .. controls (434.92,221.69) and (436.9,220.59) .. (437.63,220.38) .. controls (438.45,220.13) and (439.36,220.26) .. (440.16,219.95) .. controls (442.34,219.12) and (444.36,217.91) .. (446.5,217) .. controls (447.57,216.54) and (449.88,216.15) .. (449.88,216.15) .. controls (449.88,216.15) and (449.29,216.02) .. (449.04,216.15) .. controls (444.52,218.41) and (440.22,221.08) .. (435.94,223.76) .. controls (434.91,224.4) and (431.82,226.2) .. (432.98,225.87) .. controls (437.19,224.67) and (449.36,219.5) .. (451.15,218.69) .. controls (451.72,218.43) and (453.44,217.64) .. (452.84,217.84) .. controls (446.15,220.07) and (440.05,223.81) .. (433.83,227.14) .. controls (433.27,227.43) and (431.53,228.13) .. (432.14,227.98) .. controls (436.69,226.84) and (445.62,221.64) .. (449.04,221.64) .. controls (449.62,221.64) and (447.86,221.79) .. (447.35,222.07) .. controls (442.67,224.6) and (438.38,227.78) .. (433.83,230.52) .. controls (433.12,230.94) and (430.98,231.42) .. (431.71,231.78) .. controls (432.33,232.09) and (434.71,230.66) .. (435.09,230.52) .. controls (438.17,229.33) and (441.32,228.33) .. (444.39,227.14) .. controls (446.39,226.36) and (448.36,225.51) .. (450.3,224.6) .. controls (450.48,224.52) and (450.93,224.18) .. (450.73,224.18) .. controls (446.95,224.18) and (443.28,228.33) .. (440.59,230.09) .. controls (438.32,231.58) and (435.71,232.48) .. (433.4,233.9) .. controls (432.49,234.46) and (435.35,232.99) .. (436.36,232.63) .. controls (439,231.67) and (441.8,231.17) .. (444.39,230.09) .. controls (445.22,229.75) and (445.68,228.76) .. (446.5,228.4) .. controls (448.01,227.74) and (452.73,226.26) .. (451.15,226.71) .. controls (447.62,227.72) and (435.19,234.4) .. (433.4,236.01) .. controls (432.44,236.88) and (435.98,235.59) .. (437.21,235.16) .. controls (441.97,233.52) and (442.97,232.54) .. (446.5,231.36) .. controls (446.64,231.32) and (446.21,231.3) .. (446.08,231.36) .. controls (444.78,232.01) and (443.48,232.66) .. (442.28,233.47) .. controls (439.95,235.06) and (437.69,236.75) .. (435.52,238.54) .. controls (434.97,238.99) and (433.16,240.03) .. (433.83,239.81) .. controls (438.71,238.18) and (444.47,236.16) .. (448.61,233.05) .. controls (449.22,232.6) and (447.12,233.46) .. (446.5,233.9) .. controls (445.6,234.53) and (444.89,235.41) .. (443.97,236.01) .. controls (443.3,236.44) and (435.52,240.5) .. (435.52,241.08) .. controls (435.52,242.24) and (437.77,240.5) .. (438.9,240.23) .. controls (440.92,239.76) and (443.24,240.11) .. (444.81,238.54) .. controls (445.03,238.32) and (444.21,238.77) .. (443.97,238.97) .. controls (443.5,239.34) and (443.16,239.85) .. (442.7,240.23) .. controls (441.3,241.4) and (439.63,242.82) .. (438.05,243.61) .. controls (437.32,243.98) and (439.45,242.75) .. (440.16,242.35) .. controls (441.32,241.69) and (442.46,241) .. (443.54,240.23) .. controls (446.01,238.51) and (442.39,239.58) .. (442.7,238.97) .. controls (443.84,236.68) and (448.71,232.76) .. (449.04,231.78) .. controls (449.4,230.68) and (443.78,232.56) .. (443.54,232.63) .. controls (443.24,232.71) and (442.7,233.37) .. (442.7,233.05) .. controls (442.7,231.79) and (444.99,231.99) .. (446.08,231.36) .. controls (447.3,230.66) and (448.29,229.61) .. (449.46,228.83) .. controls (450.26,228.29) and (451.19,227.67) .. (451.99,227.14) .. controls (452.11,227.06) and (452.56,227.14) .. (452.42,227.14) .. controls (451.37,227.14) and (449.56,227.2) .. (448.61,227.56) .. controls (446.73,228.27) and (444.98,229.3) .. (443.12,230.09) .. controls (442.99,230.15) and (442.57,230.16) .. (442.7,230.09) .. controls (447.07,227.91) and (453.21,225.92) .. (456.64,222.49) .. controls (457.24,221.89) and (454.93,222.31) .. (454.11,222.49) .. controls (452.23,222.88) and (450.43,223.57) .. (448.61,224.18) .. controls (447.74,224.47) and (447.38,224.31) .. (446.5,224.6) .. controls (446.31,224.66) and (445.94,225.16) .. (446.08,225.02) .. controls (448.65,222.45) and (452.22,221.08) .. (454.95,218.69) .. controls (455.91,217.85) and (452.32,218.6) .. (451.15,219.11) .. controls (449.53,219.82) and (448.1,220.9) .. (446.5,221.64) .. controls (445.98,221.89) and (445.36,221.88) .. (444.81,222.07) .. controls (444.51,222.17) and (443.72,222.69) .. (443.97,222.49) .. controls (448.08,219.06) and (449.07,219.14) .. (454.53,215.73) .. controls (454.91,215.49) and (453.69,216.04) .. (453.26,216.15) .. controls (451.97,216.47) and (450.65,216.82) .. (449.46,217.42) .. controls (449.28,217.51) and (448.9,217.98) .. (449.04,217.84) .. controls (449.89,216.98) and (450.98,216.38) .. (451.99,215.73) .. controls (453.44,214.8) and (454.88,213.84) .. (456.22,212.77) .. controls (456.38,212.65) and (456.84,212.35) .. (456.64,212.35) .. controls (454.75,212.35) and (452.42,213.74) .. (450.73,214.46) .. controls (449.51,214.98) and (446.24,217.31) .. (447.35,216.57) .. controls (449.96,214.83) and (452.76,212.82) .. (455.37,211.08) .. controls (455.49,211) and (455.08,211.02) .. (454.95,211.08) .. controls (452.11,212.43) and (449.26,213.78) .. (446.5,215.31) .. controls (440.96,218.37) and (436.26,222.21) .. (430.87,225.45) .. controls (429.39,226.33) and (426.11,226.94) .. (425.38,228.4) .. controls (425.31,228.53) and (425.67,228.45) .. (425.8,228.4) .. controls (426.77,228.08) and (427.86,227.21) .. (428.76,226.71) .. controls (431.19,225.37) and (434.2,224.66) .. (435.94,222.49) .. controls (436.73,221.5) and (433.39,222.28) .. (432.14,222.49) .. controls (429.81,222.88) and (425.38,222.24) .. (425.38,224.6) .. controls (425.38,224.62) and (433.94,220.2) .. (434.25,219.95) .. controls (434.6,219.67) and (433.33,220.1) .. (432.98,220.38) .. controls (431.89,221.25) and (431.05,222.38) .. (430.02,223.33) .. controls (429.07,224.22) and (426.15,224.95) .. (427.07,225.87) .. controls (428.37,227.17) and (432.37,225.02) .. (434.25,225.02) .. controls (436.25,225.02) and (431.74,228.14) .. (430.45,229.67) .. controls (430.06,230.13) and (428.6,231.08) .. (429.18,230.94) .. controls (430.44,230.62) and (431.73,230.43) .. (432.98,230.09) .. controls (433.29,230.01) and (433.83,229.36) .. (433.83,229.67) .. controls (433.83,231.08) and (431.68,231.52) .. (430.45,232.21) .. controls (429.83,232.55) and (428.07,233.61) .. (428.76,233.47) .. controls (430.2,233.18) and (431.6,232.71) .. (432.98,232.21) .. controls (433.17,232.14) and (433.54,231.64) .. (433.4,231.78) .. controls (431.95,233.24) and (429.28,234.95) .. (428.33,236.85) .. controls (428.07,237.37) and (429.47,236.61) .. (430.02,236.43) .. controls (430.07,236.41) and (433.4,235.31) .. (433.4,235.16) .. controls (433.4,234.06) and (429.86,237.72) .. (430.87,237.28) .. controls (432.26,236.66) and (433.66,236.07) .. (435.09,235.59) .. controls (435.52,235.44) and (434.67,236.41) .. (434.67,236.85) .. controls (434.67,236.99) and (434.58,236.54) .. (434.67,236.43) .. controls (435.45,235.52) and (436.47,234.84) .. (437.21,233.9) .. controls (440.81,229.26) and (440.45,228.77) .. (443.12,223.76) .. controls (444.13,221.86) and (446.6,220.15) .. (445.23,217.42) .. controls (444.66,216.28) and (440.96,218.18) .. (441.43,217) .. controls (442.49,214.35) and (445.24,212.54) .. (446.08,209.81) .. controls (446.29,209.14) and (444.58,209.47) .. (443.97,209.81) .. controls (441.95,210.93) and (440.26,212.58) .. (438.47,214.04) .. controls (438.23,214.24) and (439.06,213.79) .. (439.32,213.62) .. controls (440.89,212.52) and (444.49,212.08) .. (443.97,210.24) .. controls (443.46,208.47) and (440.3,210.43) .. (438.47,210.66) .. controls (435.92,210.98) and (433.43,212.1) .. (430.87,211.93) .. controls (430.45,211.9) and (430.74,211.06) .. (430.87,210.66) .. controls (431.48,208.84) and (433.68,208.81) .. (432.98,206.01) .. controls (432.81,205.33) and (431.54,205.8) .. (430.87,206.01) .. controls (427.18,207.16) and (426.2,208.55) .. (423.26,208.55) .. controls (422.76,208.55) and (424.13,208) .. (424.53,207.7) .. controls (425.34,207.09) and (426.16,206.46) .. (427.07,206.01) .. controls (427.07,206.01) and (441.18,201.02) .. (443.12,197.14) .. controls (443.82,195.75) and (440.01,196.93) .. (438.47,196.71) .. controls (437.66,196.6) and (439.88,195.86) .. (440.59,195.45) .. controls (443.09,193.98) and (445.44,192.14) .. (448.19,191.22) .. controls (448.31,191.18) and (452.08,190.04) .. (451.57,189.53) .. controls (451.42,189.39) and (450.8,189.89) .. (450.73,189.95) .. controls (449.4,191.01) and (447.52,191.75) .. (446.92,193.33) .. controls (446.63,194.13) and (448.62,193.4) .. (449.46,193.33) .. controls (450.59,193.25) and (451.73,193.16) .. (452.84,192.91) .. controls (453.27,192.82) and (454.31,192.09) .. (454.11,192.49) .. controls (453.39,193.92) and (451.61,194.54) .. (450.73,195.87) .. controls (450.31,196.5) and (452.09,195.15) .. (452.84,195.02) .. controls (454.02,194.83) and (454.64,194.89) .. (455.8,194.6) .. controls (456.1,194.53) and (456.88,193.97) .. (456.64,194.18) .. controls (454.82,195.74) and (452.85,197.13) .. (451.15,198.83) .. controls (450.61,199.36) and (452.54,198.22) .. (453.26,197.98) .. controls (454.11,197.7) and (455,197.54) .. (455.8,197.14) .. controls (455.92,197.07) and (456.22,197.14) .. (456.22,197.14) .. controls (456.22,197.14) and (450.3,201.36) .. (450.3,201.36) .. controls (450.86,201.92) and (455.09,200.8) .. (455.8,200.1) .. controls (456.74,199.16) and (453.56,201.52) .. (452.42,202.21) .. controls (451.34,202.86) and (450.02,203.11) .. (449.04,203.9) .. controls (448.71,204.16) and (449.89,203.98) .. (450.3,203.9) .. controls (451.18,203.72) and (452.01,203.38) .. (452.84,203.05) .. controls (453.02,202.98) and (453.44,202.54) .. (453.26,202.63) .. controls (445.15,206.69) and (437,209.57) .. (429.18,214.04) .. controls (426.96,215.31) and (423.34,216.24) .. (421.57,217.42) .. controls (421.2,217.66) and (420.7,217.64) .. (420.31,217.84) .. controls (419.91,218.04) and (418.59,218.26) .. (419.04,218.26) .. controls (422.14,218.26) and (424.69,216.15) .. (427.91,216.15) .. controls (428.19,216.15) and (427.32,216.02) .. (427.07,216.15) .. controls (424.7,217.33) and (422.15,218.17) .. (419.88,219.53) .. controls (419.64,219.68) and (420.08,220.18) .. (419.88,220.38) .. controls (419.68,220.57) and (418.77,220.45) .. (419.04,220.38) .. controls (421.27,219.74) and (423.57,219.32) .. (425.8,218.69) .. controls (426.53,218.48) and (427.19,218.08) .. (427.91,217.84) .. controls (434.25,215.73) and (422.71,219.4) .. (422.42,219.53) .. controls (421.28,220.06) and (420.3,221.22) .. (419.04,221.22) .. controls (418.9,221.22) and (419.34,221.28) .. (419.46,221.22) .. controls (420.48,220.71) and (421.48,220.17) .. (422.42,219.53) .. controls (423.61,218.71) and (432.24,212.92) .. (432.56,211.08) .. controls (433.13,207.75) and (432.69,204.32) .. (432.56,200.94) .. controls (432.53,200.08) and (432.95,198.68) .. (432.14,198.4) .. controls (428.44,197.17) and (426.96,198.84) .. (423.69,199.25) .. controls (422.79,199.36) and (424.81,197.84) .. (425.38,197.14) .. controls (426.57,195.64) and (427.81,194.15) .. (428.76,192.49) .. controls (428.97,192.12) and (429.13,191.41) .. (428.76,191.22) .. controls (427.74,190.71) and (426.47,191.33) .. (425.38,191.64) .. controls (422.29,192.54) and (419.14,193.2) .. (416.08,194.18) .. controls (415.48,194.37) and (413.76,195.02) .. (414.39,195.02) .. controls (417.07,195.02) and (418.96,192.49) .. (421.57,192.49) ;
%Shape: Free Drawing [id:dp6122979322672671] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=6] [line join = round][line cap = round] (463,165.5) .. controls (460.27,165.5) and (463.27,165.5) .. (466,165.5) .. controls (466.47,165.5) and (465.33,166.17) .. (465,166.5) .. controls (464.76,166.74) and (465.76,166.26) .. (466,166.5) .. controls (467.18,167.68) and (461.33,170.5) .. (463,170.5) .. controls (466.67,170.5) and (471.54,169.65) .. (475,168.5) .. controls (476.41,168.03) and (477.59,166.97) .. (479,166.5) .. controls (479.63,166.29) and (481.6,166.2) .. (481,166.5) .. controls (473.4,170.3) and (464.05,174.82) .. (480,169.5) .. controls (481.14,169.12) and (477.98,170.81) .. (477,171.5) .. controls (474.27,173.41) and (471.36,175.14) .. (469,177.5) .. controls (468.25,178.25) and (464.95,178.5) .. (466,178.5) .. controls (471.5,178.5) and (477.6,171.2) .. (482,174.5) .. controls (485.26,176.94) and (475.43,179.32) .. (472,181.5) .. controls (471.6,181.75) and (470.55,182.65) .. (471,182.5) .. controls (473.27,181.74) and (484,177.5) .. (484,177.5) .. controls (484,177.5) and (472.45,183.05) .. (468,187.5) .. controls (466.28,189.22) and (472.75,186.4) .. (475,185.5) .. controls (480.29,183.38) and (485.6,181.3) .. (491,179.5) .. controls (491.71,179.26) and (493.63,178.1) .. (493,178.5) .. controls (488.71,181.23) and (484.51,184.14) .. (480,186.5) .. controls (477.48,187.82) and (474.55,188.23) .. (472,189.5) .. controls (465.62,192.69) and (481.81,187.76) .. (486,186.5) .. controls (493.05,184.39) and (495.87,180.24) .. (497,182.5) .. controls (497.42,183.34) and (495.79,183.99) .. (495,184.5) .. controls (491.1,187) and (487.18,189.5) .. (483,191.5) .. controls (479.48,193.18) and (475.7,194.27) .. (472,195.5) .. controls (471.55,195.65) and (470.53,196.5) .. (471,196.5) .. controls (478.06,196.5) and (484.3,191.73) .. (491,189.5) .. controls (493.07,188.81) and (493.93,189.19) .. (496,188.5) .. controls (496.71,188.26) and (498.62,187.09) .. (498,187.5) .. controls (491.02,192.15) and (486.86,196.88) .. (479,201.5) .. controls (478.98,201.51) and (469.32,204.82) .. (470,205.5) .. controls (471.91,207.41) and (477.2,202.15) .. (479,201.5) .. controls (484.8,199.39) and (493.06,195.5) .. (499,195.5) .. controls (500.94,195.5) and (495.68,197.51) .. (494,198.5) .. controls (489.12,201.37) and (484.07,203.97) .. (479,206.5) .. controls (476.35,207.83) and (471.26,206.71) .. (470,210.5) .. controls (469.33,212.5) and (474,209.17) .. (476,208.5) .. controls (476.95,208.18) and (478.05,208.82) .. (479,208.5) .. controls (486.11,206.13) and (492.51,200.5) .. (500,200.5) .. controls (504.53,200.5) and (492.9,206.19) .. (489,208.5) .. controls (485.53,210.55) and (481.6,211.7) .. (478,213.5) .. controls (477.58,213.71) and (476.53,214.5) .. (477,214.5) .. controls (482.85,214.5) and (498.62,205.31) .. (501,206.5) .. controls (505.3,208.65) and (493.15,212.07) .. (489,214.5) .. controls (482.44,218.35) and (475.81,222.1) .. (469,225.5) .. controls (466.73,226.64) and (473.64,223.43) .. (476,222.5) .. controls (481.6,220.31) and (487.27,218.31) .. (493,216.5) .. controls (495.01,215.87) and (500.75,213.33) .. (499,214.5) .. controls (492.29,218.98) and (484.08,223.96) .. (477,227.5) .. controls (475.11,228.44) and (471.94,227.61) .. (471,229.5) .. controls (470.4,230.69) and (473.71,229.84) .. (475,229.5) .. controls (483.76,227.16) and (492.4,224.37) .. (501,221.5) .. controls (501.71,221.26) and (503,220.5) .. (503,220.5) .. controls (503,220.5) and (497.22,224.38) .. (497,224.5) .. controls (494.38,225.93) and (491.76,227.38) .. (489,228.5) .. controls (482.46,231.16) and (461.94,235.5) .. (469,235.5) .. controls (476.07,235.5) and (492,228.5) .. (492,228.5) .. controls (492,228.5) and (470.07,238.82) .. (466,239.5) .. controls (463.73,239.88) and (461.95,240.85) .. (460,241.5) .. controls (459,241.83) and (455.98,242.78) .. (457,242.5) .. controls (467.67,239.51) and (479.09,238.46) .. (489,233.5) .. controls (491,232.5) and (485,235.5) .. (483,236.5) .. controls (479,238.5) and (475.19,240.93) .. (471,242.5) .. controls (469.03,243.24) and (462.89,244.5) .. (465,244.5) .. controls (469.94,244.5) and (479.46,241.56) .. (483,240.5) .. controls (487.86,239.04) and (463.92,246.5) .. (469,246.5) ;

% Text Node
\draw (151.3,6.94) node [anchor=north west][inner sep=0.75pt]   [align=left] {INNER JOIN};
% Text Node
\draw (369.3,5.94) node [anchor=north west][inner sep=0.75pt]   [align=left] {FULL OUTER JOIN};
% Text Node
\draw (156.3,134.94) node [anchor=north west][inner sep=0.75pt]   [align=left] {LEFT JOIN};
% Text Node
\draw (396.3,134.94) node [anchor=north west][inner sep=0.75pt]   [align=left] {RIGHT JOIN};


\end{tikzpicture}


}

\section{Version Control Software (L7)}

This lecture focuses mainly on \textbf{version control} and \textbf{\texttt{Git}}. Since I know the basics of this stuff already, this will just be a smaller review of \texttt{Git} plus the stuff I didn't already know very well coming into this course.\\

\begin{myproof}
\textbf{Big Idea:} Teams needed good ways to maintain central repositories for their code projects without having conflicting versions of their code, so \textbf{version control software} was created.\\\\ Eventually, this software carried the secondary purpose of tracking and managing bugs. These days, however, dedicated enterprise tools like \textbf{JIRA} also exist to handle bugs. It's mainly based on what your company decides to use.
\end{myproof}

\textbf{Version Control Software} is used to do the following:
\begin{itemize}
	\item Search revision history and get \textbf{any version} of the project you're working on
	\item Share code changes with your collaborators
	\item Confidently make changes to large files
\end{itemize}

\subsection{Centralized VCS}

People used \textbf{Centralized VCS} to have multiple users all pushing towards a central repository. I've seen examples of this in use at HPE (\texttt{SVN}- Subversion), and during my classes at UMD- \texttt{CVS} in CMSC132.

\begin{myproof}
{

\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Can [id:dp2786160443943346] 
\draw   (409.67,73.03) -- (409.67,127.63) .. controls (409.67,134.1) and (385.71,139.33) .. (356.17,139.33) .. controls (326.62,139.33) and (302.67,134.1) .. (302.67,127.63) -- (302.67,73.03) .. controls (302.67,66.57) and (326.62,61.33) .. (356.17,61.33) .. controls (385.71,61.33) and (409.67,66.57) .. (409.67,73.03) .. controls (409.67,79.5) and (385.71,84.73) .. (356.17,84.73) .. controls (326.62,84.73) and (302.67,79.5) .. (302.67,73.03) ;
%Shape: Rectangle [id:dp5804880297256121] 
\draw   (184.67,184.33) -- (254.67,184.33) -- (254.67,224.33) -- (184.67,224.33) -- cycle ;
%Shape: Rectangle [id:dp45826603462909665] 
\draw   (318.67,198.33) -- (388.67,198.33) -- (388.67,238.33) -- (318.67,238.33) -- cycle ;
%Shape: Rectangle [id:dp027544595740327416] 
\draw   (448,185) -- (518,185) -- (518,225) -- (448,225) -- cycle ;
%Straight Lines [id:da9610493616903942] 
\draw    (231.67,177.33) -- (293.43,128.57) ;
\draw [shift={(295,127.33)}, rotate = 501.71] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da5107911626847681] 
\draw    (298.33,131.33) -- (236.57,180.09) ;
\draw [shift={(235,181.33)}, rotate = 321.71000000000004] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da17177392489774002] 
\draw    (477,177.33) -- (417.74,128.6) ;
\draw [shift={(416.2,127.33)}, rotate = 399.43] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da26959159300812263] 
\draw    (413,131.33) -- (472.26,180.06) ;
\draw [shift={(473.8,181.33)}, rotate = 219.43] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da3848074645987064] 
\draw    (349.67,147.33) -- (349.67,191.33) ;
\draw [shift={(349.67,193.33)}, rotate = 270] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da9001312207066434] 
\draw    (360.33,192.67) -- (360.33,148.67) ;
\draw [shift={(360.33,146.67)}, rotate = 450] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (341.33,100.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {repo};
% Text Node
\draw (204,195) node [anchor=north west][inner sep=0.75pt]   [align=left] {user};
% Text Node
\draw (338,209) node [anchor=north west][inner sep=0.75pt]   [align=left] {user};
% Text Node
\draw (467.33,195.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {user};


\end{tikzpicture}


}

\hfill \break In this case, there's a singular centralized codebase that users will all be contributing to at the same time.
\end{myproof}

\subsection{Distributed VCS}

Distributed VCS has no central repository, and every repository has their own commits and history. Examples of this would be \textbf{Git} and \textbf{Mercurial}.\\

\begin{tcolorbox}[title=Aside: \texttt{Git}'s popularity,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]
\texttt{Git} is currently the most widely used code management tool (VCS). The next two in line are \texttt{SVN} and \texttt{Mercurial}.
\end{tcolorbox}

\hfill \break \texttt{Git} is also more efficient and secure than \texttt{SVN}, but a lot of old legacy codebases and companies still make use of \texttt{SVN}. For that reason, it may be good to pick up some of the basics of \texttt{SVN} so I'm not totally unfamiliar, but \texttt{Git} generally seems to be preferable.

\subsection{Branching}

\texttt{Git} also allows for branches. \textbf{Merging} also ties in closely with branching on git, and has a lot more sophistication in \texttt{Git}.

\subsubsection{When Should I Branch?}

Anything in the master branch is considered to be deployable. If you're adding a new feature, working on an experiment, or trying to implement a new fix, make a branch. You can always merge it back to master once it's considered 'deployable' again.

\subsection{Git Basics}

As far as \texttt{git} is concerned, a file has 4 states:

\begin{itemize}
	\item \textbf{Modified} $\rightarrow$ File has been changed, but those changes have not been committed
	\item \textbf{Staged} $\rightarrow$ Marked to go to next commit snapshot
	\item \textbf{Committed} $\rightarrow$ Safely stored in local database as part of a 'commit'
	\item \textbf{Untracked} $\rightarrow$ News added or removed files
\end{itemize}

This idea can be complemented by a visual guide, where you can see the three main 'places' that a file can be within the \texttt{git} system.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp03360737972469485] 
\draw   (87.67,2) -- (235,2) -- (235,42) -- (87.67,42) -- cycle ;
%Shape: Rectangle [id:dp33101061268187126] 
\draw   (269,3.33) -- (416.33,3.33) -- (416.33,43.33) -- (269,43.33) -- cycle ;
%Shape: Rectangle [id:dp2846424801765658] 
\draw   (467.67,3.33) -- (615,3.33) -- (615,43.33) -- (467.67,43.33) -- cycle ;
%Straight Lines [id:da9065613336807579] 
\draw    (160,42) -- (160,291.33) ;
%Straight Lines [id:da2956282330861755] 
\draw    (341.67,42) -- (341.67,291.33) ;
%Straight Lines [id:da16834279927637852] 
\draw    (540.67,42) -- (540.67,291.33) ;
%Straight Lines [id:da5787373144011785] 
\draw [color={rgb, 255:red, 255; green, 0; blue, 0 }  ,draw opacity=1 ]   (540.67,70.5) -- (162,70.5) ;
\draw [shift={(160,70.5)}, rotate = 360] [color={rgb, 255:red, 255; green, 0; blue, 0 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da4994076538949177] 
\draw [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ]   (160,146) -- (339.67,146) ;
\draw [shift={(341.67,146)}, rotate = 180] [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da45013717398282227] 
\draw [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ]   (341.67,221.67) -- (538.67,221.67) ;
\draw [shift={(540.67,221.67)}, rotate = 180] [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (102.5,12.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {working directory};
% Text Node
\draw (299.17,12.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {staging area};
% Text Node
\draw (496.67,12.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {git repository};
% Text Node
\draw (213.33,74.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{checkout}};
% Text Node
\draw (188.67,150) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{stage ({\fontfamily{pcr}\selectfont git add})}};
% Text Node
\draw (414.67,225.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.25,0.46,0.02}{commit}};


\end{tikzpicture}


}

\subsection{Online Hosting}

\textbf{Github}, \textbf{Bitbucket}, and \textbf{Gitlab} are all popular sources that will host your git repositories. Think of this as just another place for your git stuff to exist, except whenever you want to update the main online repository, perform a \texttt{git push}.\\

It's sort of like a hybrid of a centralized and distributed VCS. According to a website online, "GitHub and similar services bring all of the benefits of a decentralized VCS to a centralized service."

\section{Missing Data and Imputation (L8)}

\subsection{Missing Data}

Missing data is information that we want to know, but don't know. It can come in many forms. Here are some examples.

\begin{itemize}
	\item People omitting answers on surveys
	\item Inaccurate measurements that we need to discard- we're mainly talking about easily detectable outliers here
	\item Canceled trials of an experiment
\end{itemize}

\hfill \break To do something about this, however, we need to figure out the following.

\begin{itemize}
	\item What contributes to the \textit{probability} of a data point being absent?
	\item Can this missing data be interpolated using the data we already have? Or not?
\end{itemize}

\subsubsection{Just Deleting It}

The easiest way to deal with this is just to delete all the tuples with any missing values, so we don't have any 'incomplete observations'. All we have to do in this case is just use \texttt{df.dropna()} to trop the appropriate row.\\

Be warned that a loss of a sample could lead to a variance greater than what's reflected by the size of our data. This could cause bias. Overall, despite this being the easiest way to get rid of 'problem' data, we should avoid doing this if we can, and intelligently account for it being missing if possible.

Obviously, if we want to remain the most accurate that we can possibly be, the goal is not to throw in the towel and just toss out missing data right off the bat.

\subsection{Types of Missing Data}

Let's start by classifying missing data into a variety of different types. Missing data can fall into one of three categories. They also have commonly used abbreviations, included here.

\begin{itemize}
	\item Missing Completely at Random (\textbf{MCAR})
	\item Missing at Random (\textbf{MAR})
	\item Missing Not at Random (\textbf{MNAR})
\end{itemize}

\subsubsection{Missing Completely At Random}

Missing completely at random means exactly what it says- the data that has gone missing has gone missing completely and entirely at random- there is no rhyme or reason behind what's gone and why it went.\\

\begin{myproof}
\textbf{Example of MCAR:} Imagine you are doing an experiment on plants grown in pots, when you have a nervous breakdown and destroy some of the pots. You didn't have any bias in how you picked the pots to break, so this data is now \textbf{MCAR}.
\end{myproof}

\hfill \break However, this just isn't realistic. Data is usually missing for a reason. For example, if you're standing outside CSIC polling people for a survey and you suddenly ask for their grades, students who are failing may be less likely to reveal their grades than students who are doing well.

\subsubsection{Missing At Random}

For data that is missing at random, the probability of the missing data is dependent on the observed data, but not the unobserved data.\\

There is a \textbf{probabilistic mechanism} associated with whether the data is missing, and that mechanism takes the observed data as input.\\

\begin{myproof}
\textbf{Example:} If a child does not attend an educational assessment because the child is (genuinely) ill, this might be predictable from other data we have about the child's health, but it would not be related to what we would have measured had the child not been ill.\\\\ Since we could predict the "missing-ness" of the student's score from the student's health, this is considered \textbf{Missing At Random}. \textit{(Taken from Martin Bland's textbook: An Introduction to Medical Statistics, Fourth Edition)}
\end{myproof}

\hfill \break We can model a parameter's "missing-ness" on other properties of the data we have already.

\subsubsection{Missing Not At Random}

The "missing-ness" of a variable has something to do with the variable itself.

\begin{myproof}
\textbf{Example:} If men failed to fill out a survey because of their level of depression, their depression affects the measurement that is being taken about their depression, making this data \textbf{Missing Not at Random}.
\end{myproof}

\subsection{Line of Best Fit}

In order to aid in this, let's again revisit the idea of a 'line of best fit' from statistics. Here's a quick review.\\

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Line of Best Fit} $\rightarrow$ When the difference between the true y-values and the line that you're using the estimate y-values is minimized.
\end{tcolorbox}

\hfill \break This is accomplished by using the least squares method. Below is a visualization of the process. Given the following plotted points (light green), we're looking to create the line of best fit (blue).\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Straight Lines [id:da7643777205506523] 
\draw [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ]   (65,288) -- (638.44,93.97) ;
\draw [shift={(640.33,93.33)}, rotate = 521.31] [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Rectangle [id:dp6573658715334976] 
\draw   (65,8.67) -- (640.33,8.67) -- (640.33,288) -- (65,288) -- cycle ;
%Shape: Circle [id:dp030383011435651586] 
\draw  [fill={rgb, 255:red, 184; green, 233; blue, 134 }  ,fill opacity=1 ] (148.63,189.13) .. controls (148.63,187.11) and (150.27,185.47) .. (152.3,185.47) .. controls (154.33,185.47) and (155.97,187.11) .. (155.97,189.13) .. controls (155.97,191.16) and (154.33,192.8) .. (152.3,192.8) .. controls (150.27,192.8) and (148.63,191.16) .. (148.63,189.13) -- cycle ;
%Shape: Circle [id:dp39783578450486057] 
\draw  [fill={rgb, 255:red, 184; green, 233; blue, 134 }  ,fill opacity=1 ] (254.33,119.93) .. controls (254.33,117.91) and (255.97,116.27) .. (258,116.27) .. controls (260.03,116.27) and (261.67,117.91) .. (261.67,119.93) .. controls (261.67,121.96) and (260.03,123.6) .. (258,123.6) .. controls (255.97,123.6) and (254.33,121.96) .. (254.33,119.93) -- cycle ;
%Straight Lines [id:da026963523494234698] 
\draw [color={rgb, 255:red, 144; green, 19; blue, 254 }  ,draw opacity=1 ]   (152.43,193.6) -- (152.3,257.2) ;
%Straight Lines [id:da6336444278407171] 
\draw [color={rgb, 255:red, 144; green, 19; blue, 254 }  ,draw opacity=1 ]   (258,123.6) -- (258,223.2) ;
%Shape: Circle [id:dp12766640087215864] 
\draw  [fill={rgb, 255:red, 184; green, 233; blue, 134 }  ,fill opacity=1 ] (431.73,217.93) .. controls (431.73,215.91) and (433.37,214.27) .. (435.4,214.27) .. controls (437.43,214.27) and (439.07,215.91) .. (439.07,217.93) .. controls (439.07,219.96) and (437.43,221.6) .. (435.4,221.6) .. controls (433.37,221.6) and (431.73,219.96) .. (431.73,217.93) -- cycle ;
%Shape: Circle [id:dp7005961980277177] 
\draw  [fill={rgb, 255:red, 184; green, 233; blue, 134 }  ,fill opacity=1 ] (576.33,156.33) .. controls (576.33,154.31) and (577.97,152.67) .. (580,152.67) .. controls (582.03,152.67) and (583.67,154.31) .. (583.67,156.33) .. controls (583.67,158.36) and (582.03,160) .. (580,160) .. controls (577.97,160) and (576.33,158.36) .. (576.33,156.33) -- cycle ;
%Straight Lines [id:da8064600665611591] 
\draw [color={rgb, 255:red, 144; green, 19; blue, 254 }  ,draw opacity=1 ]   (435.4,163.2) -- (435.4,214.27) ;
%Straight Lines [id:da09925340912653924] 
\draw [color={rgb, 255:red, 144; green, 19; blue, 254 }  ,draw opacity=1 ]   (580,152.67) -- (580,113.6) ;

% Text Node
\draw (130.3,212.8) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.56,0.07,1}{E$\displaystyle _{1}$}};
% Text Node
\draw (236,154.4) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.56,0.07,1}{E}\textcolor[rgb]{0.56,0.07,1}{$\displaystyle _{2}$}};
% Text Node
\draw (413.4,182.8) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.56,0.07,1}{E}\textcolor[rgb]{0.56,0.07,1}{$\displaystyle _{3}$}};
% Text Node
\draw (558,126) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.56,0.07,1}{E}\textcolor[rgb]{0.56,0.07,1}{$\displaystyle _{4}$}};
% Text Node
\draw (308.01,184.87) node [anchor=north west][inner sep=0.75pt]  [rotate=-341] [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{Line of Best Fit}};


\end{tikzpicture}


}

\hfill \break Here, using the least squares method, we try to minimize the equation yielded by the following summation. 

$$ \sum_{i=1}^{n} {(E_i)^2} = (E_1)^2 + (E_2)^2 + ... (E_n)^2$$

\subsection{Imputation}

How do we handle missing data? One excellent way to do that is \textbf{imputation}.\\

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Imputation} $\rightarrow$ Replacing the missing data with substituted values.
\end{tcolorbox}

\hfill \break Imputation is basically just a fancy way of saying that you'll replace whatever's missing with other data that you generate or collect yourself.

\subsubsection{Types Of Imputation}

\begin{itemize}
	\item \textbf{Mean Imputation} $\rightarrow$ Imputing the average from observed cases for all missing values
	\item \textbf{Hot-Deck Imputation} $\rightarrow$ Imputing a value from another subject, or \textit{donor}, that is most like the subject.
	\item \textbf{Cold-Deck Imputation} $\rightarrow$ Bringing in other datasets
	\item \textbf{Other Methods} $\rightarrow$ Regression imputation, stochastic regression imputation, interpolation and extrapolation.
\end{itemize}

\subsubsection{Single vs. Multiple Imputation}

\textbf{Single Imputation} is supposedly 'old and busted', while \textbf{Multiple Imputation} is the 'new hotness'. Multiple imputation is basically when you impute on the original data $n$ times, then you perform analysis on each of the resulting $n$ datasets, then pool all those results into one, distilled result.\\\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Can [id:dp2602400961202158] 
\draw   (184.33,90.28) -- (184.33,188.05) .. controls (184.33,199.62) and (149.71,209) .. (107,209) .. controls (64.29,209) and (29.67,199.62) .. (29.67,188.05) -- (29.67,90.28) .. controls (29.67,78.71) and (64.29,69.33) .. (107,69.33) .. controls (149.71,69.33) and (184.33,78.71) .. (184.33,90.28) .. controls (184.33,101.85) and (149.71,111.23) .. (107,111.23) .. controls (64.29,111.23) and (29.67,101.85) .. (29.67,90.28) ;
%Flowchart: Punched Tape [id:dp15411156869991438] 
\draw   (501.33,83.6) .. controls (501.33,91.3) and (517.53,97.53) .. (537.5,97.53) .. controls (557.47,97.53) and (573.67,91.3) .. (573.67,83.6) .. controls (573.67,75.9) and (589.86,69.67) .. (609.83,69.67) .. controls (629.81,69.67) and (646,75.9) .. (646,83.6) -- (646,195.07) .. controls (646,187.37) and (629.81,181.13) .. (609.83,181.13) .. controls (589.86,181.13) and (573.67,187.37) .. (573.67,195.07) .. controls (573.67,202.76) and (557.47,209) .. (537.5,209) .. controls (517.53,209) and (501.33,202.76) .. (501.33,195.07) -- cycle ;
%Straight Lines [id:da6406675389654978] 
\draw    (183.67,139.33) -- (225.12,85.19) ;
\draw [shift={(226.33,83.6)}, rotate = 487.44] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da8270960211517617] 
\draw    (183.67,139.33) -- (224.8,104.95) ;
\draw [shift={(226.33,103.67)}, rotate = 500.11] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da20731204626185318] 
\draw    (183.67,139.33) -- (224.49,122.11) ;
\draw [shift={(226.33,121.33)}, rotate = 517.13] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da35473989075756784] 
\draw    (183.67,139.33) -- (224.33,139.33) ;
\draw [shift={(226.33,139.33)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da06709427884110541] 
\draw    (183.67,139.33) -- (224.9,179.73) ;
\draw [shift={(226.33,181.13)}, rotate = 224.41] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da28455274447168444] 
\draw    (255.67,83.6) -- (430.33,83.6) ;
\draw [shift={(432.33,83.6)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da5528993287124772] 
\draw    (255.67,103.67) -- (431.67,103.67) ;
\draw [shift={(433.67,103.67)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da8952519050209367] 
\draw    (255.67,121.33) -- (431,121.33) ;
\draw [shift={(433,121.33)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da6969730944234751] 
\draw    (255.67,139.33) -- (431,139.33) ;
\draw [shift={(433,139.33)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da06722917869533818] 
\draw    (255.67,181.13) -- (431,181.13) ;
\draw [shift={(433,181.13)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da7733865371627034] 
\draw    (463,83.6) -- (500.2,137.69) ;
\draw [shift={(501.33,139.33)}, rotate = 235.48] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da923997647536473] 
\draw    (463,103.67) -- (499.87,137.97) ;
\draw [shift={(501.33,139.33)}, rotate = 222.94] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da862155293400015] 
\draw    (463,121.33) -- (499.52,138.48) ;
\draw [shift={(501.33,139.33)}, rotate = 205.15] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da4866075836725807] 
\draw    (463,139.33) -- (499.33,139.33) ;
\draw [shift={(501.33,139.33)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da6375086814085584] 
\draw    (463,181.13) -- (499.98,140.81) ;
\draw [shift={(501.33,139.33)}, rotate = 492.52] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (229.33,73.6) node [anchor=north west][inner sep=0.75pt]   [align=left] {S$\displaystyle _{1}$};
% Text Node
\draw (229.33,93.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {S$\displaystyle _{2}$};
% Text Node
\draw (229.33,111.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {S$\displaystyle _{3}$};
% Text Node
\draw (229.33,129.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {S$\displaystyle _{4}$};
% Text Node
\draw (229.33,171.13) node [anchor=north west][inner sep=0.75pt]   [align=left] {S$\displaystyle _{n}$};
% Text Node
\draw (441,73.6) node [anchor=north west][inner sep=0.75pt]   [align=left] {A$\displaystyle _{1}$};
% Text Node
\draw (441,93.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {A$\displaystyle _{2}$};
% Text Node
\draw (441,111.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {A$\displaystyle _{3}$};
% Text Node
\draw (441,129.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {A$\displaystyle _{4}$};
% Text Node
\draw (441,171.13) node [anchor=north west][inner sep=0.75pt]   [align=left] {A$\displaystyle _{n}$};
% Text Node
\draw (339.33,150.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {...};
% Text Node
\draw (69.5,125.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Incomplete\\Dataset};
% Text Node
\draw (552,120.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Pooled\\Results};


\end{tikzpicture}


}

\hfill \break In this case, pooling the data is just taking an average. We can see that this 'multiple imputation' model relies on the results of each imputation being distinct.\\

Now, the question arises: \textbf{How big should $n$ be} when we're talking about multiple imputation? This ultimately depends on the \textbf{size of your dataset} and the \textbf{amount of data missing}.\\

Recent research suggests, however, that a 'good' $n$ value is generally higher. General counsel here is to choose a pretty sizable value for $n$.

\section{Data Wrangling, Integration, Cleaning (L9)}

Now that we have imputed and made some new datasets, how do we make use of them? To clarify from the previous example, we can simply perform analysis on each \textbf{individual dataset}, then pool the results of that analysis (take the average of the analyses on the imputed datasets).\\

\textbf{Bayesian Imputation} is another way to handle imputation, this time using Bayes' theorem. A google search doesn't yield much on this, but I assume it's got something to do with a Bayesian attempt at imputing data.\\

\subsection{Data Wrangling}

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Data Wrangling} $\rightarrow$ Getting data into a structured form suitable for analysis.\\\\ A.k.a. \textbf{data preparation}, \textbf{data mining}, or the \textbf{Extract $\rightarrow$ Transfer $\rightarrow$ Load (ETL) Process}.
\end{tcolorbox}

\hfill \break Supposedly, 80 to 90\% of your time is spent on data wrangling.

\subsubsection{Key Steps of Data Wrangling}

\begin{itemize}
	\item \textbf{Scraping} $\rightarrow$ Taking info from sources. The process of getting your data from its unusable format into Python data structures.
	\item \textbf{Transformation} $\rightarrow$ Tidying it into the right data structure. (e.g. converting \texttt{string}s that represent the date into \texttt{datetime} objects)
	\item \textbf{Information Extraction} $\rightarrow$ Extracting structured information from unstructured text and/or sources.
	\item \textbf{Data Integration} $\rightarrow$ (If applicable) combine info from multiple sources so that it all plays nice together.
	\item \textbf{Cleaning} $\rightarrow$ Removing inconsistencies + errors.
\end{itemize}

Usually, there are a 'mishmash' of tools that are used for all these processes, but there are generally accepted tools that are used to 'pipeline' all of these operations so that they work smoothly.\\

For example, some popular ETL services are \textbf{Talend} and \textbf{AWS Glue}.\\

\subsection{Data Integration}

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Data Integration} $\rightarrow$ Combining data from different sources into one unified view, and allowing for structured querying and analysis on that data.
\end{tcolorbox}

\hfill \break Each data has a \textbf{local schema}. We want to bring it all under one, unified, \textbf{global schema}. As it happens, there are actually two main ways that we can accomplish this.

\begin{itemize}
	\item[1.] Bring all the data into one single data structure or repository. This is known as \textbf{data warehousing}.
	\item[2.] Keep the data where it is, in separate repositories, and send \textbf{queries} to our separate data when we need to.
\end{itemize}

Option 1 is pretty self explanatory- you're just mushing all the data together, and with a little effort, you can ensure that the format of your data is all compatible, and combine it. Option 2 is a little tougher to think about at first, so here's a diagram.\\

{

\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Flowchart: Off Page Connector [id:dp9567338023313746] 
\draw   (292,20) -- (362,20) -- (362,52.5) -- (327,60) -- (292,52.5) -- cycle ;
%Shape: Rectangle [id:dp7141492352007189] 
\draw   (221.92,101.33) -- (432.08,101.33) -- (432.08,153.33) -- (221.92,153.33) -- cycle ;
%Straight Lines [id:da5299238451556456] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (327,60) -- (327,96.67) ;
\draw [shift={(327,98.67)}, rotate = 270] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Can [id:dp2595198181958037] 
\draw   (232.33,232.5) -- (232.33,281.5) .. controls (232.33,287.3) and (210.47,292) .. (183.5,292) .. controls (156.53,292) and (134.67,287.3) .. (134.67,281.5) -- (134.67,232.5) .. controls (134.67,226.7) and (156.53,222) .. (183.5,222) .. controls (210.47,222) and (232.33,226.7) .. (232.33,232.5) .. controls (232.33,238.3) and (210.47,243) .. (183.5,243) .. controls (156.53,243) and (134.67,238.3) .. (134.67,232.5) ;
%Shape: Cube [id:dp8253370569862154] 
\draw   (278.67,240.67) -- (299.67,219.67) -- (377.67,219.67) -- (377.67,268.67) -- (356.67,289.67) -- (278.67,289.67) -- cycle ; \draw   (377.67,219.67) -- (356.67,240.67) -- (278.67,240.67) ; \draw   (356.67,240.67) -- (356.67,289.67) ;
%Flowchart: Document [id:dp20461138874225637] 
\draw   (427.33,226) -- (519,226) -- (519,273.3) .. controls (461.71,273.3) and (473.17,290.36) .. (427.33,279.32) -- cycle ;
%Straight Lines [id:da9431063515450052] 
\draw [color={rgb, 255:red, 254; green, 0; blue, 0 }  ,draw opacity=1 ]   (327,154) -- (187.41,232.35) ;
\draw [shift={(185.67,233.33)}, rotate = 330.69] [color={rgb, 255:red, 254; green, 0; blue, 0 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da4037211793604081] 
\draw [color={rgb, 255:red, 254; green, 0; blue, 0 }  ,draw opacity=1 ]   (327,154) -- (327,227.33) ;
\draw [shift={(327,229.33)}, rotate = 270] [color={rgb, 255:red, 254; green, 0; blue, 0 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da35327476463374285] 
\draw [color={rgb, 255:red, 254; green, 0; blue, 0 }  ,draw opacity=1 ]   (327,154) -- (467.22,225.1) ;
\draw [shift={(469,226)}, rotate = 206.89] [color={rgb, 255:red, 254; green, 0; blue, 0 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (297.5,28.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {Query \textcolor[rgb]{0.94,0,0}{Q}};
% Text Node
\draw (236.5,123.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Mediated (Global) Schema};
% Text Node
\draw (152.67,253.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Source A};
% Text Node
\draw (287.67,253.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {Source B};
% Text Node
\draw (438.67,241) node [anchor=north west][inner sep=0.75pt]   [align=left] {Source C};


\end{tikzpicture}


}

\hfill \break Though it is \textbf{less efficient}, this setup is preferable when the data is \textbf{dynamic}.\\

For example, if you needed to make a query on the sum total of the data in Sources A, B and C, and data source B was constantly updating, there's no real way you'd be able to warehouse that data.\\

We usually move to keeping our data separate and maintaining only a global schema when data warehousing is unfeasible.\\

Data integration has key challenges associated with it. For example integration, cleaning, setting up a global schema, and limits on how many time you can access a source (e.g. API request limits)\\

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Schema Matching} $\rightarrow$ Constructing a global schema that allows all the data you're accessing to 'mix'. (Different sources label their data differently!)
\end{tcolorbox}

\hfill \break \textbf{Big Idea:} Data integration is still a pretty ad-hoc and manual process as of right now, but it is a growing area of focus in this relatively new industry.\\

\subsection{Data Cleaning}

Data cleaning is the process in which we deal with data quality issues.\\

\subsubsection{Types of Data Quality Issues}

\begin{itemize}
	\item \textbf{Single Source Problems} $\rightarrow$ These are problems you run into when you work with a single source. Databases tend to force constraints on their data, so we can trust them sometimes. There is \textbf{no Quality Control} for data scraped from files, spreadsheets, or webpages. As such, we're susceptible to things like \textbf{ill-formatted or missing data}, \textbf{contradicting data}, \textbf{illegal entries}, and the like.
	\item \textbf{Multi Source Problems} $\rightarrow$ More problems arise when you try to integrate multiple sources. Different sources are maintained by different people, so we can already see a basis for issues.
	\begin{itemize}
		\item[1.] Mapping Info
		\begin{itemize}
			\item Naming conflicts: The same name is used for different objects.
			\item Structural conflicts: different representations across sources.
		\end{itemize}
		\item[2.] Entity Resolution
		\begin{itemize}
			\item Matching entries across sources may prove to be difficult.
		\end{itemize}
		\item[3.] Data Quality Issues
		\begin{itemize}
			\item Contradicting information, mismatched information, etc.
		\end{itemize}
	\end{itemize}
\end{itemize}

\subsubsection{Outlier Detection}

\textbf{Univariate outliers} are detected by using the \textbf{MAD}.\\

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Median Absolute Deviation (MAD)} $\rightarrow$ Median distance of all the values from the median value.
\end{tcolorbox}

\hfill \break If your data is \textbf{normally distributed}, we can assume that any and all data farther than \textbf{\texttt{1.4826}} MAD from the median are considered outliers.\\

\begin{tcolorbox}[title=Aside: Curse of Dimensionality,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]
The "Curse of Dimensionality" states that methods to find outliers break down as the dimensionality of the data increases. We always have the option of \textbf{projecting} this data down to lower dimensions and working with that, but that's the best we can do. Usually, it isn't that straightforward.
\end{tcolorbox}

\subsubsection{Entity Resolution}

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Entity Resolution} $\rightarrow$ Determining when references to data are equivalent. (E.g. Bob Spence, B. Spence, and Robert Spence \textit{could} all be referencing the same person)
\end{tcolorbox}

\hfill \break When it comes to working with entity resolution in practice, there are three main issues to get over. First, let's informally define a \textbf{mention} as when some object gets referenced.

\begin{itemize}
	\item \textbf{Deduplication} $\rightarrow$ Cluster records/mentions to the same entity and create a \textbf{cluster representative}.
	\item \textbf{Record Linkage} $\rightarrow$ Matching records across different databases. (e.g. finding the same person on FB and Google Plus)
	\item \textbf{Reference Matching} $\rightarrow$ Matching mentions of an entity to the actual entity being mentioned.
\end{itemize}

Actually finding similarity between two references/mentions can be done in many ways. For example, one can use 'edit distance' (string dissimilarity) functions, set similarity, vector similarity, etc. Generally, this must be optimized for each dataset.\\

For more advanced examples, the keyword to search the web for is \textbf{entity resolution algorithms}. The examples in the slides look to be fairly basic and straightforward.

\section{Statistics Review (L10)}

This lecture is a summary of the 'important stuff' from STAT400, and then some. Worth going over, it's been a long time since statistics.

\subsection{Exploratory Data Analysis}

\textbf{Exploratory Data Analysis} is fancy talk for when you're getting a feel for what's going on with the data. We can usually spot certain nuances in the data (skews, trends, interactions of variables) that will help suggest what assumptions to make and what approaches to take when performing actual analysis.

\subsection{Summary Statistics - Overview}

Part of \textbf{descriptive statistics}, summary statistics is basically exactly what you think it is- it's used to summarize data.\\

The big idea here is to convey information with extreme simplicity. This includes the following:

\begin{itemize}
	\item Measures of Location
	\item Measures of Dispersion
	\item Correlations
\end{itemize}

\subsection{Measures of Location}

There exists a certain distribution of values, and we are interested in the center of said distribution. In other words, we are interested in \textbf{measures of central tendency.}\\

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Measure of Central Tendency} $\rightarrow$ A measure of \textbf{central tendency} is a single value that attempts to describe a set of data by identifying the central position within that set of data.
\end{tcolorbox}

\hfill \break There are two measurements that we use for this business: \textbf{sample mean} and \textbf{sample median}. They look similar, and they measure the same thing.\\

We also know that these two measures \textbf{differ predictably} when the data \textbf{aren't symmetric}.\\

The idea of \textbf{weighted average} is when you give more power to larger contributions. For example, bombing a midterm will hurt your grade average more than bombing a homework assignment.\\

\hfill \break It's interesting to note that \textbf{extreme observations} will skew the mean, but not the median. For example:\\

\begin{myproof}
The \textbf{median} of \texttt{[1,2,4,6,8,9,17000]} is $6$\\
The \textbf{mean} of \texttt{[1,2,4,6,8,9,17000]} is $2432.8$ 
\end{myproof} 

\subsection{Measures of Dispersion}

\hfill \break So how do we measure \textbf{dispersion}? We know we can't use the range to do so, as shown by the below example:\\

{

\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Flowchart: Process [id:dp42985399592270557] 
\draw   (68.33,54) -- (220.33,54) -- (220.33,207.33) -- (68.33,207.33) -- cycle ;
%Flowchart: Process [id:dp10230829715920353] 
\draw   (313,54) -- (465,54) -- (465,207.33) -- (313,207.33) -- cycle ;
%Shape: Rectangle [id:dp294441304663163] 
\draw   (68.5,189) -- (80.17,189) -- (80.17,207.33) -- (68.5,207.33) -- cycle ;
%Shape: Rectangle [id:dp0758070145538664] 
\draw   (120.5,184.75) -- (132.17,184.75) -- (132.17,207.33) -- (120.5,207.33) -- cycle ;
%Shape: Rectangle [id:dp2684702369415499] 
\draw   (132,171.25) -- (143.67,171.25) -- (143.67,207.33) -- (132,207.33) -- cycle ;
%Shape: Rectangle [id:dp18320331343577423] 
\draw   (143.5,193.75) -- (155.17,193.75) -- (155.17,207.33) -- (143.5,207.33) -- cycle ;
%Shape: Rectangle [id:dp5730680892444393] 
\draw   (208,192.75) -- (219.67,192.75) -- (219.67,207.33) -- (208,207.33) -- cycle ;
%Shape: Rectangle [id:dp33682018352719645] 
\draw   (312.5,189) -- (324.17,189) -- (324.17,207.33) -- (312.5,207.33) -- cycle ;
%Shape: Rectangle [id:dp35260214424395486] 
\draw   (324,185.25) -- (335.67,185.25) -- (335.67,207.33) -- (324,207.33) -- cycle ;
%Shape: Rectangle [id:dp33610034244219744] 
\draw   (336,189) -- (347.67,189) -- (347.67,207.33) -- (336,207.33) -- cycle ;
%Shape: Rectangle [id:dp3523600042941607] 
\draw   (347.5,178.75) -- (359.17,178.75) -- (359.17,207.33) -- (347.5,207.33) -- cycle ;
%Shape: Rectangle [id:dp9944722298043582] 
\draw   (359.5,194.25) -- (371.17,194.25) -- (371.17,207.33) -- (359.5,207.33) -- cycle ;
%Shape: Rectangle [id:dp48377497880010323] 
\draw   (371,166.75) -- (382.67,166.75) -- (382.67,207.33) -- (371,207.33) -- cycle ;
%Shape: Rectangle [id:dp15813218987235855] 
\draw   (383,157.75) -- (394.67,157.75) -- (394.67,207.33) -- (383,207.33) -- cycle ;
%Shape: Rectangle [id:dp4067396806620175] 
\draw   (394.5,184.25) -- (406.17,184.25) -- (406.17,207.33) -- (394.5,207.33) -- cycle ;
%Shape: Rectangle [id:dp9548628722511838] 
\draw   (406,189) -- (417.67,189) -- (417.67,207.33) -- (406,207.33) -- cycle ;
%Shape: Rectangle [id:dp36628712469833324] 
\draw   (417.5,191.75) -- (429.17,191.75) -- (429.17,207.33) -- (417.5,207.33) -- cycle ;
%Shape: Rectangle [id:dp3668461747520537] 
\draw   (429,184.75) -- (440.67,184.75) -- (440.67,207.33) -- (429,207.33) -- cycle ;
%Shape: Rectangle [id:dp6070297155185923] 
\draw   (441,189) -- (452.67,189) -- (452.67,207.33) -- (441,207.33) -- cycle ;
%Shape: Rectangle [id:dp43867865923165406] 
\draw   (453,189) -- (464.67,189) -- (464.67,207.33) -- (453,207.33) -- cycle ;

% Text Node
\draw (13,224.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {Ex) These two datasets have the same range, but highly different dispersions};


\end{tikzpicture}


}


\hfill \break As such, \textbf{Variance} and \textbf{Standard Deviation} are univariate measures of dispersion in a dataset.\\

\textbf{Variance} measures how far a set of numbers is spread out from their average value. It can be calculated using the following formula:\\

$$\frac{1}{n}\sum_{i=1}^{n}{(x_i-\bar{x})^2}$$\\

\hfill \break In this case, $x_i$ represents each data point, while $\bar{x}$ represents the sample mean.\\

However, the units get messed up when we calculate variance. Thus, \textbf{standard deviation} is an \textbf{interpretable} unit of measurement for dispersion.\\

We calculate the standard deviation by simply finding the square root of variance. The formula is as follows:\\

$$\sqrt{\frac{1}{n}\sum_{i=1}^{n}{(x_i-\bar{x})^2}}$$\\

\begin{tcolorbox}[title=Aside: Bessel's Correction,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]
\textbf{Bessel's Correction} is the formal name for when we use $(n-1)$ instead of $n$ when calculating sample variance and sample standard deviation, where $n$ is the number of observations in the sample. This method apparently corrects bias in the estimation of the population variance.
\end{tcolorbox}

\hfill \break \textbf{Standard deviation} can also be used as a unit. For example, $68\%$ of the distribution will be within 1 standard deviation from the mean, $95\%$ will be within 2 standard deviations from the mean, $99.73\%$ in 3 standard deviations, and so on. (Just like STAT400)

\subsection{Correlation}


\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Correlation} $\rightarrow$ The idea that variables x and y vary together.
\end{tcolorbox}

\hfill \break \textbf{Causality vs. Correlation:} Can we figure out if movement in $x$ induces some sort of movement in $y$?\\

\begin{myproof}
\textbf{Example:} If the following scatter plot suggests correlation, \textbf{linear regression} can actually measure that correlation.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Straight Lines [id:da691600851003093] 
\draw    (99,249) -- (99,92.67) ;
\draw [shift={(99,90.67)}, rotate = 450] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da40851272097006375] 
\draw    (99,249) -- (265,249) ;
\draw [shift={(267,249)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da28842348848593835] 
\draw    (407,249) -- (573,249) ;
\draw [shift={(575,249)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da9336437291069557] 
\draw    (407,249) -- (407,92.67) ;
\draw [shift={(407,90.67)}, rotate = 450] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Flowchart: Summing Junction [id:dp9042799879699613] 
\draw   (128.53,200.87) .. controls (128.53,198.77) and (130.23,197.07) .. (132.33,197.07) .. controls (134.43,197.07) and (136.13,198.77) .. (136.13,200.87) .. controls (136.13,202.97) and (134.43,204.67) .. (132.33,204.67) .. controls (130.23,204.67) and (128.53,202.97) .. (128.53,200.87) -- cycle ; \draw   (129.65,198.18) -- (135.02,203.55) ; \draw   (135.02,198.18) -- (129.65,203.55) ;
%Flowchart: Summing Junction [id:dp33641373424460996] 
\draw   (148.53,220.87) .. controls (148.53,218.77) and (150.23,217.07) .. (152.33,217.07) .. controls (154.43,217.07) and (156.13,218.77) .. (156.13,220.87) .. controls (156.13,222.97) and (154.43,224.67) .. (152.33,224.67) .. controls (150.23,224.67) and (148.53,222.97) .. (148.53,220.87) -- cycle ; \draw   (149.65,218.18) -- (155.02,223.55) ; \draw   (155.02,218.18) -- (149.65,223.55) ;
%Flowchart: Summing Junction [id:dp25077564919441] 
\draw   (155.87,190.87) .. controls (155.87,188.77) and (157.57,187.07) .. (159.67,187.07) .. controls (161.77,187.07) and (163.47,188.77) .. (163.47,190.87) .. controls (163.47,192.97) and (161.77,194.67) .. (159.67,194.67) .. controls (157.57,194.67) and (155.87,192.97) .. (155.87,190.87) -- cycle ; \draw   (156.98,188.18) -- (162.35,193.55) ; \draw   (162.35,188.18) -- (156.98,193.55) ;
%Flowchart: Summing Junction [id:dp10280310083157251] 
\draw   (188.53,184.87) .. controls (188.53,182.77) and (190.23,181.07) .. (192.33,181.07) .. controls (194.43,181.07) and (196.13,182.77) .. (196.13,184.87) .. controls (196.13,186.97) and (194.43,188.67) .. (192.33,188.67) .. controls (190.23,188.67) and (188.53,186.97) .. (188.53,184.87) -- cycle ; \draw   (189.65,182.18) -- (195.02,187.55) ; \draw   (195.02,182.18) -- (189.65,187.55) ;
%Flowchart: Summing Junction [id:dp9991352500853743] 
\draw   (194.2,152.2) .. controls (194.2,150.1) and (195.9,148.4) .. (198,148.4) .. controls (200.1,148.4) and (201.8,150.1) .. (201.8,152.2) .. controls (201.8,154.3) and (200.1,156) .. (198,156) .. controls (195.9,156) and (194.2,154.3) .. (194.2,152.2) -- cycle ; \draw   (195.31,149.51) -- (200.69,154.89) ; \draw   (200.69,149.51) -- (195.31,154.89) ;
%Flowchart: Summing Junction [id:dp6950916379064316] 
\draw   (234.2,147.53) .. controls (234.2,145.43) and (235.9,143.73) .. (238,143.73) .. controls (240.1,143.73) and (241.8,145.43) .. (241.8,147.53) .. controls (241.8,149.63) and (240.1,151.33) .. (238,151.33) .. controls (235.9,151.33) and (234.2,149.63) .. (234.2,147.53) -- cycle ; \draw   (235.31,144.85) -- (240.69,150.22) ; \draw   (240.69,144.85) -- (235.31,150.22) ;
%Flowchart: Summing Junction [id:dp9248526822392967] 
\draw   (214.2,128.2) .. controls (214.2,126.1) and (215.9,124.4) .. (218,124.4) .. controls (220.1,124.4) and (221.8,126.1) .. (221.8,128.2) .. controls (221.8,130.3) and (220.1,132) .. (218,132) .. controls (215.9,132) and (214.2,130.3) .. (214.2,128.2) -- cycle ; \draw   (215.31,125.51) -- (220.69,130.89) ; \draw   (220.69,125.51) -- (215.31,130.89) ;
%Flowchart: Summing Junction [id:dp5996481964322673] 
\draw   (115.2,233.2) .. controls (115.2,231.1) and (116.9,229.4) .. (119,229.4) .. controls (121.1,229.4) and (122.8,231.1) .. (122.8,233.2) .. controls (122.8,235.3) and (121.1,237) .. (119,237) .. controls (116.9,237) and (115.2,235.3) .. (115.2,233.2) -- cycle ; \draw   (116.31,230.51) -- (121.69,235.89) ; \draw   (121.69,230.51) -- (116.31,235.89) ;
%Flowchart: Summing Junction [id:dp5620763540811129] 
\draw   (155.2,127.53) .. controls (155.2,125.43) and (156.9,123.73) .. (159,123.73) .. controls (161.1,123.73) and (162.8,125.43) .. (162.8,127.53) .. controls (162.8,129.63) and (161.1,131.33) .. (159,131.33) .. controls (156.9,131.33) and (155.2,129.63) .. (155.2,127.53) -- cycle ; \draw   (156.31,124.85) -- (161.69,130.22) ; \draw   (161.69,124.85) -- (156.31,130.22) ;
%Flowchart: Summing Junction [id:dp5542972204658851] 
\draw   (227.53,175.87) .. controls (227.53,173.77) and (229.23,172.07) .. (231.33,172.07) .. controls (233.43,172.07) and (235.13,173.77) .. (235.13,175.87) .. controls (235.13,177.97) and (233.43,179.67) .. (231.33,179.67) .. controls (229.23,179.67) and (227.53,177.97) .. (227.53,175.87) -- cycle ; \draw   (228.65,173.18) -- (234.02,178.55) ; \draw   (234.02,173.18) -- (228.65,178.55) ;
%Flowchart: Summing Junction [id:dp9478706147850046] 
\draw   (438.53,199.87) .. controls (438.53,197.77) and (440.23,196.07) .. (442.33,196.07) .. controls (444.43,196.07) and (446.13,197.77) .. (446.13,199.87) .. controls (446.13,201.97) and (444.43,203.67) .. (442.33,203.67) .. controls (440.23,203.67) and (438.53,201.97) .. (438.53,199.87) -- cycle ; \draw   (439.65,197.18) -- (445.02,202.55) ; \draw   (445.02,197.18) -- (439.65,202.55) ;
%Flowchart: Summing Junction [id:dp0966153856744063] 
\draw   (458.53,219.87) .. controls (458.53,217.77) and (460.23,216.07) .. (462.33,216.07) .. controls (464.43,216.07) and (466.13,217.77) .. (466.13,219.87) .. controls (466.13,221.97) and (464.43,223.67) .. (462.33,223.67) .. controls (460.23,223.67) and (458.53,221.97) .. (458.53,219.87) -- cycle ; \draw   (459.65,217.18) -- (465.02,222.55) ; \draw   (465.02,217.18) -- (459.65,222.55) ;
%Flowchart: Summing Junction [id:dp198239701522397] 
\draw   (465.87,189.87) .. controls (465.87,187.77) and (467.57,186.07) .. (469.67,186.07) .. controls (471.77,186.07) and (473.47,187.77) .. (473.47,189.87) .. controls (473.47,191.97) and (471.77,193.67) .. (469.67,193.67) .. controls (467.57,193.67) and (465.87,191.97) .. (465.87,189.87) -- cycle ; \draw   (466.98,187.18) -- (472.35,192.55) ; \draw   (472.35,187.18) -- (466.98,192.55) ;
%Flowchart: Summing Junction [id:dp6366216446149654] 
\draw   (498.53,183.87) .. controls (498.53,181.77) and (500.23,180.07) .. (502.33,180.07) .. controls (504.43,180.07) and (506.13,181.77) .. (506.13,183.87) .. controls (506.13,185.97) and (504.43,187.67) .. (502.33,187.67) .. controls (500.23,187.67) and (498.53,185.97) .. (498.53,183.87) -- cycle ; \draw   (499.65,181.18) -- (505.02,186.55) ; \draw   (505.02,181.18) -- (499.65,186.55) ;
%Flowchart: Summing Junction [id:dp5339937012733282] 
\draw   (504.2,151.2) .. controls (504.2,149.1) and (505.9,147.4) .. (508,147.4) .. controls (510.1,147.4) and (511.8,149.1) .. (511.8,151.2) .. controls (511.8,153.3) and (510.1,155) .. (508,155) .. controls (505.9,155) and (504.2,153.3) .. (504.2,151.2) -- cycle ; \draw   (505.31,148.51) -- (510.69,153.89) ; \draw   (510.69,148.51) -- (505.31,153.89) ;
%Flowchart: Summing Junction [id:dp3396798526868743] 
\draw   (544.2,146.53) .. controls (544.2,144.43) and (545.9,142.73) .. (548,142.73) .. controls (550.1,142.73) and (551.8,144.43) .. (551.8,146.53) .. controls (551.8,148.63) and (550.1,150.33) .. (548,150.33) .. controls (545.9,150.33) and (544.2,148.63) .. (544.2,146.53) -- cycle ; \draw   (545.31,143.85) -- (550.69,149.22) ; \draw   (550.69,143.85) -- (545.31,149.22) ;
%Flowchart: Summing Junction [id:dp6647281849884135] 
\draw   (524.2,127.2) .. controls (524.2,125.1) and (525.9,123.4) .. (528,123.4) .. controls (530.1,123.4) and (531.8,125.1) .. (531.8,127.2) .. controls (531.8,129.3) and (530.1,131) .. (528,131) .. controls (525.9,131) and (524.2,129.3) .. (524.2,127.2) -- cycle ; \draw   (525.31,124.51) -- (530.69,129.89) ; \draw   (530.69,124.51) -- (525.31,129.89) ;
%Flowchart: Summing Junction [id:dp4567465349796471] 
\draw   (425.2,232.2) .. controls (425.2,230.1) and (426.9,228.4) .. (429,228.4) .. controls (431.1,228.4) and (432.8,230.1) .. (432.8,232.2) .. controls (432.8,234.3) and (431.1,236) .. (429,236) .. controls (426.9,236) and (425.2,234.3) .. (425.2,232.2) -- cycle ; \draw   (426.31,229.51) -- (431.69,234.89) ; \draw   (431.69,229.51) -- (426.31,234.89) ;
%Flowchart: Summing Junction [id:dp39927990486176523] 
\draw   (465.2,126.53) .. controls (465.2,124.43) and (466.9,122.73) .. (469,122.73) .. controls (471.1,122.73) and (472.8,124.43) .. (472.8,126.53) .. controls (472.8,128.63) and (471.1,130.33) .. (469,130.33) .. controls (466.9,130.33) and (465.2,128.63) .. (465.2,126.53) -- cycle ; \draw   (466.31,123.85) -- (471.69,129.22) ; \draw   (471.69,123.85) -- (466.31,129.22) ;
%Flowchart: Summing Junction [id:dp0888699043165192] 
\draw   (537.53,174.87) .. controls (537.53,172.77) and (539.23,171.07) .. (541.33,171.07) .. controls (543.43,171.07) and (545.13,172.77) .. (545.13,174.87) .. controls (545.13,176.97) and (543.43,178.67) .. (541.33,178.67) .. controls (539.23,178.67) and (537.53,176.97) .. (537.53,174.87) -- cycle ; \draw   (538.65,172.18) -- (544.02,177.55) ; \draw   (544.02,172.18) -- (538.65,177.55) ;
%Straight Lines [id:da22477620790243336] 
\draw [color={rgb, 255:red, 255; green, 0; blue, 0 }  ,draw opacity=1 ]   (407,249) -- (568,115.75) ;
%Right Arrow [id:dp31063873859211444] 
\draw  [fill={rgb, 255:red, 248; green, 231; blue, 28 }  ,fill opacity=1 ] (305,162) -- (347,162) -- (347,152) -- (375,172) -- (347,192) -- (347,182) -- (305,182) -- cycle ;

% Text Node
\draw (320.5,163.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {???};


\end{tikzpicture}


}

\end{myproof}

\hfill \break However, be warned. \textbf{Correlation does not imply causation!} Just because two things vary at similar rates does not imply that one affects the other!\\

\subsection{Standardization}

Say we find out our variables are skewed and have different ranges. We should \textbf{transform} the variables to ease data analysis and allow the use of other statistics and/or machine learning models.\\ 

This can be done with \textbf{Standardization} (3 types).

\begin{itemize}
	\item \textbf{P-Standardization} (Percentiles)
	\item \textbf{Z-Standardization} (Z-score)
	\item \textbf{D-Standardization} (Dichotomization)
\end{itemize}

You should \textbf{always} standardize. \\

\begin{myproof}
\textbf{Example:} when averaging multiple variables, like creating a 'socioeconomic status' variable out of a combination of age, income and education.
\end{myproof}

\hfill \break \textbf{P-Standardization} is sometimes called a 'percentile score'. Every observation is assigned to a number between 0 and 100, indicating the \% of observations 'beneath it'.\\

\textbf{Special examples of this:} median, quartiles, quintiles.\\

This process is known as transforming the variable into an \textbf{ordinal variable}.\\

You can also group data into 'standard deviations from the mean', like we do with exam scores.\\

It's important to note which of your datasets uses \textbf{discrete} or \textbf{continuous} variables, because some models only work on one type of dataset or another, so we should keep that in mind.\\

\begin{tcolorbox}[title=Aside: Logarithmic Transformation,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]
The Logarithmic transformation is especially useful when standardizing data that is skewed to the right, i.e. when we have some ridiculously high values to the right of our data. The log transform 'reels them in' to the left and helps us deal with all the data.
\end{tcolorbox}

\section{Networks and Graphs (L11)}

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Network} $\rightarrow$ a system of interconnected objects, often represented by a graph.
\end{tcolorbox}

\subsection{Review on Graphs}

A \textbf{graph} is a data structure describing a set of vertices and edges. It can either be \textbf{weighted} or \textbf{unweighted}, \textbf{directed} or \textbf{non-directed}.\\

You can store graphs in a few different ways.\\

\begin{myproof}
\textbf{Adjacency List}\\
(For each vertex, store a list of the vertices it connects to)\\

Pros:
\begin{itemize}
	\item Easy to add edges
\end{itemize}

Cons:
\begin{itemize}
	\item Deleting is hard
	\item Checking for edges is \texttt{O(|v|)}
\end{itemize}

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Circle [id:dp2717325757060276] 
\draw   (22.5,70.17) .. controls (22.5,58.84) and (31.68,49.67) .. (43,49.67) .. controls (54.32,49.67) and (63.5,58.84) .. (63.5,70.17) .. controls (63.5,81.49) and (54.32,90.67) .. (43,90.67) .. controls (31.68,90.67) and (22.5,81.49) .. (22.5,70.17) -- cycle ;
%Shape: Circle [id:dp10440427856784407] 
\draw   (121.83,104.17) .. controls (121.83,92.84) and (131.01,83.67) .. (142.33,83.67) .. controls (153.66,83.67) and (162.83,92.84) .. (162.83,104.17) .. controls (162.83,115.49) and (153.66,124.67) .. (142.33,124.67) .. controls (131.01,124.67) and (121.83,115.49) .. (121.83,104.17) -- cycle ;
%Shape: Circle [id:dp1041314060360411] 
\draw   (25.17,192.83) .. controls (25.17,181.51) and (34.34,172.33) .. (45.67,172.33) .. controls (56.99,172.33) and (66.17,181.51) .. (66.17,192.83) .. controls (66.17,204.16) and (56.99,213.33) .. (45.67,213.33) .. controls (34.34,213.33) and (25.17,204.16) .. (25.17,192.83) -- cycle ;
%Shape: Circle [id:dp7832881426057642] 
\draw   (131.17,202.83) .. controls (131.17,191.51) and (140.34,182.33) .. (151.67,182.33) .. controls (162.99,182.33) and (172.17,191.51) .. (172.17,202.83) .. controls (172.17,214.16) and (162.99,223.33) .. (151.67,223.33) .. controls (140.34,223.33) and (131.17,214.16) .. (131.17,202.83) -- cycle ;
%Straight Lines [id:da38761707924361] 
\draw    (122.33,97.33) -- (65.32,71.01) ;
\draw [shift={(63.5,70.17)}, rotate = 384.78999999999996] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Curve Lines [id:da24509226005445295] 
\draw    (53.33,52) .. controls (123.27,-5.13) and (134.02,47.7) .. (136.24,81.79) ;
\draw [shift={(136.33,83.33)}, rotate = 266.63] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da1214994408370218] 
\draw    (60.33,178) -- (122.27,114.76) ;
\draw [shift={(123.67,113.33)}, rotate = 494.4] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Curve Lines [id:da8406248096151248] 
\draw    (58.67,209.67) .. controls (116.78,271.07) and (129.94,262.93) .. (144.98,225.08) ;
\draw [shift={(145.67,223.33)}, rotate = 471.3] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da16515278602403738] 
\draw    (277.33,42) -- (326.33,42) ;
%Straight Lines [id:da24796609978389095] 
\draw    (354,42) -- (427,42) ;

% Text Node
\draw (37.5,60.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont A}};
% Text Node
\draw (136.83,94.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont C}};
% Text Node
\draw (40.17,183) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont B}};
% Text Node
\draw (146.17,193) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont D}};
% Text Node
\draw (278.67,21) node [anchor=north west][inner sep=0.75pt]   [align=left] {Vertex};
% Text Node
\draw (354.67,21) node [anchor=north west][inner sep=0.75pt]   [align=left] {Neighbors};
% Text Node
\draw (295.33,46) node [anchor=north west][inner sep=0.75pt]   [align=left] {A};
% Text Node
\draw (380,46) node [anchor=north west][inner sep=0.75pt]   [align=left] {[C]};
% Text Node
\draw (294.67,70) node [anchor=north west][inner sep=0.75pt]   [align=left] {B};
% Text Node
\draw (372,70.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {[C, D]};
% Text Node
\draw (294.67,92.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {C};
% Text Node
\draw (382.67,92) node [anchor=north west][inner sep=0.75pt]   [align=left] {[A]};
% Text Node
\draw (295.33,114.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {D};
% Text Node
\draw (388,112.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {[]};


\end{tikzpicture}
}
\end{myproof}

\begin{myproof}
\textbf{Adjacency Dictionary}\\
(For each vertex, store a dictionary of vertices that it connects to)\\\\
Pros:
\begin{itemize}
	\item O(1) to add, remove and query edges
\end{itemize}
Cons:
\begin{itemize}
	\item Memory + Caching overhead (heavy use of Python data structure)
\end{itemize}

Neighbors for B $\rightarrow$ \texttt{\{C:1.0, D:2.0\}}
\end{myproof}

\hfill \break The previous two are pretty much similar, it's just that the Adjacency Dictionary takes advantage of features provided to us in Python, while the Adjacency List is the more classic way to represent a graph. However, the reason we talk specifically about the adjacency dictionary will be explored below.\\

\begin{myproof}
\textbf{Adjacency Matrix}\\
(Store the connectivity of the graph in a matrix- almost always in a sparse matrix)\\\\
Pros:
\begin{itemize}
	\item \texttt{O(1)} time for query, add edge, remove edge
\end{itemize}
Cons:
\begin{itemize}
	\item \texttt{O($v^2$)} space complexity, regardless of the actual number of edges
\end{itemize}
\end{myproof}

\subsection{NetworkX}

\texttt{\textbf{NetworkX}} is how data scientists work with graphs in \textbf{Python}. It makes use of an \textbf{Adjacency Dictionary} representation, which is essentially just a beefed up Adjacency List with better access times and more overhead (see above).

\subsection{Graph Databases}

Graph databases exist and are an alternative to relational databases.\\

\begin{myproof}
\textbf{Example:} A very simple graph database. Note the similarities to a relational database. Here, the different relations like '\texttt{is\_friend}' and '\texttt{read}' can be represented with, for example, separate adjacency dictionaries.\\\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp051464383627423005] 
\draw   (64,54.67) -- (184.33,54.67) -- (184.33,94.67) -- (64,94.67) -- cycle ;
%Shape: Rectangle [id:dp5147384377159534] 
\draw   (342.67,25.33) -- (463,25.33) -- (463,65.33) -- (342.67,65.33) -- cycle ;
%Shape: Rectangle [id:dp13294535427367693] 
\draw   (131,168.67) -- (372.33,168.67) -- (372.33,244) -- (131,244) -- cycle ;
%Curve Lines [id:da8347841470794473] 
\draw [color={rgb, 255:red, 245; green, 166; blue, 35 }  ,draw opacity=1 ]   (133,52.67) .. controls (170.29,3.17) and (292.52,22.93) .. (340.89,29.8) ;
\draw [shift={(342.33,30)}, rotate = 188.02] [color={rgb, 255:red, 245; green, 166; blue, 35 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Curve Lines [id:da09588463321076213] 
\draw [color={rgb, 255:red, 245; green, 166; blue, 35 }  ,draw opacity=1 ]   (391.67,65.33) .. controls (352.73,148.49) and (238.32,83.98) .. (185.9,81.39) ;
\draw [shift={(184.33,81.33)}, rotate = 361.48] [color={rgb, 255:red, 245; green, 166; blue, 35 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da2685612909820726] 
\draw [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ]   (119.67,95.33) -- (189.61,167.89) ;
\draw [shift={(191,169.33)}, rotate = 226.05] [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da7536260060068494] 
\draw [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ]   (401.67,65.33) -- (320.25,167.1) ;
\draw [shift={(319,168.67)}, rotate = 308.65999999999997] [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (141.33,187.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {TITLE: Bob's Adventure\\AUTHORS: Tim Lane, Jack Hart};
% Text Node
\draw (71.33,65.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {NAME: John P.};
% Text Node
\draw (358.67,36) node [anchor=north west][inner sep=0.75pt]   [align=left] {NAME: Lizzy};
% Text Node
\draw (170.67,2) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont \textcolor[rgb]{0.96,0.65,0.14}{is\_friend}}};
% Text Node
\draw (286,81.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont \textcolor[rgb]{0.96,0.65,0.14}{is\_friend}}};
% Text Node
\draw (112.67,124) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont \textcolor[rgb]{0.29,0.56,0.89}{read}}};
% Text Node
\draw (362.33,120) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont \textcolor[rgb]{0.29,0.56,0.89}{read}}};


\end{tikzpicture}


}
\end{myproof}

\hfill \break To \textbf{query} a graph database, we need to use a language other than SQL.\\

\texttt{\textbf{Neo4J}} and \texttt{\textbf{Titan}} are popular graph DB solutions, and can be accessed and leveraged using \texttt{\textbf{Bulbflow}}, a Python library.\\

\subsection{Centrality Analysis}

Centrality Analysis is the idea of finding out the most important node(s) in one network- not all nodes are created equal. This is important in visualization and classification. There are four distinct types of centrality analyses.

\begin{itemize}
	\item \textbf{Degree Centrality} $\rightarrow$ The importance of a vertex is determined by the number of vertices adjacent to it. In other words: the larger the \textit{degree}, the more important the vertex. To find normalized degree centrality, just divide \texttt{(degree of node)} by \texttt{(total nodes - 1)}.
	\item \textbf{Closeness Centrality} $\rightarrow$ The importance of a vertex is measured by how close a vertex is to other vertices. For example, to determine this, we can calculate the average length of all the shortest paths from that one node to every other node in the network. If needed, we can factor in edge weight.
	\item \textbf{Betweenness Centrality} $\rightarrow$ How "in the way" is this node in terms of shortest paths between nodes?
	$$\frac{\sigma_{st} v_i}{\sigma_{st}}$$
	To find this, we just need to divide the shortest paths passing through $v_i$ by the total shortest paths in the graph (as shown above).
	\item \textbf{Eigenvector Centrality} $\rightarrow$ A vertex's importance should be determined by the importance of the friends of that vertex- i.e. \textit{If one has many important friends, they too, are important.} A variant of eigenvector centrality is Google's PageRank algorithm's scoring system.
\end{itemize}

All of this and more are implemented in \texttt{NetworkX}.

\subsection{Network Topology}

We want to learn from the topology of a network- which is basically its broad, overarching structure.\\

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Bridge} $\rightarrow$ An edge is a bridge if its removal results in total disconnection of its terminal vertices. Bridges connecting two different communities are weak ties.\\\\
\textbf{Community} $\rightarrow$ A community is a tightly knit region of a network. We can use bridges to help define how communities are separated.
\end{tcolorbox}

\hfill \break The \textbf{Girvan-Newman Method} is a way to flesh out tightly-knit communities in networks. You basically remove the edges of a higher betweenness, then keep repeating this process. (This functionality is found in \texttt{NetworkX}).\\

You can also use \textbf{\texttt{GraphViz}} (like in CMSC330 with the regex project) to display your graphs and networks.\\

\section{Natural Language Processing (L14 + 15)}

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Natural Language Processing} $\rightarrow$ Natural language processing is a subfield of linguistics, computer science, information engineering, and artificial intelligence concerned with the interactions between computers and human languages, in particular how to program computers to process and analyze large amounts of natural language data. \textit{(Wikipedia)}
\end{tcolorbox}

\hfill \break It appears that this field is the intersection of \textbf{linguistics} and \textbf{computer science}. It's worth picking up further linguistics knowledge if pursuing NLP.

\hfill \break There are many uses for Natural Language Processing (NLP) software, but here are some good examples.

\begin{itemize}
	\item \textbf{Review analysis} is something that a lot of marketing and product companies have to do- they don't want to manually sift through every review of their product and figure out if the people like it or not, so they assign software to do it!
	\item \textbf{Language translation} is another excellent example. Language translation isn't as simple as swapping out words for one another, extra processing has to be done. This falls under the realm of NLP.
	\item \textbf{Question and answer}, like \textit{Jeopardy}, requires a fair amount of natural language processing. When IBM's Watson appeared on \textit{Jeopardy}, programmers mentioned that he had to distinguish between simple 'factoid questions', which could just be answered via a simple fact lookup, and 'thinking questions', which usually required the answer-er to follow some sort of narrative and do some critical thinking.
\end{itemize}

\subsection{Background}

Before the 1980s, Natural Language Processing was widely based on sets of hand-tuned rules.\\

After the 1980s, however, machine learned seeped into the realm of natural language processing.\\

This started with decision trees then went on to leveraging \textbf{hidden markov models}. Hidden markov models are based on augmenting \textbf{Markov Chain}s.\\

\begin{tcolorbox}[title=Aside: Markov Chains,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]
A Markov chain is a mathematical system that experiences transitions from one state to another according to certain probabilistic rules. The defining characteristic of a Markov chain is that no matter how the process arrived at its present state, the possible future states are fixed. It looks kind of like an NFA.\\

Excellent further information and an example found at:\\
\texttt{https://setosa.io/ev/markov-chains/}
\end{tcolorbox}

\hfill \break Then, people shifted to using statistical model for language. Now, people have finally come to using mostly unsupervised or semi-supervised learning for models.\\

In data science, lots of data comes in the form of \textbf{unstructured text}. We're basically defining that as text that cannot be predictably processed. The following are examples of chunks of text that could carry unstructured, yet valuable data.

\begin{itemize}
	\item Facebook posts
	\item Amazon reviews
	\item Wikileaks dumps
\end{itemize}

Understanding the English language is hard, as it's fairly hard to predict which bits of sentences mean what. Specifically, we can say that \textbf{structure can sometimes be ambigious}.\\

\begin{tcolorbox}[title=Aside: Windgrad Schema Challenge,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]
Proposed as a complement to the turing test, the Windgrad Schema Challenge involved asking people (or computers) to pick out the antecedent of an ambiguous pronoun.\\
\begin{myproof}
\textbf{Example:} The city \textbf{councilmen} refused the \textbf{demonstrators} a permit because they \textbf{[}\textit{feared}\textbf{/}\textit{advocated}\textbf{]} violence. 
\end{myproof}

\hfill \break Here, we know that \textbf{feared} corresponds to \textbf{councilmen}, while \textbf{advocated} corresponds to the \textbf{demonstrators}. It's fairly easy for a human to deduce this, byt the guy who hosted this challenge argued that understanding this requires 'more than NLP', and that such a task required some type of common sense and \textbf{deep, contextual reasoning}.
\end{tcolorbox}

NLP is a pretty broad field- we're starting this with sentiment analysis.\\

It can be argued that we don't need to fully understand the text to determine the general sentiment. For example, if you were to scroll through Amazon interviews, you can sort of tell which ones are glowing ones and which ones are very negative.\\

The main idea is to look for \textbf{signals} in the text that tell us the sentimental value of that text.

\subsection{Natural Language Processing Terminology}

Some key terms used in industry.

\begin{itemize}
	\item \textbf{Documents} $\rightarrow$ Groups of free text- remember that this definition is sort of distinct from the idea of 'word documents' (common confusion)
	\item \textbf{Corpus} $\rightarrow$ A collection of documents
	\item \textbf{Terms} $\rightarrow$ Individual words (usually delimited by whitespace or some equivalent)
	\item \textbf{Syntax} $\rightarrow$ Refers to the grammatical structure of language
	\item \textbf{Semantics} $\rightarrow$ The study of meaning of language
	\item \textbf{Tokenization} $\rightarrow$ Splitting sentences into tokens (similar to how compilers 'lex' stuff). There's almost certainly a library for this process.
	\item \textbf{Stemming} $\rightarrow$ Finding roots of words, e.g. turning \textit{organizer} into \textit{organiz}. Also written as: 'The crude chopping of 'affixes'.
	\item \textbf{Lemmatization} $\rightarrow$ Reducing inflections on variant forms back to 'base form', e.g. \textit{are}, \textit{am}, and \textit{is} become \textit{be}.
	\item \textbf{Morphological Segmentation} $\rightarrow$ A process that aims to break words into meaning-bearing morphemes. (Morphemes are the smallest meaningful units in a language.) How words are formed \& relationships of different parts of the words.
	\item \textbf{Parts of Speech (POS) tagging} $\rightarrow$ Finding parts of speech of a word. This is a key process in producing parse trees for sentences.
	\item \textbf{Parsing} $\rightarrow$ Making a parse tree for a sentence, based on parts of speech. Again, just like 330s compiler project.
	\item \textbf{Information Extraction} $\rightarrow$ Turning unstructured text into 'structured sequences', or usable information.
	\item \textbf{Named Identity Recognition} $\rightarrow$ Is when you identify key entities in text, like locations, times, people, dates, etc.
	\item \textbf{Sentiment Analysis} $\rightarrow$ Deciding if opinions are good or bad. Lots of ad and marketing companies use software like this. 
\end{itemize}

\subsection{Spoken Dialogue System (Example)}

Below is an example of a spoken dialogue system.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp8373981922874744] 
\draw   (40,11) -- (159.67,11) -- (159.67,291) -- (40,291) -- cycle ;
%Shape: Rectangle [id:dp39694369787199746] 
\draw   (159.33,11) -- (279,11) -- (279,291) -- (159.33,291) -- cycle ;
%Shape: Rectangle [id:dp32938700493363626] 
\draw   (279.33,11) -- (399,11) -- (399,291) -- (279.33,291) -- cycle ;
%Shape: Rectangle [id:dp26423645478485447] 
\draw   (399.33,11) -- (519,11) -- (519,291) -- (399.33,291) -- cycle ;
%Shape: Rectangle [id:dp4887672923949493] 
\draw   (519.33,11) -- (639,11) -- (639,291) -- (519.33,291) -- cycle ;
%Shape: Rectangle [id:dp8193604616119339] 
\draw   (57.67,18.33) -- (143,18.33) -- (143,73.33) -- (57.67,73.33) -- cycle ;
%Shape: Rectangle [id:dp14839864081748477] 
\draw   (166.33,19.67) -- (272.33,19.67) -- (272.33,90) -- (166.33,90) -- cycle ;
%Straight Lines [id:da09291938876112937] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (143.67,50.67) -- (163.01,52.48) ;
\draw [shift={(165,52.67)}, rotate = 185.36] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Rectangle [id:dp510085733603555] 
\draw   (297.67,18.33) -- (383,18.33) -- (383,67.33) -- (297.67,67.33) -- cycle ;
%Straight Lines [id:da9017689378229654] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (272.33,54) -- (295.69,50.31) ;
\draw [shift={(297.67,50)}, rotate = 531.03] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Rectangle [id:dp8303854262526095] 
\draw   (407,18.33) -- (513,18.33) -- (513,88.67) -- (407,88.67) -- cycle ;
%Straight Lines [id:da04064952779334319] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (384.33,45.33) -- (405.67,46.55) ;
\draw [shift={(407.67,46.67)}, rotate = 183.27] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Rectangle [id:dp6563202747605579] 
\draw   (533,38.67) -- (632.33,38.67) -- (632.33,131.33) -- (533,131.33) -- cycle ;
%Straight Lines [id:da008366805900147378] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (513.67,49.33) -- (530.84,64.67) ;
\draw [shift={(532.33,66)}, rotate = 221.76] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Rectangle [id:dp9583321312289558] 
\draw   (405.67,109.67) -- (511.67,109.67) -- (511.67,180) -- (405.67,180) -- cycle ;
%Shape: Rectangle [id:dp09633045144651142] 
\draw   (291.67,114.67) -- (382.33,114.67) -- (382.33,168) -- (291.67,168) -- cycle ;
%Shape: Rectangle [id:dp6166671040427456] 
\draw   (165.67,119.33) -- (268.33,119.33) -- (268.33,172.67) -- (165.67,172.67) -- cycle ;
%Shape: Rectangle [id:dp4918446352170527] 
\draw   (48.5,120.67) -- (151.17,120.67) -- (151.17,166.67) -- (48.5,166.67) -- cycle ;
%Straight Lines [id:da5143139305134313] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (532.33,112) -- (513.59,135.11) ;
\draw [shift={(512.33,136.67)}, rotate = 309.03999999999996] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da388562527552038] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (406.33,140.67) -- (384.33,141.28) ;
\draw [shift={(382.33,141.33)}, rotate = 358.40999999999997] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da18609348520822822] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (291.67,141.33) -- (269.67,141.94) ;
\draw [shift={(267.67,142)}, rotate = 358.40999999999997] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da31394847636073786] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (165,146.67) -- (151.66,147.25) ;
\draw [shift={(149.67,147.33)}, rotate = 357.51] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (70.67,26.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {Speech\\Analysis};
% Text Node
\draw (172,25) node [anchor=north west][inner sep=0.75pt]   [align=left] {Morphological\\+ Lexical\\Analysis};
% Text Node
\draw (313.33,38) node [anchor=north west][inner sep=0.75pt]   [align=left] {Parsing};
% Text Node
\draw (422.67,33.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {Contextual\\Reasoning};
% Text Node
\draw (540.67,56) node [anchor=north west][inner sep=0.75pt]   [align=left] {Application\\Reasoning\\+ \ Execution};
% Text Node
\draw (417.33,126.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Utterance\\Planning};
% Text Node
\draw (300,122) node [anchor=north west][inner sep=0.75pt]   [align=left] {Syntactic\\Realization};
% Text Node
\draw (169.33,128.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {Morphological\\Realization};
% Text Node
\draw (66,123) node [anchor=north west][inner sep=0.75pt]   [align=left] {Speech\\Synthesis};
% Text Node
\draw (60,270) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textbf{Phonology}};
% Text Node
\draw (174.67,270) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textbf{Morphology}};
% Text Node
\draw (314.67,270) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textbf{Syntax}};
% Text Node
\draw (420.67,270) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textbf{Semantics}};
% Text Node
\draw (540,270) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textbf{Reasoning}};


\end{tikzpicture}


}

\hfill \break Notice the \textbf{complexity} and \textbf{number of components} needed for a process like this, even though the system is seemingly simple.\\

The issues involved in translating human language to objective data are immense, and so our systems built to tackle these tasks are fairly involved.\\

\subsection{General Industry}

These days, people in industry are working on a bunch of NLP-related interesting problems.

\begin{itemize}
	\item Speech recognition
	\item Caption generation
	\item Natural Language Generation
	\item Optical Character Recognition
	\item Word Sense Disambiguation
\end{itemize}

\textbf{Text classification} is a common subset of problems in this field, for example. For example, we can take a look at an email spam classifier.\\

{

\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp5957351502696815] 
\draw   (241,69.67) -- (376.33,69.67) -- (376.33,173.33) -- (241,173.33) -- cycle ;
%Straight Lines [id:da13924454285877397] 
\draw    (176.33,124) -- (239,124) ;
\draw [shift={(241,124)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Flowchart: Document [id:dp6530782050633164] 
\draw   (45,84.83) -- (176.67,84.83) -- (176.67,149.46) .. controls (94.38,149.46) and (110.83,172.76) .. (45,157.68) -- cycle ;
%Shape: Cube [id:dp23320547092917177] 
\draw   (424,163.67) -- (445,142.67) -- (610.33,142.67) -- (610.33,191.67) -- (589.33,212.67) -- (424,212.67) -- cycle ; \draw   (610.33,142.67) -- (589.33,163.67) -- (424,163.67) ; \draw   (589.33,163.67) -- (589.33,212.67) ;
%Shape: Cube [id:dp670449158073195] 
\draw   (441.33,52.33) -- (462.33,31.33) -- (627.67,31.33) -- (627.67,80.33) -- (606.67,101.33) -- (441.33,101.33) -- cycle ; \draw   (627.67,31.33) -- (606.67,52.33) -- (441.33,52.33) ; \draw   (606.67,52.33) -- (606.67,101.33) ;
%Straight Lines [id:da33294820493668054] 
\draw    (376.33,124) -- (438.72,78.51) ;
\draw [shift={(440.33,77.33)}, rotate = 503.9] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da9969041421340407] 
\draw    (376.33,124) -- (432.52,149.83) ;
\draw [shift={(434.33,150.67)}, rotate = 204.69] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (55.33,97.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {Input: Document};
% Text Node
\draw (104.67,128) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 144; green, 19; blue, 254 }  ,opacity=1 ] [align=left] {$\displaystyle \omega $};
% Text Node
\draw (279.33,77.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {Classifier};
% Text Node
\draw (264,128) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle \textcolor[rgb]{0,0,0}{y} =[\textcolor[rgb]{0.49,0.83,0.13}{y_{1}} ,\ \textcolor[rgb]{0.25,0.46,0.02}{y_{2}}]$};
% Text Node
\draw (492.67,65) node [anchor=north west][inner sep=0.75pt]   [align=left] {Spam $\displaystyle (\textcolor[rgb]{0.49,0.83,0.13}{y_{1}})$};
% Text Node
\draw (458,177) node [anchor=north west][inner sep=0.75pt]   [align=left] {Not Spam $\displaystyle (\textcolor[rgb]{0.25,0.46,0.02}{y_{2}})$};


\end{tikzpicture}


}

\hfill \break In the diagram above, general ML naming schemes are also used. More on this in ML + NLP classes (and later lectures).\\

\begin{tcolorbox}[title=Aside: Hardcoded Rules,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]

It was, at a time, the only option to use hard-coded rules to accomplish the tasks that we allocate to modern NLP techniques. We have some pros and cons for this approach.
\begin{itemize}
	\item \textbf{Pros} $\rightarrow$ Plenty of domain-expertise and customizability, human readable (we can easily understand the motivation behind decision making)
	\item \textbf{Cons} $\rightarrow$ Brittle, expensive to maintain, and extremely hard to generalize. (Most customization is very domain-specific)
\end{itemize}

\end{tcolorbox}


\subsection{Supervised Learning}

Supervised learning is one such common method to escape the issues that we find in the hardcoded rules mentioned in the Aside above. Although we can get more generalized solution capability, we trade that for transparency. The algorithm that decides where an object should be classified essentially becomes a black box.

\begin{myproof}
Let's say we have the following as \textbf{input}:

\begin{itemize}
	\item A \textbf{document}, $\omega$
	\item A set of \textbf{classes}, $y=\{y_1,y_2,y_3\}$
	\item A 'training' set of \textbf{manually classified documents} (hand-labeled),\\ $k=\{(\omega _1, y_1),(\omega _2, y_3),(\omega _3, y_1)\}$
\end{itemize}

As output, our \textbf{supervised learning} setup would produce a \textbf{learned classifier} that was able to classify documents, $\omega \rightarrow y$ 
\end{myproof}

\hfill \break In order to figure out how to approach this problem, we need to think like a computer would need to. First and foremost, we need a way to keep track of what the document \textit{is}. We can do this by keeping track of \textbf{features} that we produce based on what we observe about the document.\\

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Feature} \textit{(machine learning)} $\rightarrow$ A feature is an individual measurable property or characteristic of a phenomenon being observed. \textit{(Wikipedia)}
\end{tcolorbox}


\hfill \break Although we may be able to extract a lot of meaning from a clump of words, a computer cannot readily do the same. As such, we will first try and represent this \textbf{document} as \textbf{math}, loosely speaking. How can we do that?\\


\begin{myproof}
\textbf{Idea: Bag of Words}\\
One idea is to represent each document as a \textbf{vector} of \textbf{word frequencies}. A good visual example of something like this is a word cloud, but could also be as simple as a bar graph with frequencies.\\

In this case, \textit{term frequency} represents the number of times that a term appears in a document.\\

Although this isn't our whole solution, it's a part of it. We can use this as a \textbf{feature} with which we can train our classifier, $\omega \rightarrow y$.
\end{myproof}

\hfill \break Similarly to how we identified just one feature above, we can deduce many other features from our documents. Then, we can assign \textbf{weights} to these features to tell our algorithms just how important each feature when our classifier renders its final decision.\\

However, not all is as simple as it seems- there's a clear \textbf{issue} with the \textbf{term frequency} we discussed.\\

When we approach this problem realistically, we can see that term frequency gets \textbf{overloaded} with \textbf{common words}, like \textit{a}, \textit{to}, \textit{I}, and \textit{the}. How can we intelligently address something like this?

\begin{myproof}
\textbf{Solution: Inverse Document Frequency}\\

\textbf{Idea:} In order to address the commonality issue, we weight individual words negatively by how frequently they appear in the corpus.\\

$$IDF_j= log(\frac{d_{total}}{d_j})$$

In the above example, $d_{total}$ represents the total number of documents and $d_j$ represents the number of documents that contain word $j$. Note that $IDF$ is defined across all documents, not a specific word-document pair.
\end{myproof}

\hfill \break In practice, we end up combining term frequency and inverse document frequency to get a more functional result.

\subsection{NLP In Python}

In order to work with NLP in Python, there are two major libraries we can make use of- \textbf{\texttt{NLTK}} and \textbf{\texttt{SPACY}}.\\

\begin{itemize}
	\item \textbf{\texttt{NLTK}} $\rightarrow$ Has \textbf{more "stuff" implemented} and is more customizable. Started as a research tool, now has found lots of use in industry as well.
	\item \textbf{\texttt{SPACY}} $\rightarrow$ Younger and more sparse than \texttt{NLTK}, but can be much \textbf{faster than \texttt{NLTK}}. Much newer, more streamlined implementation.
\end{itemize}

\subsection{Distributional Models of Meaning}

Distributional models of meaning revolve around the idea that you know a word by the company that it keeps. \\

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Distributional models of meaning} $\rightarrow$ quantitative; express the semantic relation between terms but offering no immediately obvious way of modelling the contribution of sentence structure to meaning
\end{tcolorbox}

The idea is that two words are \textbf{similar} if they have \textbf{similar word contexts}.

\subsection{Document Matrices}

A \textbf{document matrix} is a way to quickly compare features of documents.

Here's an example that records 'term frequency'- how often each of these given words appear in the selected texts. We can determine two main things from the following document matrix.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp2478510784432143] 
\draw   (95.33,62.67) -- (543.67,62.67) -- (543.67,222.67) -- (95.33,222.67) -- cycle ;
%Shape: Rectangle [id:dp04260382568597276] 
\draw   (95.33,22.67) -- (185,22.67) -- (185,62.67) -- (95.33,62.67) -- cycle ;
%Shape: Rectangle [id:dp22922453318876046] 
\draw   (185,22.67) -- (274.67,22.67) -- (274.67,62.67) -- (185,62.67) -- cycle ;
%Shape: Rectangle [id:dp42730925833582556] 
\draw   (274.67,22.67) -- (364.33,22.67) -- (364.33,62.67) -- (274.67,62.67) -- cycle ;
%Shape: Rectangle [id:dp23538913728045674] 
\draw   (364.33,22.67) -- (454,22.67) -- (454,62.67) -- (364.33,62.67) -- cycle ;
%Shape: Rectangle [id:dp9654673895790655] 
\draw   (454,22.67) -- (543.67,22.67) -- (543.67,62.67) -- (454,62.67) -- cycle ;
%Shape: Rectangle [id:dp7801296194846411] 
\draw   (95.33,62.67) -- (185,62.67) -- (185,102.67) -- (95.33,102.67) -- cycle ;
%Shape: Rectangle [id:dp12054212062700542] 
\draw   (95.33,102.67) -- (185,102.67) -- (185,142.67) -- (95.33,142.67) -- cycle ;
%Shape: Rectangle [id:dp31221018949565704] 
\draw   (95.33,142.67) -- (185,142.67) -- (185,182.67) -- (95.33,182.67) -- cycle ;
%Shape: Rectangle [id:dp8440884365925955] 
\draw   (95.33,182.67) -- (185,182.67) -- (185,222.67) -- (95.33,222.67) -- cycle ;
%Straight Lines [id:da49294477728710373] 
\draw    (274.67,62.67) -- (274.67,223.33) ;
%Straight Lines [id:da7884746242568562] 
\draw    (364.33,62.67) -- (364.33,223.33) ;
%Straight Lines [id:da24798511925788602] 
\draw    (454,62.67) -- (454,223.33) ;
%Shape: Ellipse [id:dp7521885992840447] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (463.83,148) .. controls (463.83,97.56) and (479.5,56.67) .. (498.83,56.67) .. controls (518.16,56.67) and (533.83,97.56) .. (533.83,148) .. controls (533.83,198.44) and (518.16,239.33) .. (498.83,239.33) .. controls (479.5,239.33) and (463.83,198.44) .. (463.83,148) -- cycle ;
%Shape: Ellipse [id:dp18898786339216123] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (373.83,150.67) .. controls (373.83,100.22) and (389.5,59.33) .. (408.83,59.33) .. controls (428.16,59.33) and (443.83,100.22) .. (443.83,150.67) .. controls (443.83,201.11) and (428.16,242) .. (408.83,242) .. controls (389.5,242) and (373.83,201.11) .. (373.83,150.67) -- cycle ;
%Straight Lines [id:da43167078277561544] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (456.33,261.33) -- (410.69,242.75) ;
\draw [shift={(408.83,242)}, rotate = 382.15] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da1607094671672762] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (456.33,261.33) -- (497.06,240.25) ;
\draw [shift={(498.83,239.33)}, rotate = 512.63] [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Ellipse [id:dp8182016509988886] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (177.33,206.67) .. controls (177.33,195.99) and (259.34,187.33) .. (360.5,187.33) .. controls (461.66,187.33) and (543.67,195.99) .. (543.67,206.67) .. controls (543.67,217.34) and (461.66,226) .. (360.5,226) .. controls (259.34,226) and (177.33,217.34) .. (177.33,206.67) -- cycle ;
%Shape: Ellipse [id:dp9230130966168822] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (179.33,162.67) .. controls (179.33,151.99) and (261.34,143.33) .. (362.5,143.33) .. controls (463.66,143.33) and (545.67,151.99) .. (545.67,162.67) .. controls (545.67,173.34) and (463.66,182) .. (362.5,182) .. controls (261.34,182) and (179.33,173.34) .. (179.33,162.67) -- cycle ;
%Straight Lines [id:da42515748550090093] 
\draw [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ]   (206.33,255.33) -- (265.24,223.61) ;
\draw [shift={(267,222.67)}, rotate = 511.7] [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da06447257921970884] 
\draw [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ]   (206.33,255.33) -- (286.82,185.31) ;
\draw [shift={(288.33,184)}, rotate = 498.98] [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (205.33,24) node [anchor=north west][inner sep=0.75pt]   [align=left] {As You\\Like it};
% Text Node
\draw (304,23.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {12th\\Night};
% Text Node
\draw (384,23.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Julius\\Caesar};
% Text Node
\draw (478.67,23.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Henry\\V};
% Text Node
\draw (119.33,74.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Battle};
% Text Node
\draw (123.83,154.17) node [anchor=north west][inner sep=0.75pt]   [align=left] {Fool};
% Text Node
\draw (117.33,194.17) node [anchor=north west][inner sep=0.75pt]   [align=left] {Clown};
% Text Node
\draw (225.33,73.83) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 1}};
% Text Node
\draw (225.33,113.17) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 2}};
% Text Node
\draw (221.83,153.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 37}};
% Text Node
\draw (225.33,193.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 6}};
% Text Node
\draw (314,73.83) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 1}};
% Text Node
\draw (314,113.17) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 2}};
% Text Node
\draw (310.5,153.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 58}};
% Text Node
\draw (306.5,193.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 117}};
% Text Node
\draw (403.5,73.83) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 8}};
% Text Node
\draw (400,113.17) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 12}};
% Text Node
\draw (403.5,153.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 1}};
% Text Node
\draw (403.5,193.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 0}};
% Text Node
\draw (489.83,73.83) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 15}};
% Text Node
\draw (489.83,113.17) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 36}};
% Text Node
\draw (493.33,153.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 5}};
% Text Node
\draw (493.33,193.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 0}};
% Text Node
\draw (114.83,113.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {Soldier};
% Text Node
\draw (388.17,263.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Two \textcolor[rgb]{0.82,0.01,0.11}{documents} are similar if \\their vectors are similar.};
% Text Node
\draw (149.33,256.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {Two \textcolor[rgb]{0.29,0.56,0.89}{words} are similar if their\\vectors are similar.};


\end{tikzpicture}


}


\subsection{Measuring Similarity}

We've seen a visual way to represent document matrices, but how do we actually, \textbf{quantitatively} tell if vectors are similar?

\subsubsection{Using The Dot Product}

One such method is to use the \textbf{vector dot product}. Given vectors $v$ and $w$, recall that we can calculate dot product by multiplying their components as follows.

$$\sum_{i=1}^{n}{v_i \cdot w_i}$$\\

We can use this to develop a general idea of when 2 vectors have \textbf{large values} in the \textbf{same dimensions}. The dot product will also be lower for two vectors with different dimensional structures.\\

It will be zero for completely orthogonal vectors (representing wholly different objects), i.e. orthogonal vectors represent objects that cannot be any more different.\\

However, the dot product is affected by term frequency as a whole. Higher word frequencies overall lead to a larger dot product overall. This is \textbf{bad}- we don't want a similarity metric to be sensitive to word frequency in this way. For this reason, we need some way to \textbf{normalize} the dot product.\\

\textbf{Solution}: normalize the dot product by dividing by the lengths of both vectors.

$$\frac{\vec{a} \cdot \vec{b}}{|\vec{a}||\vec{b}|}$$\\

Now, take advantage of a property of the dot product to help us define this process.

$$\frac{\vec{a} \cdot \vec{b}}{|\vec{a}||\vec{b}|} = cos \theta$$\\

Thus, we can say that the \textbf{cosine of the angle between the vectors} provides us with a \textbf{normalized} measure of 'similarity' between both vectors. This process is formalized below.

\subsubsection{Normalizing Dot Product: Cosine Similarity}

Given two documents $X$ and $Y$, represented here as vectors in matrix form, the \textbf{cosine similarity} of the two can be represented by the following equation:\\

$$similarity_{cos}(X,Y)=\frac{X^TY}{|X||Y|}$$\\

Technically, we're measuring the \textbf{cosine of the angle between vectors $X$ and $Y$}. This is how we determine similarity while ignoring term frequency.\\

\begin{itemize}
	\item \textbf{Similar} documents have \textbf{high} cosine similarity.
	\item \textbf{Dissimilar} documents have \textbf{low} cosine similarity.
\end{itemize}

\subsubsection{Example: Calculating Cosine Similarity}

Given the following document matrix, here are a few examples of calculated cosine similarity. Assume the rows represent \textbf{documents} and the columns represent \textbf{words}.\\

\begin{center}
\begin{tabular}{ |c|c c c| } 
 \hline
  & \textbf{large} & \textbf{data} & \textbf{computer} \\
  \hline 
 \textbf{apricot} & 2 & 0 & 0 \\ 
 \textbf{digital} & 0 & 1 & 2 \\ 
 \textbf{information} & 1 & 6 & 1 \\ 
 \hline
\end{tabular}
\end{center}

Also, note the formula for calculating cosine similarity given vectors $v$ and $w$ amounts to the following:

$$similarity_{cos}(v,w)=\frac{\vec{v} \cdot \vec{w}}{|v||w|}=\frac{\sum_{i=1}^{n}{(v_i \cdot w_i)}}{\sqrt{\sum_{1}^{n}{v_i^2}} \sqrt{\sum_{1}^{n}{w_i^2}}}$$\\

\begin{myproof}
\textbf{Examples:}

$$similarity_{cos}(\textrm{\textbf{apricot}},\textrm{\textbf{information}})=\frac{2+0+0}{\sqrt{2} \sqrt{38}}=0.23$$

$$similarity_{cos}(\textrm{\textbf{digital}},\textrm{\textbf{information}})=\frac{0+6+2}{\sqrt{38} \sqrt{5}}=0.58$$

$$similarity_{cos}(\textrm{\textbf{apricot}},\textrm{\textbf{digital}})=\frac{0+0+0}{\sqrt{1} \sqrt{5}}=0.00$$
\end{myproof}

\subsubsection{Minimum Edit Distance}

A metric used to calculate how similar 2 strings are to one another is \textbf{minimum edit distance}.\\

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Minimum Edit Distance} $\rightarrow$ The minimum edit distance between two strings is the number of edit operations to turn one string into another.
\end{tcolorbox}

\hfill \break This seemingly simple tool is used widely in NLP, namely for \textbf{spelling correction}, \textbf{information extraction}, \textbf{speech recognition}, and plenty of other things.\\

\begin{tcolorbox}[title=Aside: Information Extraction Systems,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]

Miscellaneous further info on information extraction systems. Here are some general goals for an information extraction system:
\begin{itemize}
	\item To extract \textbf{clear}, \textbf{factual} information
	\item To find and understand texts
	\item To gather information from \textbf{many} pieces of text
	\item To provide a structured representation of relevant information (e.g. in \textbf{relations} or \textbf{data structures})
	\item To organize information to make it useful to humans
\end{itemize}

\end{tcolorbox}

\subsection{Representing Text with N-Grams}

\textbf{Idea:} We can represent text as contiguous sequences (arrays) of $N$ tokens/words.

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{N-Gram} $\rightarrow$ An n-gram is a contiguous sequence of N tokens/words. In layman's terms, it's just a group of words. Latin prefixes can denote how big they are, i.e. unigram is one word, bigram is two words, trigram is 3 words, etc.
\end{tcolorbox}

\subsubsection{Probabilities}

We can use probabilities as the basis of solving a lot of problems in a language. Examples:

\begin{itemize}
	\item \textbf{Machine Translation} $\rightarrow$ P(High winds tonight) > P(Large winds tonight)
	\item \textbf{Correctness Check/Grammar} $\rightarrow$ P(15 Minutes) > P(15 Minuets)
	\item \textbf{Speech Recognition} $\rightarrow$ P(I saw a van) > P(Ice of a ban)
\end{itemize}

In that regard, our goal should be to be able to calculate the probability of a sentence of words. (Generalized version of the above examples)

$$P(w)=P(w_1,w_2,w_3 ... w_n)$$\\

We can use \textbf{n-grams} for this purpose. An \textbf{n-gram model} is a type of probabilistic model for predicting the next item in such a sequence in the form of a (n-1) order Markov chain.\\

\textbf{N-gram models} are popular due to their \textbf{simplicity} and \textbf{scalability}. The given structure allows small experiments to scale up efficiently.

\section{Machine Learning (L16 + 19)}

Supervised machine learning gives us a way to automatically \textbf{infer} a predictive model, given some amount of pre-labeled data.

\subsection{Overview}

This diagram provides a broad overview of the supervised learning process.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Straight Lines [id:da6934013484644858] 
\draw    (59,40.33) -- (190.33,40.33) ;
%Straight Lines [id:da22235545874167506] 
\draw    (255,40.33) -- (386.33,40.33) ;
%Straight Lines [id:da7620370986062277] 
\draw    (447.67,40.33) -- (579,40.33) ;
%Straight Lines [id:da5092396514647811] 
\draw    (315.67,153.33) -- (331.83,90.6) ;
\draw [shift={(332.33,88.67)}, rotate = 464.45] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Rectangle [id:dp7402750968279076] 
\draw   (48.67,5.33) -- (205,5.33) -- (205,296.67) -- (48.67,296.67) -- cycle ;
%Shape: Rectangle [id:dp05887269365061776] 
\draw   (242,5.33) -- (398.33,5.33) -- (398.33,296.67) -- (242,296.67) -- cycle ;
%Shape: Rectangle [id:dp2515662723024984] 
\draw   (433.33,6.67) -- (589.67,6.67) -- (589.67,298) -- (433.33,298) -- cycle ;
%Straight Lines [id:da11949475196221093] 
\draw    (205,148.33) -- (239.67,148.33) ;
\draw [shift={(241.67,148.33)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da6197911001911289] 
\draw    (397.67,148.33) -- (430.33,148.33) ;
\draw [shift={(432.33,148.33)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (82,19.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Training Data};
% Text Node
\draw (276.67,19.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {ML Algorithm};
% Text Node
\draw (474.67,19.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Predictions};
% Text Node
\draw (87.33,49.73) node [anchor=north west][inner sep=0.75pt]    {$\left( x^{1} ,y^{1}\right)$};
% Text Node
\draw (68,81.73) node [anchor=north west][inner sep=0.75pt]    {$\left( x^{2} ,y^{2}\right)$};
% Text Node
\draw (85.33,137.73) node [anchor=north west][inner sep=0.75pt]    {$\left( x^{4} ,y^{4}\right)$};
% Text Node
\draw (112.67,105.73) node [anchor=north west][inner sep=0.75pt]    {$\left( x^{3} ,y^{3}\right)$};
% Text Node
\draw (116,169.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {.\\.\\.};
% Text Node
\draw (88,239.07) node [anchor=north west][inner sep=0.75pt]    {$\left( x^{n} ,y^{n}\right)$};
% Text Node
\draw (278.67,157.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {Hypothesis\\function\\formed\\based on \\the given\\training data};
% Text Node
\draw (284,60) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle y^{i}$$\displaystyle =h\left( x^{i}\right)$};
% Text Node
\draw (451.33,50.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {Given a new input};
% Text Node
\draw (497.33,73.07) node [anchor=north west][inner sep=0.75pt]    {$x_{new}$};
% Text Node
\draw (450,107.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {The algorithm will\\predict what it\\ends up as};
% Text Node
\draw (462,179.07) node [anchor=north west][inner sep=0.75pt]    {$y_{new} =h( x_{new})$};

$$


\end{tikzpicture}


}

\hfill \break Labeled data (usually done manually) is provided to the machine learning algorithm, the algorithm runs and selects the best hypothesis function from the given hypothesis space, and then is able to render its own predictions on unlabeled data.

\subsection{Hypothesis}

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Hypothesis} $\rightarrow$ A function that describes \textit{a} relation between the input and the target in supervised learning.
\end{tcolorbox}

\hfill \break The hypothesis can be affected by \textbf{data}, \textbf{restrictions}, \textbf{bias}, and other factors. These are the nuances of the ML algorithm.\\

The hypothesis itself must be selected from a given \textbf{hypothesis space}. Think of it as a starting point for the algorithm.\\

Providing the algorithm with a hypothesis space and asking it to design a hypothesis is like saying, "Here's the domain of all possible functions you can specify your hypothesis of choice from, now decide which one is the most optimal".\\

\subsubsection{Hypothesis Function}

At the end of the day, our algorithm is just trying to figure out the best hypothesis function. The end goal is the following: a function that can (supposedly) successfully classify all given inputs.

$$ h_{\theta}:R^n \rightarrow y $$
\begin{center}
\textit{(Takes all possible inputs and classifies them)}
\end{center} 

Our hypothesis function depends on domain knowledge. For example, a hypothesis function could be a line on a Naviance college acceptances graph that determines where acceptances and rejections will lie given an x coordinate of SAT scores and a y coordinate of GPA.

\subsubsection{Hypothesis Space}

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Hypothesis Space} $\rightarrow$ Set of all possible legal hypotheses. The set from which the ML algorithm will decide the best.
\end{tcolorbox}

\hfill \break Overall, the \textbf{hypothesis space} is simply the set of \textbf{all possible hypotheses}, and a \textbf{hypothesis} is a single potential solution. (The algorithm then chooses the \textbf{best} hypothesis.)

\subsection{The Loss Function}

The loss function (defined below) is a component of a machine learning algorithm that measures the difference between a prediction and the true output (what it's supposed to be).

$$ l:y \times y \rightarrow R_+$$

There are \textbf{two types} of \textbf{loss functions}.
\begin{itemize}
	\item \textbf{Classification Loss} $\rightarrow$ Classification functions predict a \textbf{label}. Examples of this type of loss function would be logarithmic loss, focal loss, KL divergence, relative entropy, exponential loss, hinge loss.
	\item \textbf{Regression Loss} $\rightarrow$ Regression functions predict a quantity. Examples of this type of loss function would be Mean square error (quadratic loss), Mean absolute error, Huber loss, Smooth mean absolute error, log cosh loss, quantile loss.
\end{itemize}

There is no single loss function that works on all sets of data. Your choice of loss function depends on factors like the following:
\begin{itemize}
	\item Outliers
	\item Choice of ML Algorithm
	\item Time Efficiency of Gradient Descent
	\item Confidence of Predictions
\end{itemize}

It looks like, at the end of the day, you should just pick the right loss function for your situation.

\subsection{The Canonical Machine Learning Problem}

At the end of the day, we want to \textbf{learn} a hypothesis function that \textbf{accurately} predicts outputs given inputs.\\

As such, we can say that our \textbf{goal} is to \textbf{minimize loss} by choosing the parameterization that minimizes loss over all possible parameters.\\

Here's where the hypothesis function and the loss function fit into the big picture:

$$ \underbrace{\sum_{i=1}^{m}{l((h_{\theta}(x^i)), y^i)}}_{\mathrm{minimize}}$$

We want to \textbf{find a hypothesis} function that \textbf{minimizes the loss} across all the training data inputs that we have. In this case, $m$ is our training data.\\

Minimizing this is not as easy as it seems- it's a high order differential equation, and we need an efficient way to optimize it. Different optimization methods work differently for different machine learning algorithms.

\subsection{Overarching Idea}

Big Idea: \textbf{Take in a hypothesis space, loss function and an optimizer, then decide a hypothesis function given a hypothesis space.}\\

Our \textbf{hypothesis function} will be domain-specific. It could be anything from drawing a line on a graph to separate pass from failure, or designing a function to classify different types of fruits.\\

Our \textbf{loss function} can be chosen based on our problem type, as we explored above. Again, this is case-specific.\\

Our \textbf{optimization method} is usually gradient descent or stochastic gradient descent- this is basically how you easily find minima of a high order differential equation.

\subsection{Examples of ML Algorithms}

\begin{center}
\begin{tabular}{ |c|c|c|c| } 
 \hline
 \textbf{Name} & \textbf{Hypothesis Function} & \textbf{Loss Function} & \textbf{Optimization Approach} \\ 
 \hline
 Least Squares & Linear & Squared & Algo / Gradient Descent \\
 \hline 
 Linear Regression & Linear & Squared & Algo / Gradient Descent \\
 \hline
 Support Vector Machine & Linear/Kernel & Hinge & Algo / Gradient Descent \\
 \hline
 Perceptron & Linear & Perceptron Criterion & Perceptron Algorithm \\
 \hline
 Neural Networks & Compound Nonlinear & Squared, Hinge & Stochastic Gradient Descent \\
 \hline
 Decision Trees & Hierarchical Half-planes & Many & Greedy \\
 \hline
 Naive Bayes & Linear & Joint Probability & \#SAT \\
 \hline
\end{tabular}
\end{center}

We can see that ML Algorithms are just the sum of their components. Its not so much our job to discover these methods, but instead to decide where to apply which method, using scientific insight.

\subsection{Example: Linear Regression As Machine Learning}

Consider linear regression that minimizes the sum of squared error, i.e. makes use of least squares. (First row in example table above).\\

In technical terms:\\

\begin{myproof}
\begin{itemize}
	\item \textbf{Hypothesis Function} $\rightarrow$ Linear ($h_{\theta}(x)=\theta^T x$)
	\item \textbf{Loss Function} $\rightarrow$ Squared error loss ($l(\hat{y},y)=\frac{1}{2}(\hat{y}-y)^2$)
	\item \textbf{Optimization} $\rightarrow$ Minimize $\sum_{i=1}^{m}(\theta^T x^i-y^i)^2$
\end{itemize}
\end{myproof}

Now, our task is to minimize the function we wish to optimize. Rewrite inputs in technical form:\\

$$X=\begin{pmatrix}
  (x_1)^T\\ 
  (x_2)^T\\
  .\\
  .\\
  .\\
  (x_m)^T\\
\end{pmatrix} \in R^{m \times n}$$

\begin{center}
\textit{Feature Vectors a.k.a. Training Data Inputs}
\end{center}

$$y=\begin{pmatrix}
  y_1\\ 
  y_2\\
  .\\
  .\\
  .\\
  y\\
\end{pmatrix} \in R^{m}$$

\begin{center}
\textit{Labels a.k.a. Training Data Outputs}\\
\end{center}

\hfill \break Now we'll review the gradient, then get into how we're applying gradient descent. Recall the following from multivariate calculus:
\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Gradient} $\rightarrow$ The multivariate generalization of the derivative\\

For a function $f:\mathbb{R}^n \rightarrow \mathbb{R}$, the gradient is a vector of all $n$ partial derivatives of that function:

$$ \nabla_{\theta} f(\theta) = \begin{pmatrix}
\frac{\partial f(\theta)}{\partial f(\theta)_1}\\
.\\
.\\
.\\
\frac{\partial f(\theta)}{\partial f(\theta)_n}\\
\end{pmatrix} \in \mathbb{R}^n$$

\end{tcolorbox}

As such, minimizing a multivariate function involves finding a point where the gradient is zero.

$$ \nabla_{\theta} f(\theta)=\begin{pmatrix}
0\\
0\\
.\\
.\\
.\\
0
\end{pmatrix}$$\\

That is, we find when the gradient is a zero vector of order $n$. However, note that points where the gradient is zero are mainly local minima. Only in convex functions can we be sure that this point is the global minimum.\\

Given our optimization problem can written as the following, we can look to find the gradient from there.                                                     

$$\mathrm{minimize}_{\theta} \frac{1}{2}||X \theta - y||_2^2$$\\


By applying a few rules, we can arrive at the following gradient.\\

$$X^T(X \theta -y)$$\\

Now, solve for the model parameters $\theta$, because we already have defined $X$ and $y$ above.

$$X^T(X \theta -y)=0$$
$$X^TX \theta - X^T y =0$$
$$X^TX \theta =X^T y$$
$$ \underbrace{ (X^T X)^{-1} \cdot X^TX}_{cancels}  \theta = (X^T X)^{-1} \cdot X^T y$$
$$\theta = (X^T X)^{-1} \cdot X^T y$$\\

Thus, we have found a systematic way to find the optimal model parameters for linear regression, given the training data.

\subsection{Machine Learning in Python}

Python provides us with lots of options in terms of machine learning libraries.\\

\textbf{\texttt{Scikit-learn}} (\texttt{sklearn}) is the most well-known library. It provides us with a lot of functionality, namely:

\begin{itemize}
	\item Classification (SVN, K-NN, Random Forest)
	\item Regression (SVR, Ridge, Lasso)
	\item Clustering (K-means, spectral, mean-shift)
	\item Dimensionality Reduction (PCA, matrix factorization)
	\item Model selection (Grid search)
	\item Preprocessing (Cleaning, EDA)
\end{itemize}

\texttt{sklearn} also works well with \texttt{numpy} and \texttt{matplotlib}.

\subsection{(Stochastic) Gradient Descent}

First, let's establish this definition.\\

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Stochastic} $\rightarrow$ Randomly determined; having a random probability distribution or pattern that may be analyzed statistically, but may not be predicted precisely.
\end{tcolorbox}

\hfill \break Recall that the \textbf{gradient} always points in the \textit{local} direction of steepest ascent.\\

As such, we need to get to the 'lowest' point on our multivariate $n$-dimensional plane by taking minute steps \textit{against} the gradient.

\begin{center}
	\includegraphics[scale=0.35]{img/gradient.png} 
\end{center}

For example, in the above graph, the gradient points us towards higher points on the 3-dimensional plane. Our objective is to step against the gradient and find the minimum point.\\

\textit{Notice that you won't always end up at the global minimum (the blue)- it depends on where you start. If you start to the left of the red portion, you're fine, but if you start to the right of the red portion, you're in trouble. This is the nuance of gradient descent.}\\

\subsubsection{Procedure}

\textbf{Given} the following initial elements:

\begin{myproof}

\begin{itemize}
	\item a hypothesis function: $h_{\theta}:R^n \rightarrow y$
	\item a loss function: $l:y \times y \rightarrow R_+$
	\item a step size: $\gamma$
	\item an initial parameter vector, $\theta$ of all zeroes
\end{itemize}
\end{myproof}

We have all we need to perform gradient descent. From there, repeat the following:

\begin{myproof}
\begin{itemize}
	\item Compute gradient: $$ g \leftarrow \sum_{i=1}^{m}{\nabla_{\theta} l(h_{\theta}(x^i), y^i)}$$
	(We want to see how our multidimensional optimization function is behaving- let's change up our parameters and make sure we're stepping in the right direction)
	\item Update parameters: $$ \theta \leftarrow \theta - \gamma \cdot g$$
\end{itemize}
\end{myproof}

Step size is also an important parameter to consider. If it's too big, you can end up oscillating around the minimum. If it's too small, you can take a really long time to converge on the minimum.

\subsubsection{Batch vs. Stochastic Gradient Descent}

\begin{itemize}
	\item \textbf{Batch Gradient Descent} calculates gradient descent using the whole dataset. It's good for convex or relatively smooth error manifolds. That is, cases where you're pretty clearly moving towards an optimal solution or goal.
	\item \textbf{Stochastic Gradient Descent} computes using one sample. It works well for examples with lots of local minima and maxima. This seems to be a more realistic case.
\end{itemize}


\section{Decision Trees (L20)}

Let's look at the big picture of learning first, then move on to decision trees specifically.

\subsection{Big Picture of Learning}

Learning can be seen as \textbf{fitting a function} to the data. We can consider \textbf{different target functions} (which basically means that we can consider different hypothesis spaces).\\

A learning problem is \textbf{realizable} if its hypothesis space contains the \textbf{true function}. (The most accurate function)\\

There is always a \textbf{tradeoff} between the \textbf{expressiveness of a hypothesis space} and the \textbf{complexity of finding simple, consistent hypotheses} within the space.\\

Examples of different target functions:
\begin{itemize}
	\item Propositional if-then rules
	\item Decision Trees
	\item Linear functions
	\item Neural networks
\end{itemize}

\subsection{Decision Trees Intro}

We'll be analyzing \textbf{Decision Trees}. Specifically, decision trees for boolean classification, where each example is classified as positive or negative.\\

First, what is a decision tree?

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Decision Tree} $\rightarrow$ A function that makes consecutive, sequential decisions based on the nature of that input, and produces a final result.
\begin{itemize}
	\item \textbf{Input:} An object or situation described by a set of attributes (features)
	\item \textbf{Output:} A "decision" that predicts the corresponding output value.
\end{itemize} 
Structured sort of like a flowchart or a how-to guide, the decision tree is very human-readable.\\\\

It is composed of two types of components: \textbf{Decision Nodes} and \textbf{Leaf Nodes}.\\\\

\textbf{Decision Node} $\rightarrow$ Specifies a choice or test of some attribute with 2 or more alternatives. Every decision node is a part of a path to a leaf node.\\

\textbf{Leaf Node} $\rightarrow$ Indicates classification of an example- function terminates.
\end{tcolorbox}

\hfill \break A decision tree is a tree-like graph with nodes representing the place where we pick an attribute and ask a question; edges represent the answers the to the question; and the leaves represent the actual output or class label. They are used in non-linear decision making with simple linear decision surface. (\textit{hackerearth})

\subsubsection{Example: Election Decision Tree}

The following is an example of a decision tree. Note that the order of the decisions taken matters, and the results are all based in likelihood, not certainty.\\

\begin{center}
	\includegraphics[scale=0.4]{img/dtree.png}\\
\end{center}

\begin{myproof}
\textbf{Given}:\\
A collection of examples $(x, f(x))$\\\\

\textbf{Return}:\\
A function $h$ (\textit{hypothesis}) that approximates $f$, where $h$ is a \textbf{decision tree}.
\end{myproof}


\subsection{Decision Tree Learning Algorithm (ID3)}

Decision trees can express \textbf{any Boolean function}.\\

Our goal is to find a decision tree that '\textbf{agrees}' with our training data set. If it agrees with the training data, it should theoretically produce correct output for the actual inputs.\\

One idea is to \textbf{construct a specific path for each example}; this would guarantee us accuracy with the training data. Unfortunately, this just won't work. This approach would just memorize the example, and \textbf{doesn't generalize} at all.\\

Therefore, we want the smallest tree, to minimize chances of this sort of 'memorization' from happening. One would think there's some systematic way to minimize this, but that problem ends up being \textbf{NP-hard}.\\

Therefore, our \textbf{overall goal} should be to get the best classification with the smallest number of tests- minimize decision nodes while maximizing accuracy.\\

\subsubsection{Badminton Example: Hackerearth}

Let's assume we had the following data as a table:

\begin{center}
	\includegraphics[scale=0.8]{img/table.png} 
\end{center}

\hfill \break It indicates our human observations as to whether or not we'd decide to play badminton on a given day based on environmental factors, for the last 10 days.\\

We may use this table to help us decide whether we want to play or not in the future, but let's say that on day $n$, we find that the environmental factors are different than any of the previously recorded days.\\

We can try and form a \textbf{decision tree} based on this data that will help us make our decision. Below is an example decision tree for this data.

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp7287379245887146] 
\draw   (310,31) -- (380,31) -- (380,71) -- (310,71) -- cycle ;
%Shape: Rectangle [id:dp8626558304342224] 
\draw   (206.67,119) -- (276.67,119) -- (276.67,159) -- (206.67,159) -- cycle ;
%Shape: Rectangle [id:dp23388971934608005] 
\draw   (415,119) -- (485,119) -- (485,159) -- (415,159) -- cycle ;
%Straight Lines [id:da19359411263540038] 
\draw    (302.33,51) -- (243.14,105.31) ;
\draw [shift={(241.67,106.67)}, rotate = 317.46000000000004] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da8341409973532903] 
\draw    (389,51) -- (448.55,107.29) ;
\draw [shift={(450,108.67)}, rotate = 223.39] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Rounded Diagonal Corner Rect [id:dp7974115492268236] 
\draw   (320.33,130.8) .. controls (320.33,127.78) and (322.78,125.33) .. (325.8,125.33) -- (376,125.33) .. controls (376,125.33) and (376,125.33) .. (376,125.33) -- (376,147.2) .. controls (376,150.22) and (373.55,152.67) .. (370.53,152.67) -- (320.33,152.67) .. controls (320.33,152.67) and (320.33,152.67) .. (320.33,152.67) -- cycle ;
%Straight Lines [id:da4367305738199382] 
\draw    (347.67,96.67) -- (347.67,113.33) ;
\draw [shift={(347.67,115.33)}, rotate = 270] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da46243154172426126] 
\draw    (241.67,161.33) -- (201.11,222.33) ;
\draw [shift={(200,224)}, rotate = 303.62] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da1481738222577752] 
\draw    (241.67,161.33) -- (285.79,219.08) ;
\draw [shift={(287,220.67)}, rotate = 232.62] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da6081981084391918] 
\draw    (449.67,161.33) -- (409.11,222.33) ;
\draw [shift={(408,224)}, rotate = 303.62] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da8710681703922716] 
\draw    (449.67,161.33) -- (493.79,219.08) ;
\draw [shift={(495,220.67)}, rotate = 232.62] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Rounded Diagonal Corner Rect [id:dp1289608674372874] 
\draw   (169.67,232.8) .. controls (169.67,229.78) and (172.11,227.33) .. (175.13,227.33) -- (225.33,227.33) .. controls (225.33,227.33) and (225.33,227.33) .. (225.33,227.33) -- (225.33,249.2) .. controls (225.33,252.22) and (222.89,254.67) .. (219.87,254.67) -- (169.67,254.67) .. controls (169.67,254.67) and (169.67,254.67) .. (169.67,254.67) -- cycle ;
%Rounded Diagonal Corner Rect [id:dp0009906265808617976] 
\draw   (259.67,232.8) .. controls (259.67,229.78) and (262.11,227.33) .. (265.13,227.33) -- (315.33,227.33) .. controls (315.33,227.33) and (315.33,227.33) .. (315.33,227.33) -- (315.33,249.2) .. controls (315.33,252.22) and (312.89,254.67) .. (309.87,254.67) -- (259.67,254.67) .. controls (259.67,254.67) and (259.67,254.67) .. (259.67,254.67) -- cycle ;
%Rounded Diagonal Corner Rect [id:dp13747987290793218] 
\draw   (467.67,231.47) .. controls (467.67,228.45) and (470.11,226) .. (473.13,226) -- (523.33,226) .. controls (523.33,226) and (523.33,226) .. (523.33,226) -- (523.33,247.87) .. controls (523.33,250.89) and (520.89,253.33) .. (517.87,253.33) -- (467.67,253.33) .. controls (467.67,253.33) and (467.67,253.33) .. (467.67,253.33) -- cycle ;
%Rounded Diagonal Corner Rect [id:dp07596070403498112] 
\draw   (379.67,233.47) .. controls (379.67,230.45) and (382.11,228) .. (385.13,228) -- (435.33,228) .. controls (435.33,228) and (435.33,228) .. (435.33,228) -- (435.33,249.87) .. controls (435.33,252.89) and (432.89,255.33) .. (429.87,255.33) -- (379.67,255.33) .. controls (379.67,255.33) and (379.67,255.33) .. (379.67,255.33) -- cycle ;
%Shape: Rectangle [id:dp379136576621135] 
\draw   (12.67,41) -- (37.67,41) -- (37.67,61) -- (12.67,61) -- cycle ;
%Rounded Diagonal Corner Rect [id:dp5937279433237983] 
\draw   (7,74.68) .. controls (7,72.79) and (8.54,71.25) .. (10.43,71.25) -- (44.67,71.25) .. controls (44.67,71.25) and (44.67,71.25) .. (44.67,71.25) -- (44.67,84.98) .. controls (44.67,86.88) and (43.13,88.42) .. (41.23,88.42) -- (7,88.42) .. controls (7,88.42) and (7,88.42) .. (7,88.42) -- cycle ;

% Text Node
\draw (228,55) node [anchor=north west][inner sep=0.75pt]   [align=left] {Sunny};
% Text Node
\draw (416.67,55) node [anchor=north west][inner sep=0.75pt]   [align=left] {Rainy};
% Text Node
\draw (316,42.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {Weather};
% Text Node
\draw (211.67,130.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {Humidity};
% Text Node
\draw (432,130.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {Wind};
% Text Node
\draw (334,130.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {Yes};
% Text Node
\draw (321.33,81) node [anchor=north west][inner sep=0.75pt]   [align=left] {Cloudy};
% Text Node
\draw (177.33,173) node [anchor=north west][inner sep=0.75pt]   [align=left] {>75\%};
% Text Node
\draw (275.33,173) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\leq$ 75\%};
% Text Node
\draw (385.33,173) node [anchor=north west][inner sep=0.75pt]   [align=left] {>20};
% Text Node
\draw (483.33,173) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\leq$ 20};
% Text Node
\draw (186.67,231.83) node [anchor=north west][inner sep=0.75pt]   [align=left] {No};
% Text Node
\draw (273.33,232.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {Yes};
% Text Node
\draw (481.33,231.17) node [anchor=north west][inner sep=0.75pt]   [align=left] {Yes};
% Text Node
\draw (396.67,232.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {No};
% Text Node
\draw (38,42.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {\mbox{-} Decision Node};
% Text Node
\draw (48.67,71.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\mbox{-} Leaf Node};


\end{tikzpicture}


}

\subsubsection{ID3 Algorithm}

The following is a basic algorithm used to construct decision trees. It's known as the \textit{Iterative Dichotomiser 3}, or \textit{ID3} for short. It builds decision trees using a \textbf{top-down}, \textbf{greedy} approach.\\

\begin{myproof}
\begin{itemize}
	\item[1.] Select the \textbf{best} attribute/feature, $\gamma$, from the pool of given features. The "best" attribute is the attribute which best splits or separates the data.
	\item[2.] Assign $\gamma$ as the decision attribute for a new node, $\beta$.
	\item[3.] For each value of $\gamma$, create a new descendant of $\beta$.
	\item[4.] If the training data is now perfectly classified, stop. Otherwise, recursively start from step 1 on each of the descendants of $\beta$.
\end{itemize}
\end{myproof}

\hfill \break This approach is fairly straightforward; let's investigate further into how we can end up choosing the 'best' attribute from the pool of given attributes. For this, we need to establish the concept of \textbf{information gain}.

\subsubsection{Information Gain + Entropy}

\hfill \begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Information Gain} $\rightarrow$ A measure that expresses how well an attribute splits our data into groups based on classification.
\end{tcolorbox}

\hfill \break Information gain is a statistical property that measures how well a given attribute separates the training examples according to target classification. If the presence of a certain attribute lead to way more of one outcome than another, we consider that attribute to have a \textbf{high information gain}.\\

However, we still need a way to define this property precisely. To aid in this definition, we will define the idea of entropy.\\

\hfill \begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Entropy} $\rightarrow$ A property that measures the level of \textit{impurity} in a group of examples. 
\end{tcolorbox}

\hfill \break In the case of our \textit{ID3} algorithm, which only provides results in the form of \textbf{positive} or \textbf{negative}, we can define entropy with the following formula:\\

\begin{myproof}

$$ Entropy(S)=-p_+ log_2p_+ -p_-log_2p_- $$

Where:

\begin{itemize}
	\item $S$ is a sample from our training data.
	\item $p_+$ is the proportion of positive examples in $S$.
	\item $p_-$ is the proportion of negative examples in $S$.
\end{itemize}

\end{myproof}

\hfill \break This formula essentially tells us how 'split' the data is on a scale from 0 to 1.0- for example:\\

\begin{myproof}
Example: $S$ is a sample containing 14 boolean examples, with 9 positive and 5 negative examples. The Entropy of $S$ relative to this classification is:\\

$$ Entropy ([9+, 5-]) = -\frac{9}{14}\cdot log_2\frac{9}{14} - \frac{5}{14}\cdot log_2 \frac{5}{14} = 0.940$$\\

Note that entropy is $0$, or optimal, if all members of $S$ belong to the same class, and entropy is $1.0$ if exactly half the members belong to one class, and the other half belong to the other.
\end{myproof}

\hfill \break At this point, our intuition tells us that we need to \textbf{pick the attribute that reduces the entropy the most.}\\

Using this definition of entropy, we can now properly define information gain as a measure of the effectiveness of an attribute in classifying the training data.\\

Information Gain, $Gain(S,\gamma)$, of an attribute $\gamma$ relative to a sample of examples $S$ is defined as the following: \\
\begin{myproof}
$$Gain(S,\gamma)=\underbrace{Entropy(S)}_{\text{Entropy of original sample S}}-\underbrace{\sum_{v\in Values(\gamma)}{\frac{|S_v|}{|S|}Entropy(S_v)}}_{\text{Expected value of entropy after S is partitioned using } \gamma}$$
\end{myproof}

\hfill \break Where:
\begin{itemize}
	\item $Values(\gamma)$ is the set of all possible values for attribute $\gamma$
	\item $S_v$ is the subset of $S$ for which attribute $\gamma$ has value $v$
\end{itemize}

Overall, we can see that $Gain(S,\gamma)$ is therefore the expected reduction in entropy caused by knowing the value of attribute $\gamma$. We're basically trying to say this:

$$ \text{Information Gain} = Entropy(\text{Parent Node}) - AvgEntropy(\text{Children Nodes})$$

\subsubsection{Example: Information Gain + Entropy}

Suppose a sample $S$ has 30 instances, 14 positive and 16 negative.\\

Also suppose that an attribute $\gamma$ divides the samples into two subsamples of 17 instances (4 negative and 13 positive) and 13 instances (1 positive and 12 negative).\\\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp4519386664719742] 
\draw   (81.33,12) -- (403,12) -- (403,112) -- (81.33,112) -- cycle ;
%Straight Lines [id:da005391573051303222] 
\draw [color={rgb, 255:red, 245; green, 166; blue, 35 }  ,draw opacity=1 ]   (241,112) -- (142.07,168.34) ;
\draw [shift={(140.33,169.33)}, rotate = 330.34000000000003] [color={rgb, 255:red, 245; green, 166; blue, 35 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da46164442306473463] 
\draw [color={rgb, 255:red, 245; green, 166; blue, 35 }  ,draw opacity=1 ]   (241,112) -- (331.3,168.28) ;
\draw [shift={(333,169.33)}, rotate = 211.93] [color={rgb, 255:red, 245; green, 166; blue, 35 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Rectangle [id:dp3853088098543145] 
\draw   (73.33,169.67) -- (209,169.67) -- (209,236) -- (73.33,236) -- cycle ;
%Shape: Rectangle [id:dp27389516072713194] 
\draw   (269.33,170.33) -- (405,170.33) -- (405,236.67) -- (269.33,236.67) -- cycle ;

% Text Node
\draw (494.67,38) node [anchor=north west][inner sep=0.75pt]   [align=left] {30 instances\\};
% Text Node
\draw (94.67,21.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (114.67,41.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (194,22.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (146.67,37.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (205.33,58.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (147.33,70.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (304.67,36) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (302,63.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (344.67,76) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (366.67,30) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (251.33,65.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (248,27.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (274.67,54.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (271.33,85.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (230.67,82.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (226.67,32) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (168,48) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (116.67,72.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (180,72) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (275.33,24.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (336,39.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (318,82) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (145.33,14.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (159.33,90.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (372,54.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (314.67,15.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (372.67,76.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (234.67,48.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (193.33,40.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (96,58.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (454.67,60) node [anchor=north west][inner sep=0.75pt]   [align=left] {(\textcolor[rgb]{0.49,0.83,0.13}{14 positive}, \textcolor[rgb]{0.82,0.01,0.11}{16 negative})};
% Text Node
\draw (82,179.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (102,199.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (122,219.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (157.33,182.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (130.67,180) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (176,204.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (144.67,193.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (108.67,174.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (84,213.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (127.33,202) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (152,214) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (102,216.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (184,177.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (337.33,190) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.49,0.83,0.13}{+}};
% Text Node
\draw (168.67,189.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (188.67,209.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (116,190.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (137.33,214) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (277.33,175.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (297.33,195.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (317.33,215.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (309.33,179.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (329.33,199.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (370.67,178) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (390.67,198) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (359.33,204) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (281.33,211.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (335,172.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (341.33,215.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (275.33,191.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{1,0,0}{-}};
% Text Node
\draw (91.33,244) node [anchor=north west][inner sep=0.75pt]   [align=left] {17 instances\\};
% Text Node
\draw (51.33,266) node [anchor=north west][inner sep=0.75pt]   [align=left] {(\textcolor[rgb]{0.49,0.83,0.13}{13 positive}, \textcolor[rgb]{0.82,0.01,0.11}{4 negative})};
% Text Node
\draw (296.67,243.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {13 instances\\};
% Text Node
\draw (256.67,265.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {(\textcolor[rgb]{0.49,0.83,0.13}{1 positive}, \textcolor[rgb]{0.82,0.01,0.11}{12 negative})};
% Text Node
\draw (362.67,130.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.96,0.65,0.14}{Divide on $\displaystyle \gamma $}};


\end{tikzpicture}


}

\hfill \break Given these conditions, we can calculate the \textbf{information gain} of attribute $\gamma$.\\

We know that the following formula calculates information gain of attribute $\gamma$ from a sample S:\\

$$Gain(S,\gamma)=\underbrace{Entropy(S)}_{\text{Entropy of original sample S}}-\underbrace{\sum_{v\in Values(\gamma)}{\frac{|S_v|}{|S|}Entropy(S_v)}}_{\text{Expected value of entropy after S is partitioned using } \gamma}$$\\

We also know that the entropy of sample $S$ can be calculated using:

$$Entropy(S) = -p_+log_2p_+ - p_-log_2p_-$$\\

Using this formula, we can deduce the following:
\begin{itemize}
	\item Entropy of parent (30 instances): $\frac{14}{30}\cdot log_2 \frac{14}{30}-\frac{16}{30}\cdot log_2 \frac{16}{30}=0.996$
	\item Entropy of child (17 instances): $\frac{13}{17}\cdot log_2 \frac{13}{17} - \frac{4}{17} \cdot log_2 \frac{4}{17}=0.787$
	\item Entropy of child (13 instances): $\frac{1}{13}\cdot log_2 \frac{1}{13}-\frac{12}{13}\cdot log_2 \frac{12}{13}=0.391$
\end{itemize}

Now, we take the weighted average of the entropy of the children (expected value of entropy after $S$ is partitioned using $\gamma$).

$$\frac{17}{30}\cdot 0.787+\frac{13}{30}\cdot 0.391=0.615$$

\hfill \break Using all this information, we can calculate the information gain for attribute $\gamma$:

$$G(S,\gamma)=0.996-0.615=0.38$$

Keeping this in mind, we are free to calculate information gain for \textbf{each attribute} and, based on this number, decide which attribute that we should split the data on for our ID3 algorithm.\\

\subsection{Noisy Data}

Lots of 'noise' could potentially occur in your training data- \textbf{training data is not necessarily perfect}. Examples:

\begin{itemize}
	\item Two examples have the same attribute/value pairs, but different classifications
	\item Some values of attributes are incorrect because of errors in the data acquisition or preprocessing phase.
	\item The classification itself is wrong due to some error
\end{itemize}

Noisy data is just one of the reasons you don't want to \textbf{overfit} your data.

\subsection{Overfitting}

\hfill \begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Overfitting} $\rightarrow$ Fitting the training set "too well" such that performance on the test set degrades.
\end{tcolorbox}

At the end of the day, the training data is just that- training data. You don't want your algorithm to learn so much from your training data such that it adheres to it so perfectly that it fails to properly classify new training data that it hasn't yet seen. In other words, you want your algorithm to learn, but not totally and entirely meticulously.

\subsubsection{Example: Overfitting}

\begin{myproof}
You are trying to predict the roll of a die. The experiment data includes:
\begin{itemize}
	\item Day of the week
	\item Month of the week
	\item Color of the die
\end{itemize}
\end{myproof}

Decision Tree Learning may find a hypothesis that fits the data, but with irrelevant attributes.\\

Some attributes are irrelevant to the decision making process, such as the color of the die, but the algorithm may use them to differentiate examples all the same.\\

If the hypothesis space has \textbf{many dimensions} because of a \textbf{large number of attributes}, we may find a \textbf{meaningless regularity} in the data that is irrelevant to the true, important distinguishing features of the dataset.\\

Overfitting is a key problem in learning there are formal results on the number of examples needed to properly train a hypothesis of a certain complexity- the more parameters, the more data is needed.

\subsubsection{Defining Overfitting}

Consider $D$, the entire distribution of data, and $T$, the training set. We can say that hypothesis $h \in H$ overfits $D$ if $\exists h' \neq h \in H$ such that $error_T(h) < error_T(h')$ but $error_D(h) > error_D(h')$.\\

That's basically saying that hypothesis $h$ overfits the data if there exists another hypothesis that's less accurate for the training data but more accurate on the validation data.\\

\begin{tcolorbox}[title=Aside: Fitting Training Data vs. Validation Data,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]

It's hard to think of a better to the training data as a 'worse' result. It's often difficult to fit training data well, so it seems that a better fit to the training data means a good result. This is an easy way to fall victim to overfitting.

\end{tcolorbox}

\hfill \break Here's an example of overfitting as decision tree size increases.\\

\includegraphics[scale=0.5]{img/overfitting.png} 

\subsection{Evaluation Methodology}

How do we evaluate the quality of the hypotheses produced by learning algorithms? We generally do this by seeing how good they are at classifying unseen examples.

\subsubsection{Standard Methodology: Holdout Cross-Validation}

\begin{itemize}
	\item[1.] Collect a large set of examples.
	\item[2.] Randomly divide the collection into two disjoint sets, training set and validation set
	\item[3.] Apply a learning algorithm to the training set and generate hypothesis $h$
	\item[4.] Measure performance of $h$ with regards to the test set
\end{itemize}

It's important to keep the testing sets and training sets disjoint as well- that is, make sure there is no \textbf{peeking}.\\

\begin{myproof}
\textbf{Peeking Example}:\\
Suppose generate four different hypotheses- for example by using different criteria to pick the next attribute to branch on.
We test the performance of the four different hypotheses on the test set and then select the best hypothesis.\\

This is considered \textbf{peeking}, as the hypothesis was selected \textbf{based on} its performance on the \textbf{test set}, so information about the test set leaked into the learning algorithm.\\

In this case, a new, separate test set would be required- one that our algorithm has not yet seen.
\end{myproof}

\subsection{Learning Curve Graph}

The learning curve graph is the average prediction quality as a function of the size of the training set. Usually, as the training set increases, so does the quality of the prediction.

\subsection{Precision vs. Recall}

\hfill \begin{tcolorbox}[title=Definitions:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Precision} $\rightarrow$ $\frac{\text{number of true positives}}{\text{number of true positives}+\text{number of false positives}}$\\\\
\textbf{Recall} $\rightarrow$ $\frac{\text{number of true positives}}{\text{number of true positives}+\text{number of false negatives}}$
\end{tcolorbox}

\hfill \break A precise classifier is selective, while a classifier with high recall is inclusive. There are situations where we'd err to either one side due to caution, precision or recall. It's a case-by-case thing.

\subsection{Decision Trees in Scikit}

As expected, it's very easy to load up a decision tree in \texttt{sklearn}. Here's a python snippet to train a decision tree using default parameters.\\

{\centering
\begin{lstlisting}[language=python]
from sklearn.datasets import load_iris
from sklearn import tree

# Load a common dataset, fit a decision tree to it
iris = load_iris()
clf = tree.DEcisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

# Predict the most likely class
clf.predit([[2., 2.]])
\end{lstlisting}
}

You can also use \texttt{graphviz} and \texttt{pydotplus} to visualize a decision tree using Python.

\subsection{Random Forests}

Decision trees are very interpret-able, but may be brittle to changes in the training data, as well as noise.\\

\textbf{Random forests} are an ensemble method that:

\begin{itemize}
	\item Resamples the training data
	\item Builds many decision trees
	\item Averages predictions of trees in order to classify
\end{itemize}

This is done via bagging and random feature selection.

\subsubsection{Bagging}

\textbf{Bagging} is short for \textbf{B}ootstrap \textbf{Ag}gregation.\\

Essentially what you're doing when you're bagging is resampling a training set of size $n$ via the bootstrap, a sample of $n$ elements taken \textit{with replacement} from the original dataset. For example:\\

\begin{myproof}
\textbf{Original dataset:} \texttt{1,2,3,4,5,6,7,8,9,10}\\\\

\textbf{Resampled training set 1:} \texttt{2,3,3,5,6,1,8,10,9,1}\\
\textbf{Resampled training set 2:} \texttt{1,1,5,6,3,8,9,10,2,7}\\
\textbf{Resampled training set 3:} \texttt{1,5,8,9,2,10,9,7,5,4}
\end{myproof}

\hfill \break Below is a visual example of bagging:\\

{
\centering
\includegraphics[scale=0.5]{img/bagging.png} 
}

\hfill \break Since bagging resamples the original training data \textit{with replacement}, some instances (or even entire resampled datasets) may be present multiple times, while others are left out.\\

Once we perform analysis on all the separate datasets, we can have those analyses 'vote' and decide what the final classification should be.\\

Random forest classification is available in \texttt{sklearn}, simply import \texttt{RandomForestClassifier} from \texttt{sklearn.ensemble}.

\section{K-Nearest Neighbors \& Support Vector Machines (L21)}

\subsection{Overview}

\textbf{Basic idea:} If it walks like a duck, quacks like a duck, then it's probably a duck.\\

It's a classifier that decides the value of a data point by analyzing that data point's $k$ nearest neighbors.\\

The \textbf{K-NN Classifier} requires three things:
\begin{itemize}
	\item The set of stored records
	\item A distance metric, to compute distance between records
	\item The value of $k$, the number of nearest neighbors to retrieve
\end{itemize}

In order to classify an unknown record given the above three things:

\begin{itemize}
	\item You compute the distance from your unknown record to other training records
	\item You identify the list of $k$ nearest neighbors
	\item Use the class labels of those $k$ nearest neighbors to determine the class label of this record (e.g. by taking a majority vote)
\end{itemize}

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp728048521664103] 
\draw   (38.67,21.33) -- (350.33,21.33) -- (350.33,256.67) -- (38.67,256.67) -- cycle ;
%Shape: Circle [id:dp4770571693065703] 
\draw  [color={rgb, 255:red, 80; green, 227; blue, 194 }  ,draw opacity=1 ][dash pattern={on 0.84pt off 2.51pt}] (219.33,65.5) .. controls (219.33,47.18) and (234.18,32.33) .. (252.5,32.33) .. controls (270.82,32.33) and (285.67,47.18) .. (285.67,65.5) .. controls (285.67,83.82) and (270.82,98.67) .. (252.5,98.67) .. controls (234.18,98.67) and (219.33,83.82) .. (219.33,65.5) -- cycle ;
%Straight Lines [id:da28623831301148495] 
\draw    (509.33,91.67) -- (267.66,66.21) ;
\draw [shift={(265.67,66)}, rotate = 366.01] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (87.33,89) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (107.33,109) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (156.67,113) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (209.33,121) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (136,97) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (203.33,44.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (285.33,79.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (279.33,104.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (216,148.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (112,197.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (154.67,218.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (216.67,219) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (115.33,42.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (136.67,62.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (170,41) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (176,83.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (56,228.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (291.33,166.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (68.67,145.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (293.33,210.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (329.33,86.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (320,35) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (40,17) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (60,37) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{-}};
% Text Node
\draw (124.67,128) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{+}};
% Text Node
\draw (182.67,187.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{+}};
% Text Node
\draw (123.33,159.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{+}};
% Text Node
\draw (168,134.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{+}};
% Text Node
\draw (158,171.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{+}};
% Text Node
\draw (226,71.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{+}};
% Text Node
\draw (236.67,37.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{+}};
% Text Node
\draw (269.33,42.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{+}};
% Text Node
\draw (248,56) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.31,0.89,0.76}{x}};
% Text Node
\draw (513.33,59.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {The K-NN classifier\\will mark this\\unknown point \textcolor[rgb]{0.31,0.89,0.76}{x}\\as a \textcolor[rgb]{0.29,0.56,0.89}{+}, due to its\\neighbors};


\end{tikzpicture}


}


\hfill \break You can have 1-nearest neighbor, 2-nearest neighbors, 3-nearest neighbor, etc. The bottom line is that K-nearest neighbors of a record $x$ are data points that have the $k$ smallest distances to $x$.\\

\begin{tcolorbox}[title=Aside: Voronoi Diagram,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]

In mathematics, a Voronoi diagram is a partition of a plane into regions close to each of a given set of objects. In the simplest case, these objects are just finitely many points in the plane. For each seed there is a corresponding region consisting of all points of the plane closer to that seed than to any other.\\

This type of diagram helps us visualize the 1-nearest neighbor algorithm, like below:\\

\includegraphics[scale=0.375]{img/voronoi.png} 

\end{tcolorbox}

\subsection{Using Nearest Neighbors}

We can compute the distance between our unknown point $x$ and all the other points to develop a nearest neighbor list. We can then take the majority vote of class labels among the k-nearest neighbors. We can also weight the vote according to distance.

\subsection{Choosing a Value for $k$}
\begin{itemize}
	\item If $k$ is too small, it's sensitive to noise points (i.e. one or two 'mistake' points may throw it off)
	\item If $k$ is too large, the neighborhood may include points from other classes; it may end up being useless.
\end{itemize}

\subsection{Scaling}

Additionally, attributes have to scaled to prevent distance measured from being dominated by one of the attributes. For example. if we are comparing people:\\

\begin{myproof}
\begin{itemize}
	\item height of a person may vary from 1.5m to 1.8m
	\item weight of a person may vary from 90lb to 300lb
	\item income of a person may vary from \$10k to \$1M
\end{itemize}
\end{myproof}

\hfill \break We can see that some of these variables vary greatly, while others vary minimally. As such, we have to \textbf{standardize} the variables in order to prevent this problem.

\subsection{Curse of Dimensionality}

There is another problem with using a euclidean measure, if we have data of a high dimension, we know from the \textit{Curse of Dimensionality} that data becomes sparse relative to the total volume of the space, so the distance metrics that we're using effectively lose meaning. This can produce counter-intuitive results- for example:\\

{
\centering
\includegraphics[scale=0.55]{img/cursedim.png} 
}

These two data points have the same \textit{distance}, which is the only measure that the K-NN classifier uses, but we can see with the human eye that they are extremely different comparisons.\\

Think of each 1 or 0 as a value for a variable, these are just multivariable vectors.\\

We can combat this by \textbf{normalizing vectors to unit length}.

\subsubsection{K-NN is Lazy}

K-NN classifiers are known as lazy learners. It doesn't build a model explicitly, such as eager learners like the decision-tree algorithm.

\begin{tcolorbox}[title=Aside: Lazy vs. Eager Learners,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]

Learning algorithms can either be called lazy or eager learners. Here are the differences.\\

\textbf{Lazy Learners:}
\begin{itemize}
	\item Store the dataset without initially \textbf{learning} from it.
	\item Start classifying data when it receives test/validation data.
	\item Takes less time to 'learn' and more time to 'classify'.
\end{itemize}

\hfill \break \textbf{Eager Learners:}
\begin{itemize}
	\item When it receives data, it immediately starts classifying them.
	\item Does not wait for test data to proceed with learning.
	\item Takes a long time to learn, but less time to classify data.
\end{itemize}

\hfill \break Examples of both in supervised learning:
\begin{itemize}
	\item \textbf{Lazy:}
	\begin{itemize}
		\item K-NN
		\item Case-based reasoning
	\end{itemize}
	\item \textbf{Eager:}
	\begin{itemize}
		\item Decision Tree
		\item Naive Bayes
		\item Artifical Neural Networks
	\end{itemize}
\end{itemize}

\end{tcolorbox}

As such, \textbf{classifying unknown records is relatively expensive}.\\

The Naive algorithm is done in linear time, \texttt{O(n)}. We are in need of data structure that help us retrieve nearest neighbors quickly. Spatial data structures that can help us with this will be covered more in CMSC420.

\subsection{Nearest Neighbor Search}

The nearest neighbor search problem is an \textbf{optimization problem} of finding the point in a given set that is closest or most similar to a given point.\\

Formally, the problem is defined as follows:\\

\begin{myproof}
Given a set $S$ of points in a space $M$ and a query point $q \in M$, find the closest point in $S$ to $q$.
\end{myproof}

\textbf{Knuth} called it the 'post-office' problem, referring to an application of assigning to a residence the nearest post-office.\\

Most commonly, $M$ is a metric space, and dissimilarity is expressed as a distance metric (e.g. Euclidean Distance or Manhattan Distance).

\subsubsection{Exact Methods}

\hfill

\begin{tcolorbox}[title=Aside: Branch and Bound Paradigm,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]

Branch and bound is an algorithm design paradigm which is generally used for solving combinatorial \textbf{optimization problems}. These problems are typically exponential in terms of time complexity and may require exploring all possible permutations in worst case. The Branch and Bound Algorithm technique solves these problems relatively quickly.\\

More is covered in Algorithms.

\end{tcolorbox}

\hfill

\begin{tcolorbox}[title=Aside: R-tree,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries,breakable]

R-trees are tree data structures used for spatial access methods, i.e., for indexing multi-dimensional information such as geographical coordinates, rectangles or polygons. In other words, the best way to think about an R-tree is like for a \textbf{2-d map space}.\\

A common real-world usage for an R-tree might be to store spatial objects such as restaurant locations or the polygons that typical maps are made of: streets, buildings, outlines of lakes, coastlines, etc. and then find answers quickly to queries such as "Find all museums within 2 km of my current location", "retrieve all road segments within 2 km of my location" (to display them in a navigation system) or "find the nearest gas station" (although not taking roads into account).\\
 
R-trees are similar to B+ Trees in that {\textbf{no data is actually stored in the internal nodes, only in the leaf nodes}}.\\

\includegraphics[scale=0.4]{img/rtree.png} 

\hfill \break An R-tree consists of:
\begin{itemize}
	\item Root node - holds a pointer reference to largest region - \textbf{(grey  wide dashed line)}
	\item Internal nodes - hierarchically store pointers to child nodes, which either store more internal nodes, or leaf nodes. In any case, this will be \textbf{data bounded by this internal node} - \textbf{(smaller dashed lines)}
	\item Leaf nodes - store actual data within respective Minimum Bounding Regions (sub-regions within the entire space that group data in as few regions as possible) - \textbf{(solid lines)}
\end{itemize}

The R-tree can also be generalized to higher dimensions than just the two we see here, (which is where we can make use for it with k-NN classifiers).\\

\includegraphics[scale=0.4]{img/rstartree.png}

\hfill \break Above is a 3 dimensional generalization of the an R-tree. 

\end{tcolorbox}


\begin{itemize}
	\item \textbf{Linear Search}
	\begin{itemize}
		\item The simplest solution to NNS is to compute the distance from the query point to every other point in the database, while keeping track of the current minimum. (Naive approach)
	\end{itemize}
	\item \textbf{Space Partitioning}
	\begin{itemize}
		\item The 'branch and bound' methodology has been applied to this problem since the 1970s. The simplest of these approaches is the \textbf{kd-tree}, which bisects the search spaces into two regions containing half the points of the parent region, over and over again.
		\begin{itemize}
			\item E.g. a \textbf{2d kd-tree} is a data structure for answering nearest neighbor queries in $R^2$.
			\begin{myproof}
				\begin{itemize}
					\item Select the $x$ or $y$ dimension (alternating between the two
					\item Partition the space into two with a line passing from the median point
					\item Repeat recursively in the two partitions as long as there are enough points to do so
					\item Now that you've built the tree, you can \textbf{quickly query it} for nearest neighbors by finding an incumbent best, and pruning large chunks of the tree away as you go
				\end{itemize}
				
				\hfill \break It's worth nothing that this algorithm can be generalized to higher dimensions, for the sake of our advanced problems.
			\end{myproof} 
		\end{itemize}
		\item The \textbf{r-tree} was also developed to support nearest neighbor search in a dynamic context.
	\end{itemize}
\end{itemize}

\subsubsection{Approximation Methods}

An approximate nearest neighbor search algorithm is allowed to return points, whose distance from the query is at most $c$ times the distance from the query to its nearest points. The appeal of this approach is that, in many cases, an approximate nearest neighbor is almost as good as the exact one.\\

\textbf{Proximity Graph Methods, such as HNSW, are considered the current state-of-the-art for the approximate nearest neighbors search}.\\

\begin{tcolorbox}[title=Aside: HNSW,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries,breakable]

The industry standard for the approximate K-nearest neighbor search is known as the method leveraging \textbf{Hierarchical Navigable Small World} graphs.\\

The solution is graph-based, and doesn't depend on any other additional search structures.\\

Hierarchical NSW incrementally builds a multi-layer structure consisting from hierarchical set of proximity graphs (layers) for nested subsets of the stored elements. The maximum layer in which an element is present is selected randomly with an exponentially decaying probability distribution. This allows producing graphs similar to the previously studied Navigable Small World (NSW) structures while additionally having the links separated by their characteristic distance scales.\\

Starting search from the upper layer together with utilizing the scale separation boosts the performance compared to NSW and allows a logarithmic complexity scaling. Additional employment of a heuristic for selecting proximity graph neighbors significantly increases performance at high recall and in case of highly clustered data.\\
\end{tcolorbox}

Other methods include:

\begin{itemize}
	\item Locality sensitive hashing
	\item NNS in spaces with small intrinsic dimension using cover trees
	\item Projected radial search
	\item Vector approximation files
	\item Compression/clustering based search
\end{itemize}

\subsection{Advantages of K-NN}

\begin{itemize}
	\item \textbf{Simple} technique that is easily implemented
	\item Building a model is \textbf{cheap}
	\item Extremely \textbf{flexible} classification scheme
\end{itemize}

K-NN is well-suited for multimodal classes and records with multiple class labels.\\

Additionally, it's been shown that it sometimes is the best method.

\subsection{Disadvantages of K-NN}

\textbf{Classifying unknown records are relatively expensive}, as you have to perform the distance computation of k-nearest neighbors. This is computationally intensive, especially as the size of the training set grows.\\

The accuracy of the classification can also be severely degraded by the presence of:
\begin{itemize}
	\item Noisy or irrelevant features
	\item High-dimensional space (curse of dimensionality)
	\item Choosing the wrong distance metric
	\item Picking a bad value for $k$
\end{itemize}

\subsection{K-NN Classification in Scikit-Learn}

As expected, \texttt{sklearn} supports K-NN classification.

{\centering
\begin{lstlisting}[language=python]
from sklearn import neighbors, datasets

# Load a common dataset, fit a 15-NN classifier to it
iris = datasets.load_iris()
X = iris.data[:, :2] # take the first two features
y = iris.target

clf = neighbors.KNeighborsClassifier(15, weights='uniform')
clf.fit(X,y)
\end{lstlisting}
}


\subsection{K-NN Regression}

Here's the idea- to predict a target value y for a data point $x$, we simply apply interpolation and regression to the neighborhood of $x$.\\

The most naive approach to this would be to simply connect the dots in your graph, yielding the following regression:\\

{
\centering

\includegraphics[scale=0.2]{img/connect.png}

}

\hfill \break We can also, however, apply K-NN methodology to this problem. For example, we can fit a line using linear regression (when $k$ is some number, or we could predict the average target value of the $k$ points.\\

Here is what those two graphs look like.\\

{
\centering

\includegraphics[scale=0.3]{img/knnregression.png}

}

\subsection{Support Vector Machines}

The objective of a Support Vector Machine algorithm is to find a hyperplane in an $n$-dimensional space (where $n$ is the number of features) that directly classifies the data points.\\

Below are examples of such planes in a simple 2D space.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Straight Lines [id:da18386530311215066] 
\draw    (171,217.33) -- (171,3.75) ;
\draw [shift={(171,1.75)}, rotate = 450] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da8609346891296188] 
\draw    (171,217.33) -- (430,217.33) ;
\draw [shift={(432,217.33)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Square [id:dp6848519964006714] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (187,179.5) -- (197,179.5) -- (197,189.5) -- (187,189.5) -- cycle ;
%Shape: Square [id:dp3340403515381454] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (207,199.5) -- (217,199.5) -- (217,209.5) -- (207,209.5) -- cycle ;
%Shape: Square [id:dp41498390501770255] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (207.5,184) -- (217.5,184) -- (217.5,194) -- (207.5,194) -- cycle ;
%Shape: Square [id:dp44403283392803305] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (205,137) -- (215,137) -- (215,147) -- (205,147) -- cycle ;
%Shape: Square [id:dp9556758602028731] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (271,171.5) -- (281,171.5) -- (281,181.5) -- (271,181.5) -- cycle ;
%Shape: Square [id:dp7219029696699881] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (236.5,202) -- (246.5,202) -- (246.5,212) -- (236.5,212) -- cycle ;
%Shape: Circle [id:dp6949900053679295] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (282,56.38) .. controls (282,52.72) and (284.97,49.75) .. (288.63,49.75) .. controls (292.28,49.75) and (295.25,52.72) .. (295.25,56.38) .. controls (295.25,60.03) and (292.28,63) .. (288.63,63) .. controls (284.97,63) and (282,60.03) .. (282,56.38) -- cycle ;
%Shape: Circle [id:dp28759288083550705] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (333.5,81.38) .. controls (333.5,77.72) and (336.47,74.75) .. (340.13,74.75) .. controls (343.78,74.75) and (346.75,77.72) .. (346.75,81.38) .. controls (346.75,85.03) and (343.78,88) .. (340.13,88) .. controls (336.47,88) and (333.5,85.03) .. (333.5,81.38) -- cycle ;
%Shape: Circle [id:dp41281492253679297] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (310,41.38) .. controls (310,37.72) and (312.97,34.75) .. (316.63,34.75) .. controls (320.28,34.75) and (323.25,37.72) .. (323.25,41.38) .. controls (323.25,45.03) and (320.28,48) .. (316.63,48) .. controls (312.97,48) and (310,45.03) .. (310,41.38) -- cycle ;
%Shape: Circle [id:dp43905037715166473] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (338.5,42.38) .. controls (338.5,38.72) and (341.47,35.75) .. (345.13,35.75) .. controls (348.78,35.75) and (351.75,38.72) .. (351.75,42.38) .. controls (351.75,46.03) and (348.78,49) .. (345.13,49) .. controls (341.47,49) and (338.5,46.03) .. (338.5,42.38) -- cycle ;
%Shape: Circle [id:dp7915990666164764] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (366,96.88) .. controls (366,93.22) and (368.97,90.25) .. (372.63,90.25) .. controls (376.28,90.25) and (379.25,93.22) .. (379.25,96.88) .. controls (379.25,100.53) and (376.28,103.5) .. (372.63,103.5) .. controls (368.97,103.5) and (366,100.53) .. (366,96.88) -- cycle ;
%Straight Lines [id:da24793340972831157] 
\draw [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ]   (156.5,73.75) -- (357.5,229.25) ;
%Straight Lines [id:da0504208940868186] 
\draw [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ]   (191.5,18.25) -- (427.5,132.75) ;
%Straight Lines [id:da7528490058779704] 
\draw [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ]   (250.5,15.75) -- (324.5,235.75) ;
%Straight Lines [id:da9573027238278069] 
\draw [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ]   (158,102.75) -- (424.5,124.75) ;

% Text Node
\draw (183,249.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.25,0.46,0.02}{These are all valid 'hyperplanes' that}\\\textcolor[rgb]{0.25,0.46,0.02}{appropriately separate the data}};


\end{tikzpicture}


}

\hfill \break Not only this, but the hyperplane is supposed to be \textbf{optimal}. This means that we maximize the margin between the hyperplane and the given data.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Straight Lines [id:da8848515352287722] 
\draw    (191,220.83) -- (191,7.25) ;
\draw [shift={(191,5.25)}, rotate = 450] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da9427799777360648] 
\draw    (191,220.83) -- (450,220.83) ;
\draw [shift={(452,220.83)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Square [id:dp7738712618449884] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (207,183) -- (217,183) -- (217,193) -- (207,193) -- cycle ;
%Shape: Square [id:dp6367968791355435] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (227,203) -- (237,203) -- (237,213) -- (227,213) -- cycle ;
%Shape: Square [id:dp7049923323858955] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (227.5,187.5) -- (237.5,187.5) -- (237.5,197.5) -- (227.5,197.5) -- cycle ;
%Shape: Square [id:dp525026405321415] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (225,140.5) -- (235,140.5) -- (235,150.5) -- (225,150.5) -- cycle ;
%Shape: Square [id:dp5091410331124013] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (291,175) -- (301,175) -- (301,185) -- (291,185) -- cycle ;
%Shape: Square [id:dp3104066682039195] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (256.5,205.5) -- (266.5,205.5) -- (266.5,215.5) -- (256.5,215.5) -- cycle ;
%Shape: Circle [id:dp6193179365473596] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (302,59.88) .. controls (302,56.22) and (304.97,53.25) .. (308.63,53.25) .. controls (312.28,53.25) and (315.25,56.22) .. (315.25,59.88) .. controls (315.25,63.53) and (312.28,66.5) .. (308.63,66.5) .. controls (304.97,66.5) and (302,63.53) .. (302,59.88) -- cycle ;
%Shape: Circle [id:dp19335520051482402] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (353.5,84.88) .. controls (353.5,81.22) and (356.47,78.25) .. (360.13,78.25) .. controls (363.78,78.25) and (366.75,81.22) .. (366.75,84.88) .. controls (366.75,88.53) and (363.78,91.5) .. (360.13,91.5) .. controls (356.47,91.5) and (353.5,88.53) .. (353.5,84.88) -- cycle ;
%Shape: Circle [id:dp22051017880086232] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (330,44.88) .. controls (330,41.22) and (332.97,38.25) .. (336.63,38.25) .. controls (340.28,38.25) and (343.25,41.22) .. (343.25,44.88) .. controls (343.25,48.53) and (340.28,51.5) .. (336.63,51.5) .. controls (332.97,51.5) and (330,48.53) .. (330,44.88) -- cycle ;
%Shape: Circle [id:dp9874335837449665] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (358.5,45.88) .. controls (358.5,42.22) and (361.47,39.25) .. (365.13,39.25) .. controls (368.78,39.25) and (371.75,42.22) .. (371.75,45.88) .. controls (371.75,49.53) and (368.78,52.5) .. (365.13,52.5) .. controls (361.47,52.5) and (358.5,49.53) .. (358.5,45.88) -- cycle ;
%Shape: Circle [id:dp20116025262445836] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (386,100.38) .. controls (386,96.72) and (388.97,93.75) .. (392.63,93.75) .. controls (396.28,93.75) and (399.25,96.72) .. (399.25,100.38) .. controls (399.25,104.03) and (396.28,107) .. (392.63,107) .. controls (388.97,107) and (386,104.03) .. (386,100.38) -- cycle ;
%Straight Lines [id:da7695830217123698] 
\draw [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ]   (175,33.75) -- (448.5,216.75) ;
%Straight Lines [id:da09456630945912003] 
\draw [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ] [dash pattern={on 0.84pt off 2.51pt}]  (143,67.25) -- (395,234.25) ;
%Straight Lines [id:da5936354951846885] 
\draw [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ] [dash pattern={on 0.84pt off 2.51pt}]  (207.5,2.75) -- (447.5,167.75) ;
%Straight Lines [id:da6389711771295175] 
\draw [color={rgb, 255:red, 144; green, 19; blue, 254 }  ,draw opacity=1 ]   (363,158.75) -- (338.21,191.16) ;
\draw [shift={(337,192.75)}, rotate = 307.40999999999997] [color={rgb, 255:red, 144; green, 19; blue, 254 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da5763440187465451] 
\draw [color={rgb, 255:red, 144; green, 19; blue, 254 }  ,draw opacity=1 ]   (363,158.75) -- (384.3,130.35) ;
\draw [shift={(385.5,128.75)}, rotate = 486.87] [color={rgb, 255:red, 144; green, 19; blue, 254 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (213.5,249.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0,0,0}{The }\textcolor[rgb]{0.25,0.46,0.02}{optimal hyperplane }\textcolor[rgb]{0,0,0}{ maximizes }\\\textcolor[rgb]{0,0,0}{the }\textcolor[rgb]{0.56,0.07,1}{margin }\textcolor[rgb]{0,0,0}{        
 from the hyperplane.}};


\end{tikzpicture}


}


\hfill \begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Hyperplane} $\rightarrow$ Decision boundaries that exist in $n$-dimensional space which help us classify data points. Data points falling on either side of a hyperplane can be attributed to different classes.
\end{tcolorbox}

\hfill \break It's important to note that although the given examples lie in $\mathbb{R}^2$, this learning method generalizes hyperplanes to $\mathbb{R}^N$.\\

\textbf{If the problem is not linearly separable} but should still be described by a linear hyperplane, we introduce \textbf{slack variables}.\\

We can introduce some sort of penalty for hyperplanes that deduct accuracy points for how far wrongly classified points lie from the hyperplane. Example:\\\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Straight Lines [id:da01796656186721346] 
\draw    (211,240.83) -- (211,27.25) ;
\draw [shift={(211,25.25)}, rotate = 450] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da2782396605078653] 
\draw    (211,240.83) -- (470,240.83) ;
\draw [shift={(472,240.83)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Square [id:dp3315314160826902] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (227,203) -- (237,203) -- (237,213) -- (227,213) -- cycle ;
%Shape: Square [id:dp0016860256588290268] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (247,223) -- (257,223) -- (257,233) -- (247,233) -- cycle ;
%Shape: Square [id:dp5936173874020919] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (247.5,207.5) -- (257.5,207.5) -- (257.5,217.5) -- (247.5,217.5) -- cycle ;
%Shape: Square [id:dp9948970524491898] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (245,160.5) -- (255,160.5) -- (255,170.5) -- (245,170.5) -- cycle ;
%Shape: Square [id:dp22512005205837982] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (276.5,225.5) -- (286.5,225.5) -- (286.5,235.5) -- (276.5,235.5) -- cycle ;
%Shape: Circle [id:dp1467636690035563] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (322,79.88) .. controls (322,76.22) and (324.97,73.25) .. (328.63,73.25) .. controls (332.28,73.25) and (335.25,76.22) .. (335.25,79.88) .. controls (335.25,83.53) and (332.28,86.5) .. (328.63,86.5) .. controls (324.97,86.5) and (322,83.53) .. (322,79.88) -- cycle ;
%Shape: Circle [id:dp042043022281902376] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (373.5,104.88) .. controls (373.5,101.22) and (376.47,98.25) .. (380.13,98.25) .. controls (383.78,98.25) and (386.75,101.22) .. (386.75,104.88) .. controls (386.75,108.53) and (383.78,111.5) .. (380.13,111.5) .. controls (376.47,111.5) and (373.5,108.53) .. (373.5,104.88) -- cycle ;
%Shape: Circle [id:dp7213397614579665] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (406,120.38) .. controls (406,116.72) and (408.97,113.75) .. (412.63,113.75) .. controls (416.28,113.75) and (419.25,116.72) .. (419.25,120.38) .. controls (419.25,124.03) and (416.28,127) .. (412.63,127) .. controls (408.97,127) and (406,124.03) .. (406,120.38) -- cycle ;
%Straight Lines [id:da24130497928153327] 
\draw [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ]   (195,53.75) -- (468.5,236.75) ;
%Straight Lines [id:da018442435199803597] 
\draw [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ] [dash pattern={on 0.84pt off 2.51pt}]  (163,87.25) -- (415,254.25) ;
%Straight Lines [id:da8988625347181801] 
\draw [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ] [dash pattern={on 0.84pt off 2.51pt}]  (227.5,22.75) -- (467.5,187.75) ;
%Straight Lines [id:da7951430137319083] 
\draw [color={rgb, 255:red, 144; green, 19; blue, 254 }  ,draw opacity=1 ]   (383,178.75) -- (358.21,211.16) ;
\draw [shift={(357,212.75)}, rotate = 307.40999999999997] [color={rgb, 255:red, 144; green, 19; blue, 254 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da47541606275337067] 
\draw [color={rgb, 255:red, 144; green, 19; blue, 254 }  ,draw opacity=1 ]   (383,178.75) -- (404.3,150.35) ;
\draw [shift={(405.5,148.75)}, rotate = 486.87] [color={rgb, 255:red, 144; green, 19; blue, 254 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Square [id:dp582014665137361] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (331,215) -- (341,215) -- (341,225) -- (331,225) -- cycle ;
%Shape: Square [id:dp7351590540629533] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (287.5,177) -- (297.5,177) -- (297.5,187) -- (287.5,187) -- cycle ;
%Shape: Square [id:dp6363191214824445] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (346,108.5) -- (356,108.5) -- (356,118.5) -- (346,118.5) -- cycle ;
%Shape: Circle [id:dp8084153525154247] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (259.5,70.38) .. controls (259.5,66.72) and (262.47,63.75) .. (266.13,63.75) .. controls (269.78,63.75) and (272.75,66.72) .. (272.75,70.38) .. controls (272.75,74.03) and (269.78,77) .. (266.13,77) .. controls (262.47,77) and (259.5,74.03) .. (259.5,70.38) -- cycle ;
%Shape: Circle [id:dp016747517759887365] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (313,203.38) .. controls (313,199.72) and (315.97,196.75) .. (319.63,196.75) .. controls (323.28,196.75) and (326.25,199.72) .. (326.25,203.38) .. controls (326.25,207.03) and (323.28,210) .. (319.63,210) .. controls (315.97,210) and (313,207.03) .. (313,203.38) -- cycle ;
%Shape: Square [id:dp6589351747801206] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (221.5,111) -- (231.5,111) -- (231.5,121) -- (221.5,121) -- cycle ;
%Shape: Circle [id:dp9527276371732271] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (289,133.88) .. controls (289,130.22) and (291.97,127.25) .. (295.63,127.25) .. controls (299.28,127.25) and (302.25,130.22) .. (302.25,133.88) .. controls (302.25,137.53) and (299.28,140.5) .. (295.63,140.5) .. controls (291.97,140.5) and (289,137.53) .. (289,133.88) -- cycle ;
%Shape: Circle [id:dp027146634060734698] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (361,74.38) .. controls (361,70.72) and (363.97,67.75) .. (367.63,67.75) .. controls (371.28,67.75) and (374.25,70.72) .. (374.25,74.38) .. controls (374.25,78.03) and (371.28,81) .. (367.63,81) .. controls (363.97,81) and (361,78.03) .. (361,74.38) -- cycle ;
%Shape: Circle [id:dp0005858333226763657] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (396,80.88) .. controls (396,77.22) and (398.97,74.25) .. (402.63,74.25) .. controls (406.28,74.25) and (409.25,77.22) .. (409.25,80.88) .. controls (409.25,84.53) and (406.28,87.5) .. (402.63,87.5) .. controls (398.97,87.5) and (396,84.53) .. (396,80.88) -- cycle ;
%Shape: Circle [id:dp875011269786432] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (320,42.88) .. controls (320,39.22) and (322.97,36.25) .. (326.63,36.25) .. controls (330.28,36.25) and (333.25,39.22) .. (333.25,42.88) .. controls (333.25,46.53) and (330.28,49.5) .. (326.63,49.5) .. controls (322.97,49.5) and (320,46.53) .. (320,42.88) -- cycle ;
%Straight Lines [id:da6997040311272985] 
\draw [color={rgb, 255:red, 245; green, 166; blue, 35 }  ,draw opacity=1 ]   (141,131) -- (287,133.84) ;
\draw [shift={(289,133.88)}, rotate = 181.11] [color={rgb, 255:red, 245; green, 166; blue, 35 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da6947878114702344] 
\draw [color={rgb, 255:red, 245; green, 166; blue, 35 }  ,draw opacity=1 ]   (141,131) -- (343.51,114.41) ;
\draw [shift={(345.5,114.25)}, rotate = 535.3199999999999] [color={rgb, 255:red, 245; green, 166; blue, 35 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da9309440088182815] 
\draw [color={rgb, 255:red, 245; green, 166; blue, 35 }  ,draw opacity=1 ]   (141,131) -- (310.14,198.01) ;
\draw [shift={(312,198.75)}, rotate = 201.61] [color={rgb, 255:red, 245; green, 166; blue, 35 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (26.5,100.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.96,0.65,0.14}{Apply some sort}\\\textcolor[rgb]{0.96,0.65,0.14}{of penalty for all}\\\textcolor[rgb]{0.96,0.65,0.14}{cases over the line}};


\end{tikzpicture}
}

\subsubsection{If the Decision Boundary Is Not Linear}

If the decision boundary itself is not linear, e.g. if we have something like this, it's not conventional to then establish some sort of bendy-looking hyperplane, like below.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Straight Lines [id:da7593240158764352] 
\draw    (191,237.33) -- (191,23.75) ;
\draw [shift={(191,21.75)}, rotate = 450] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da9314079217566626] 
\draw    (191,237.33) -- (450,237.33) ;
\draw [shift={(452,237.33)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Square [id:dp6569522950579582] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (207,199.5) -- (217,199.5) -- (217,209.5) -- (207,209.5) -- cycle ;
%Shape: Square [id:dp6645625332270019] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (227,219.5) -- (237,219.5) -- (237,229.5) -- (227,229.5) -- cycle ;
%Shape: Square [id:dp5344188467985732] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (227.5,204) -- (237.5,204) -- (237.5,214) -- (227.5,214) -- cycle ;
%Shape: Square [id:dp9711908633057964] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (225,157) -- (235,157) -- (235,167) -- (225,167) -- cycle ;
%Shape: Square [id:dp16440098376925016] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (291,191.5) -- (301,191.5) -- (301,201.5) -- (291,201.5) -- cycle ;
%Shape: Square [id:dp6295522422798431] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (256.5,222) -- (266.5,222) -- (266.5,232) -- (256.5,232) -- cycle ;
%Shape: Circle [id:dp8088147286543861] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (354,141.71) .. controls (354,138.05) and (356.97,135.08) .. (360.63,135.08) .. controls (364.28,135.08) and (367.25,138.05) .. (367.25,141.71) .. controls (367.25,145.37) and (364.28,148.33) .. (360.63,148.33) .. controls (356.97,148.33) and (354,145.37) .. (354,141.71) -- cycle ;
%Shape: Circle [id:dp1992577358895542] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (328.83,178.71) .. controls (328.83,175.05) and (331.8,172.08) .. (335.46,172.08) .. controls (339.12,172.08) and (342.08,175.05) .. (342.08,178.71) .. controls (342.08,182.37) and (339.12,185.33) .. (335.46,185.33) .. controls (331.8,185.33) and (328.83,182.37) .. (328.83,178.71) -- cycle ;
%Shape: Circle [id:dp16246109141935272] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (307.33,137.37) .. controls (307.33,133.72) and (310.3,130.75) .. (313.96,130.75) .. controls (317.62,130.75) and (320.58,133.72) .. (320.58,137.37) .. controls (320.58,141.03) and (317.62,144) .. (313.96,144) .. controls (310.3,144) and (307.33,141.03) .. (307.33,137.37) -- cycle ;
%Shape: Circle [id:dp8662536973301378] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (326.5,108.38) .. controls (326.5,104.72) and (329.47,101.75) .. (333.13,101.75) .. controls (336.78,101.75) and (339.75,104.72) .. (339.75,108.38) .. controls (339.75,112.03) and (336.78,115) .. (333.13,115) .. controls (329.47,115) and (326.5,112.03) .. (326.5,108.38) -- cycle ;
%Shape: Circle [id:dp05370372919991773] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (380.67,168.88) .. controls (380.67,165.22) and (383.63,162.25) .. (387.29,162.25) .. controls (390.95,162.25) and (393.92,165.22) .. (393.92,168.88) .. controls (393.92,172.53) and (390.95,175.5) .. (387.29,175.5) .. controls (383.63,175.5) and (380.67,172.53) .. (380.67,168.88) -- cycle ;
%Shape: Square [id:dp00830661912116315] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (261.67,56.33) -- (271.67,56.33) -- (271.67,66.33) -- (261.67,66.33) -- cycle ;
%Shape: Square [id:dp7723754051562635] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (370.33,55) -- (380.33,55) -- (380.33,65) -- (370.33,65) -- cycle ;
%Shape: Square [id:dp520347148909745] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (436.33,117) -- (446.33,117) -- (446.33,127) -- (436.33,127) -- cycle ;
%Shape: Square [id:dp08076004458411945] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (390.33,75) -- (400.33,75) -- (400.33,85) -- (390.33,85) -- cycle ;
%Shape: Circle [id:dp8683207709929821] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (282.5,93.04) .. controls (282.5,89.38) and (285.47,86.42) .. (289.13,86.42) .. controls (292.78,86.42) and (295.75,89.38) .. (295.75,93.04) .. controls (295.75,96.7) and (292.78,99.67) .. (289.13,99.67) .. controls (285.47,99.67) and (282.5,96.7) .. (282.5,93.04) -- cycle ;
%Shape: Square [id:dp34029863343790434] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (245,177) -- (255,177) -- (255,187) -- (245,187) -- cycle ;
%Shape: Square [id:dp8863610232387268] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (221.67,107.67) -- (231.67,107.67) -- (231.67,117.67) -- (221.67,117.67) -- cycle ;
%Shape: Square [id:dp1334661665603727] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (241.67,127.67) -- (251.67,127.67) -- (251.67,137.67) -- (241.67,137.67) -- cycle ;
%Shape: Circle [id:dp33186013093871825] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (302.5,113.04) .. controls (302.5,109.38) and (305.47,106.42) .. (309.13,106.42) .. controls (312.78,106.42) and (315.75,109.38) .. (315.75,113.04) .. controls (315.75,116.7) and (312.78,119.67) .. (309.13,119.67) .. controls (305.47,119.67) and (302.5,116.7) .. (302.5,113.04) -- cycle ;
%Shape: Circle [id:dp5459936486339871] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (327.33,157.37) .. controls (327.33,153.72) and (330.3,150.75) .. (333.96,150.75) .. controls (337.62,150.75) and (340.58,153.72) .. (340.58,157.37) .. controls (340.58,161.03) and (337.62,164) .. (333.96,164) .. controls (330.3,164) and (327.33,161.03) .. (327.33,157.37) -- cycle ;
%Shape: Circle [id:dp19118451063184194] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (348.83,198.71) .. controls (348.83,195.05) and (351.8,192.08) .. (355.46,192.08) .. controls (359.12,192.08) and (362.08,195.05) .. (362.08,198.71) .. controls (362.08,202.37) and (359.12,205.33) .. (355.46,205.33) .. controls (351.8,205.33) and (348.83,202.37) .. (348.83,198.71) -- cycle ;
%Curve Lines [id:da7185389460482552] 
\draw [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ]   (353,238.67) .. controls (156.33,18) and (328,22.67) .. (471.67,220.67) ;

% Text Node
\draw (334,253) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle x_{1}$};
% Text Node
\draw (154.67,130.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle x_{2}$};


\end{tikzpicture}

}

\hfill \break Instead, we can simply transform the data into higher dimensional space, and attempt to establish a hyperplane there.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Straight Lines [id:da4976815129846708] 
\draw    (211,237.33) -- (211,23.75) ;
\draw [shift={(211,21.75)}, rotate = 450] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da6215468482044874] 
\draw    (211,237.33) -- (470,237.33) ;
\draw [shift={(472,237.33)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Square [id:dp7650732726962683] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (227,199.5) -- (237,199.5) -- (237,209.5) -- (227,209.5) -- cycle ;
%Shape: Square [id:dp2542887845385543] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (274.33,192.83) -- (284.33,192.83) -- (284.33,202.83) -- (274.33,202.83) -- cycle ;
%Shape: Square [id:dp002216367259890495] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (252.33,184.17) -- (262.33,184.17) -- (262.33,194.17) -- (252.33,194.17) -- cycle ;
%Shape: Square [id:dp4889930310875269] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (366.5,72.67) -- (376.5,72.67) -- (376.5,82.67) -- (366.5,82.67) -- cycle ;
%Shape: Circle [id:dp5040609804719949] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (368.67,213.38) .. controls (368.67,209.72) and (371.63,206.75) .. (375.29,206.75) .. controls (378.95,206.75) and (381.92,209.72) .. (381.92,213.38) .. controls (381.92,217.03) and (378.95,220) .. (375.29,220) .. controls (371.63,220) and (368.67,217.03) .. (368.67,213.38) -- cycle ;
%Shape: Circle [id:dp21530560373765883] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (437.17,195.71) .. controls (437.17,192.05) and (440.13,189.08) .. (443.79,189.08) .. controls (447.45,189.08) and (450.42,192.05) .. (450.42,195.71) .. controls (450.42,199.37) and (447.45,202.33) .. (443.79,202.33) .. controls (440.13,202.33) and (437.17,199.37) .. (437.17,195.71) -- cycle ;
%Shape: Square [id:dp5926863681290168] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (339,100.33) -- (349,100.33) -- (349,110.33) -- (339,110.33) -- cycle ;
%Shape: Square [id:dp4836124599093601] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (390.33,55) -- (400.33,55) -- (400.33,65) -- (390.33,65) -- cycle ;
%Shape: Square [id:dp38407194535659794] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (431,35.67) -- (441,35.67) -- (441,45.67) -- (431,45.67) -- cycle ;
%Shape: Square [id:dp057368776129545385] 
\draw  [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ] (410.33,75) -- (420.33,75) -- (420.33,85) -- (410.33,85) -- cycle ;
%Shape: Circle [id:dp686335363647088] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (379.83,183.04) .. controls (379.83,179.38) and (382.8,176.42) .. (386.46,176.42) .. controls (390.12,176.42) and (393.08,179.38) .. (393.08,183.04) .. controls (393.08,186.7) and (390.12,189.67) .. (386.46,189.67) .. controls (382.8,189.67) and (379.83,186.7) .. (379.83,183.04) -- cycle ;
%Shape: Circle [id:dp9338176478167173] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (413.17,200.38) .. controls (413.17,196.72) and (416.13,193.75) .. (419.79,193.75) .. controls (423.45,193.75) and (426.42,196.72) .. (426.42,200.38) .. controls (426.42,204.03) and (423.45,207) .. (419.79,207) .. controls (416.13,207) and (413.17,204.03) .. (413.17,200.38) -- cycle ;
%Shape: Circle [id:dp7380340003829592] 
\draw  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ] (350.17,196.71) .. controls (350.17,193.05) and (353.13,190.08) .. (356.79,190.08) .. controls (360.45,190.08) and (363.42,193.05) .. (363.42,196.71) .. controls (363.42,200.37) and (360.45,203.33) .. (356.79,203.33) .. controls (353.13,203.33) and (350.17,200.37) .. (350.17,196.71) -- cycle ;
%Straight Lines [id:da4236226895450327] 
\draw [color={rgb, 255:red, 65; green, 117; blue, 5 }  ,draw opacity=1 ]   (259.33,247.67) -- (483.67,48.67) ;

% Text Node
\draw (354,253) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle x_{1}$};
% Text Node
\draw (130.67,126.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle ( x_{1} +x_{2})^{4}$};


\end{tikzpicture}

}

\subsubsection{SVMs in Scikit-Learn}

\texttt{sklearn} also has the ability to produce SVM classifiers.\\


{\centering
\begin{lstlisting}[language=python]
from sklearn import svm

# Fit a default SVM classifier to fake data
X = [[0, 0], [1, 1]]
y = [0, 1]

clf = svm.SVC()
clf.fit(X, y)
\end{lstlisting}
}

\section{Nonlinear Regression \& Regularization (L22 + 23)}

So far, we have known linear regression as just 'drawing a line through our data', but it was in actuality a function of our input features.\\

It may seem limited, but when you apply linear regression algorithms to non-linear features of input data, you can essentially create a non-linear prediction.\\

Our \textbf{linear regression algorithm can be generalized to higher degree, non-linear functions as well}.

\subsection{Generalization Error}

The problem with the canonical machine learning problem (discussed earlier, is the following:

$$ \underbrace{\sum_{i=1}^{m}{l((h_{\theta}(x^i)), y^i)}}_{\mathrm{minimize}}$$

The whole point is to minimize the error on the \textit{training} data set, not on the \textit{validation set}. I.e. we're optimizing for the given training data by definition, not the new data that will be provided.\\

\subsubsection{Training vs. Generalization Loss}

Here, we should make an important distinction between training loss and generalization loss.\\

As an example, as we increase the degree of our regression function, from $1$ to $2$ to $n$, we have to understand that\textbf{ at a certain point although we become more and more accurate in fitting a function to the training data, we start losing accuracy with new data that we need to classify.}\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Straight Lines [id:da05956360553502815] 
\draw    (191,237.33) -- (191,23.75) ;
\draw [shift={(191,21.75)}, rotate = 450] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da951863238215068] 
\draw    (191,237.33) -- (450,237.33) ;
\draw [shift={(452,237.33)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Curve Lines [id:da37960579489360324] 
\draw [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,draw opacity=1 ]   (191.67,48) .. controls (263.67,200.67) and (344.33,218.67) .. (439.67,224.67) ;
%Curve Lines [id:da6900295471133798] 
\draw [color={rgb, 255:red, 208; green, 2; blue, 27 }  ,draw opacity=1 ]   (192.33,44) .. controls (264.33,196.67) and (351,232) .. (439.67,165.33) ;

% Text Node
\draw (138,123) node [anchor=north west][inner sep=0.75pt]   [align=left] {Loss};
% Text Node
\draw (259.33,259) node [anchor=north west][inner sep=0.75pt]   [align=left] {Model Complexity};
% Text Node
\draw (239.14,86.5) node [anchor=north west][inner sep=0.75pt]  [rotate=-47.7] [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{Generalization}};
% Text Node
\draw (231.02,136.03) node [anchor=north west][inner sep=0.75pt]  [rotate=-45.18] [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{Training}};


\end{tikzpicture}


}

\subsubsection{Cross-Validation}

Although it is difficult to quantify the true generalization error, we can approximate it using \textbf{holdout cross-validation}.\\

We want to split the dataset into a training set and a 'holdout' set- we train our algorithm on the training set and evaluate on the holdout set.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Rectangle [id:dp8876375748948402] 
\draw  [fill={rgb, 255:red, 184; green, 233; blue, 134 }  ,fill opacity=1 ] (101.33,112.67) -- (571,112.67) -- (571,181.33) -- (101.33,181.33) -- cycle ;
%Shape: Rectangle [id:dp8252875670245605] 
\draw  [fill={rgb, 255:red, 80; green, 227; blue, 194 }  ,fill opacity=1 ] (101.33,112.67) -- (409,112.67) -- (409,181.33) -- (101.33,181.33) -- cycle ;

% Text Node
\draw (177.33,138) node [anchor=north west][inner sep=0.75pt]   [align=left] {Training Set (E.g. $\displaystyle 70\%)$};
% Text Node
\draw (425.33,127.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Holdout / Validation\\set (E.g. $\displaystyle 30\%)$};
% Text Node
\draw (96.67,199) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle \underbrace{\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ }_{All\ data}$};


\end{tikzpicture}


}

\subsection{Regularization}

We have previously seen that the degree of a polynomial acts as a natural measure of the 'complexity' of a model: \textbf{higher degree polynomials are more complex}.\\

However, we can notice that fitting these models also requires \textbf{extremely large coefficients on these very high degree polynomials}. Thus, we can look at an alternative way to control model complexity: by keeping weights small.

\subsection{Unsupervised Learning}

Previously, we learned about the Supervised Learning paradigm. Here it is again.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Straight Lines [id:da6934013484644858] 
\draw    (59,40.33) -- (190.33,40.33) ;
%Straight Lines [id:da22235545874167506] 
\draw    (255,40.33) -- (386.33,40.33) ;
%Straight Lines [id:da7620370986062277] 
\draw    (447.67,40.33) -- (579,40.33) ;
%Straight Lines [id:da5092396514647811] 
\draw    (315.67,153.33) -- (331.83,90.6) ;
\draw [shift={(332.33,88.67)}, rotate = 464.45] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Rectangle [id:dp7402750968279076] 
\draw   (48.67,5.33) -- (205,5.33) -- (205,296.67) -- (48.67,296.67) -- cycle ;
%Shape: Rectangle [id:dp05887269365061776] 
\draw   (242,5.33) -- (398.33,5.33) -- (398.33,296.67) -- (242,296.67) -- cycle ;
%Shape: Rectangle [id:dp2515662723024984] 
\draw   (433.33,6.67) -- (589.67,6.67) -- (589.67,298) -- (433.33,298) -- cycle ;
%Straight Lines [id:da11949475196221093] 
\draw    (205,148.33) -- (239.67,148.33) ;
\draw [shift={(241.67,148.33)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da6197911001911289] 
\draw    (397.67,148.33) -- (430.33,148.33) ;
\draw [shift={(432.33,148.33)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (82,19.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Training Data};
% Text Node
\draw (276.67,19.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {ML Algorithm};
% Text Node
\draw (474.67,19.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Predictions};
% Text Node
\draw (87.33,49.73) node [anchor=north west][inner sep=0.75pt]    {$\left( x^{1} ,y^{1}\right)$};
% Text Node
\draw (68,81.73) node [anchor=north west][inner sep=0.75pt]    {$\left( x^{2} ,y^{2}\right)$};
% Text Node
\draw (85.33,137.73) node [anchor=north west][inner sep=0.75pt]    {$\left( x^{4} ,y^{4}\right)$};
% Text Node
\draw (112.67,105.73) node [anchor=north west][inner sep=0.75pt]    {$\left( x^{3} ,y^{3}\right)$};
% Text Node
\draw (116,169.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {.\\.\\.};
% Text Node
\draw (88,239.07) node [anchor=north west][inner sep=0.75pt]    {$\left( x^{n} ,y^{n}\right)$};
% Text Node
\draw (278.67,157.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {Hypothesis\\function\\formed\\based on \\the given\\training data};
% Text Node
\draw (284,60) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle y^{i}$$\displaystyle =h\left( x^{i}\right)$};
% Text Node
\draw (451.33,50.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {Given a new input};
% Text Node
\draw (497.33,73.07) node [anchor=north west][inner sep=0.75pt]    {$x_{new}$};
% Text Node
\draw (450,107.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {The algorithm will\\predict what it\\ends up as};
% Text Node
\draw (462,179.07) node [anchor=north west][inner sep=0.75pt]    {$y_{new} =h( x_{new})$};

$$


\end{tikzpicture}


}


\hfill \break Now, we can introduce the Unsupervised Learning paradigm. It's basically \textbf{equivalent to the Supervised Learning paradigm, but our training data no longer has valid answers mapped to it}.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Straight Lines [id:da6934013484644858] 
\draw    (59,40.33) -- (190.33,40.33) ;
%Straight Lines [id:da22235545874167506] 
\draw    (255,40.33) -- (386.33,40.33) ;
%Straight Lines [id:da7620370986062277] 
\draw    (447.67,40.33) -- (579,40.33) ;
%Straight Lines [id:da5092396514647811] 
\draw    (315.67,153.33) -- (331.83,90.6) ;
\draw [shift={(332.33,88.67)}, rotate = 464.45] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Rectangle [id:dp7402750968279076] 
\draw   (48.67,5.33) -- (205,5.33) -- (205,296.67) -- (48.67,296.67) -- cycle ;
%Shape: Rectangle [id:dp05887269365061776] 
\draw   (242,5.33) -- (398.33,5.33) -- (398.33,296.67) -- (242,296.67) -- cycle ;
%Shape: Rectangle [id:dp2515662723024984] 
\draw   (433.33,6.67) -- (589.67,6.67) -- (589.67,298) -- (433.33,298) -- cycle ;
%Straight Lines [id:da11949475196221093] 
\draw    (205,148.33) -- (239.67,148.33) ;
\draw [shift={(241.67,148.33)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da6197911001911289] 
\draw    (397.67,148.33) -- (430.33,148.33) ;
\draw [shift={(432.33,148.33)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (82,19.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Training Data};
% Text Node
\draw (276.67,19.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {ML Algorithm};
% Text Node
\draw (474.67,19.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {Predictions};
% Text Node
\draw (87.33,49.73) node [anchor=north west][inner sep=0.75pt]    {$\left( x^{1}\right)$};
% Text Node
\draw (68,81.73) node [anchor=north west][inner sep=0.75pt]    {$\left( x^{2}\right)$};
% Text Node
\draw (85.33,137.73) node [anchor=north west][inner sep=0.75pt]    {$\left( x^{4}\right)$};
% Text Node
\draw (112.67,105.73) node [anchor=north west][inner sep=0.75pt]    {$\left( x^{3}\right)$};
% Text Node
\draw (116,169.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {.\\.\\.};
% Text Node
\draw (88,239.07) node [anchor=north west][inner sep=0.75pt]    {$\left( x^{n}\right)$};
% Text Node
\draw (278.67,157.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {Hypothesis\\function\\formed\\based on \\the given\\training data};
% Text Node
\draw (284,60) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle y^{i}$$\displaystyle =h\left( x^{i}\right)$};
% Text Node
\draw (451.33,50.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {Given a new input};
% Text Node
\draw (497.33,73.07) node [anchor=north west][inner sep=0.75pt]    {$x_{new}$};
% Text Node
\draw (450,107.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {The algorithm will\\predict what it\\ends up as};
% Text Node
\draw (462,179.07) node [anchor=north west][inner sep=0.75pt]    {$y_{new} =h( x_{new})$};

$$


\end{tikzpicture}


}

\hfill \break In a less formal sense, virtually all unsupervised learning algorithms can be considered in the same manner as supervised learning.\\

Define a hypothesis function, define a loss function, and define how to optimize the loss function. The question is: \textbf{What do the hypothesis function and loss function signify in unsupervised learning?}\\

The goal of unsupervised learning is to pick some restricted class of hypothesis functions that extract some kind of \textbf{structure} from the data that isn't explicitly given to us. Examples of this type of algorithm are K-means and Principal Component Analysis.\\

\subsection{K-Means Clustering}

The K-Means Algorithm is easy to visualize. If we are given some collection of data points, we want to find $k$ 'center' points such that all points are close to at least one of these 'centers'.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Circle [id:dp49730200176650663] 
\draw   (184.5,122) .. controls (184.5,119.51) and (182.49,117.5) .. (180,117.5) .. controls (177.51,117.5) and (175.5,119.51) .. (175.5,122) .. controls (175.5,124.49) and (177.51,126.5) .. (180,126.5) .. controls (182.49,126.5) and (184.5,124.49) .. (184.5,122) -- cycle ;
%Straight Lines [id:da019959509989611823] 
\draw    (136,188.75) -- (175.39,129.91) ;
\draw [shift={(176.5,128.25)}, rotate = 483.8] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Circle [id:dp6027620182293444] 
\draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ] (431.5,158.5) .. controls (431.5,156.01) and (429.49,154) .. (427,154) .. controls (424.51,154) and (422.5,156.01) .. (422.5,158.5) .. controls (422.5,160.99) and (424.51,163) .. (427,163) .. controls (429.49,163) and (431.5,160.99) .. (431.5,158.5) -- cycle ;
%Straight Lines [id:da47278888790990925] 
\draw [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ]   (383,225.25) -- (422.39,166.41) ;
\draw [shift={(423.5,164.75)}, rotate = 483.8] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (122,127.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{x}};
% Text Node
\draw (142,147.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{x}};
% Text Node
\draw (161.33,107) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{x}};
% Text Node
\draw (181.33,153.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{x}};
% Text Node
\draw (210,91.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{x}};
% Text Node
\draw (227.33,115) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{x}};
% Text Node
\draw (126.67,85) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{x}};
% Text Node
\draw (184.67,70.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{x}};
% Text Node
\draw (121,187.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle \mu _{1}$};
% Text Node
\draw (373.67,121.5) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,opacity=1 ] [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{x}};
% Text Node
\draw (368,224) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle \mu _{2}$};
% Text Node
\draw (393.67,141.5) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,opacity=1 ] [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{x}};
% Text Node
\draw (426.67,176) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,opacity=1 ] [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{x}};
% Text Node
\draw (467.17,117.5) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,opacity=1 ] [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{x}};
% Text Node
\draw (464.17,153.5) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,opacity=1 ] [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{x}};
% Text Node
\draw (380.67,177) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,opacity=1 ] [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{x}};
% Text Node
\draw (414.17,97) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,opacity=1 ] [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{x}};
% Text Node
\draw (464.17,193.5) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,opacity=1 ] [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{x}};
% Text Node
\draw (410.67,193) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 74; green, 144; blue, 226 }  ,opacity=1 ] [align=left] {\textcolor[rgb]{0.29,0.56,0.89}{x}};


\end{tikzpicture}


}


\hfill \break Here, The centers that we have identified in this 2-means clustering variant are labeled with $\mu$. he 'centers' are known as centroids. We can formally define clusters and centroids here.

\hfill \begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Cluster} $\rightarrow$ A collection of data points aggregated together because of certain similarities.\\
\textbf{Centroid} $\rightarrow$ The imaginary or real location representing the center of the cluster.
\end{tcolorbox}


\subsubsection{How K-Means fits Into the Unsupervised Framework Paradigm}

\begin{itemize}
	\item \textbf{Parameters} of K-Means are the choice of centers $\theta = \{\mu_1, ... \mu_k\}$ where $\mu_i \in \mathbb{R}^n$.
	\item \textbf{Hypothesis} function is a function that decides the center closest to a point $x$. Formally: $$ h_{\theta}(x)= \underbrace{argmin}_{\mu \in \{\mu_1, ... \mu_k\}} ||\mu -x||_2^2 $$
	\item \textbf{Loss function} is squared error between input and hypothesis. Formally: $$l(h_{\theta}(x),x)=||h_{\theta}(x)-x||_2^2$$
	\item \textbf{Optimization problem} will therefore be: $$ \underbrace{minimize}_{\text{for } \mu_1, ... \mu_k} \sum_{i=1}^{m}{||h_{\theta}(x_i)-x_i||_2^2}$$
\end{itemize}

\subsubsection{K-Means Clustering Algorithm}

To process learning data, the K-Means algorithm starts with a group of randomly selected centroids, which are used as the beginning points for every cluster, then performs iterative, repetetive calculations to optimize the positions of the centroids.\\

It \textbf{halts} when either the \textbf{centroids have 'stabilized'}, meaning that there is (basically) no change in their values because the clustering has been successful, or \textbf{when the defined number of iterations has been completed}.

\subsubsection{How Do We Select $k$?}

There is no \textit{right} way to select $k$. Note that \textbf{a larger $k$ virtually always will have lower loss than smaller $k$}, even on a hold out/validation set.\\

Insetad, it's common to look at the loss function as a function of increasing $k$, and then you just stop when things look "good".\\

There are other heuristic ways to do this, but they actually don't convincingly outperform this simple method.

\subsubsection{Example of K-Means Clustering}

K-Means Clustering is just an example of Unsupervised learning at the end of the day, so this is also a great example for unsupervised learning as well.\\

The following example is K-Means Clustering performed on the MNIST digit classification dataset. Here are some examples from the actual dataset:\\

{
\centering

\includegraphics[scale=0.35]{img/mnistdata.png}

}

\hfill \break K-Means can be run with $k=50$ on this given dataset, and here's the results it produces.\\

{
\centering

\includegraphics[scale=0.25]{img/mnistk50.png}

}

\hfill \break It is evident that this data was just categorized, and not 'classified' under any label. Such is the nuance of unsupervised learning.\\

\subsection{Principal Component Analysis}

\textbf{Large datasets} are increasingly common and are often \textbf{difficult to interpret}. More on this excellent example (along with interactive components) can be found at the following website:\\

\texttt{https://setosa.io/ev/principal-component-analysis/}

\hfill \begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Principal Component Analysis} $\rightarrow$ technique for reducing the dimensionality of large and complex datasets.
\end{tcolorbox}

Principal Component Analysis, or \textbf{PCA}, accomplishes this by creating new (uncorrelated) variables that successively maximize variance.\\

It provides us the answer to the question: \textbf{Can we combine raw features into new features that yield a simpler description of the same system?}

\subsubsection{3-Dimensional Example}

PCA really reveals its usefulness in 3 dimensions, because it's hard to see through a cloud of data. In the following example, the original data are plotted in 3d.\\

{
\centering

\includegraphics[scale=0.65]{img/pca3dbefore.png} 

}

\hfill \break You can project this data into 2 dimensions using a transformation. (This particular transformation is no different than finding a different camera angle: rotate the axes to find a better angle.)\\

So what's this 'better' angle? Usually, the one with the most variance. Here's a visual explanation. Here's the above dataset projected into 2 dimensions (unchanged).\\

{
\centering

\includegraphics[scale=0.65]{img/pca2dbefore.png} 

}

Our goal is the apply a transformation (adjust the camera angle) such that the 3 dimensions that are available to us are arranged as such:

\begin{itemize}
	\item The horizontal axis, PC1, has the most variation
	\item The vertical axis, PC2, has the second most
	\item The third axis, PC3, has the least
\end{itemize}

Once we've found this optimal camera angle that ensures these three conditions are met with the most variance in PC1 and the least in PC3, we have something like this:\\

{
\centering

\includegraphics[scale=0.65]{img/pca3dafter.png} 

}

\hfill \break The real beauty of this transformation can be seen when we return to the two dimensional point of view, where it can be observed that we've maximized variance.\\

{
\centering

\includegraphics[scale=0.65]{img/pca2dafter.png} 

}

\hfill \break From here, in order to reduce dimensionality from 3 to 2, we simply just drop PC3. Below is an alternative visualization of the data.\\

First, let's examine the variance given the default configuration of the axes, with no PCA applied.\\


{
\centering

\includegraphics[scale=0.5]{img/pcaanalysisbefore.png} 

}

\hfill \break After the transformation, we can see that with the new axes specified by PCA, the variance in PC1 is maximized.\\

{
\centering

\includegraphics[scale=0.5]{img/pcaanalysisafter.png}

}

\subsubsection{17-Dimensional Example}

More realistically, our data will have far more dimensions (and thus, it won't be such a huge and clear loss of information to move down a few dimensions, relative to what we're given.)\\

In the given table is data for the average consumption of 17 types of food per person per week for every country in the U.K. (\textit{grams per person})\\

{
\centering

\includegraphics[scale=0.6]{img/17dtable.png} 

}

\hfill \break We can glean some information from this table, and it shows us some interesting variations, but \textbf{overall differences aren't very notable}. PCA may be able to help us eliminate dimensions to help us emphasize how countries differ.\\

E.g. here is the plot of the data for the first principal component. We can see that Northern Ireland is looking different from the others.


\hfill \break (This is a number line, as we're only observing a one dimensional slice of the 17 dimensional data)

{
\centering

\includegraphics[scale=0.6]{img/17dpc1.png} 

}

\hfill \break We can take this analysis further by looking at the first and second principal components.\\

{
\centering

\includegraphics[scale=0.6]{img/17dpc2.png} 

}

\hfill \break We can see that in the first and second principal components, Northern Ireland is a major outlier. This makes sense: the Northern Irish have such differing diets from the rest of these countries because it's the only one of the four countries not found on the island of Great Britain.\\

We've come to a conclusion that reflects real-world geography.

\subsection{General Notes}

Unsupervised learning methods are useful for Exploratory Data Analysis. You're able to cluster or reduce your data to a few dimensions and visualize. Usually, this stuff is a means to an end, not an end within itself.\\

Examples:

\begin{itemize}
	\item Run PCA and transform your data to optimize for variance while reducing dimensions, then run an appropriate supervised classifier.
	\item Use k-Means to set up radial basis functions by getting $k$ centers, then creating the RBF Features as needed.
\end{itemize}


\subsection{Recommender Systems}

In simple terms, a recommender system is just a system that predicts a user's rating of an item.\\

Little bit of a mix between supervised learning and unsupervised learning.
\begin{itemize}
	\item Like \textbf{Supervised Learning} in that we have some semblance of labels there
	\item Like \textbf{Unsupervised Learning} in that we are trying to find some latent structure within the data
\end{itemize}

Generally, this sort of analysis falls somewhere in between information filtering and retrieval. However, this \textbf{does not stop us} from phrasing the problem in terms of hypothesis classes, loss functions, and optimization problems.

\subsection{Collaborative Filtering}

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Collaborative Filtering} $\rightarrow$ recommender systems that predict based only on the expressed preferences of other users for an item. 
\end{tcolorbox}

\subsubsection{Overview}
\hfill \break Collaborative filtering is a technique that can filter out items that a user might like on the basis of reactions by similar users. It works by searching a large group of people and finding a smaller set of users with tastes similar to a particular user.\\

Note the following visual example, where users 1 through 4 have ratings for items 1 through 4 if applicable.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Square [id:dp9693485852887019] 
\draw   (198.33,53.67) -- (363,53.67) -- (363,218.33) -- (198.33,218.33) -- cycle ;
%Straight Lines [id:da06411415670338638] 
\draw    (489.67,79.33) -- (362.17,24.79) ;
\draw [shift={(360.33,24)}, rotate = 383.15999999999997] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Curve Lines [id:da3176791358547847] 
\draw    (426.33,186) .. controls (325.51,332.6) and (170.56,317.49) .. (166.22,225.39) ;
\draw [shift={(166.17,224)}, rotate = 448.26] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (218.67,15.4) node [anchor=north west][inner sep=0.75pt]    {$i_{1}$};
% Text Node
\draw (254.33,15.4) node [anchor=north west][inner sep=0.75pt]    {$i_{2}$};
% Text Node
\draw (297.5,15.4) node [anchor=north west][inner sep=0.75pt]    {$i_{3}$};
% Text Node
\draw (336,15.4) node [anchor=north west][inner sep=0.75pt]    {$i_{4}$};
% Text Node
\draw (156.67,70.73) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,opacity=1 ]  {$u_{1}$};
% Text Node
\draw (156.67,107.73) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,opacity=1 ]  {$u_{2}$};
% Text Node
\draw (156.67,140.9) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,opacity=1 ]  {$u_{3}$};
% Text Node
\draw (156.67,177.4) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,opacity=1 ]  {$u_{4}$};
% Text Node
\draw (220.17,70.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 1}};
% Text Node
\draw (337.5,70.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 3}};
% Text Node
\draw (255.83,107.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 2}};
% Text Node
\draw (299,107.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 5}};
% Text Node
\draw (255.83,140.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 3}};
% Text Node
\draw (337.5,140.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 5}};
% Text Node
\draw (220.17,177) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 4}};
% Text Node
\draw (299,177) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 4}};
% Text Node
\draw (495.33,70.83) node [anchor=north west][inner sep=0.75pt]   [align=left] {Columns are items};
% Text Node
\draw (392,165) node [anchor=north west][inner sep=0.75pt]   [align=left] {Rows are users};


\end{tikzpicture}


}

\hfill \break The blanks indicate where users have not left ratings. very simply, our job is to figure out what these users \textit{could} rate the items that they haven't rated. In other words, we are \textbf{predicting what they will rate items based on the expressed preferences of others}.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Square [id:dp9693485852887019] 
\draw   (202.33,54.33) -- (367,54.33) -- (367,219) -- (202.33,219) -- cycle ;
%Straight Lines [id:da06411415670338638] 
\draw    (489.67,79.33) -- (362.17,24.79) ;
\draw [shift={(360.33,24)}, rotate = 383.15999999999997] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Curve Lines [id:da3176791358547847] 
\draw    (426.33,186) .. controls (325.51,332.6) and (170.56,317.49) .. (166.22,225.39) ;
\draw [shift={(166.17,224)}, rotate = 448.26] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (218.67,15.4) node [anchor=north west][inner sep=0.75pt]    {$i_{1}$};
% Text Node
\draw (254.33,15.4) node [anchor=north west][inner sep=0.75pt]    {$i_{2}$};
% Text Node
\draw (297.5,15.4) node [anchor=north west][inner sep=0.75pt]    {$i_{3}$};
% Text Node
\draw (336,15.4) node [anchor=north west][inner sep=0.75pt]    {$i_{4}$};
% Text Node
\draw (156.67,70.73) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,opacity=1 ]  {$u_{1}$};
% Text Node
\draw (156.67,107.73) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,opacity=1 ]  {$u_{2}$};
% Text Node
\draw (156.67,140.9) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,opacity=1 ]  {$u_{3}$};
% Text Node
\draw (156.67,177.4) node [anchor=north west][inner sep=0.75pt]  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,opacity=1 ]  {$u_{4}$};
% Text Node
\draw (220.17,70.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 1}};
% Text Node
\draw (337.5,70.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 3}};
% Text Node
\draw (255.83,107.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 2}};
% Text Node
\draw (299,107.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 5}};
% Text Node
\draw (255.83,140.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 3}};
% Text Node
\draw (337.5,140.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 5}};
% Text Node
\draw (220.17,177) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 4}};
% Text Node
\draw (299,177) node [anchor=north west][inner sep=0.75pt]   [align=left] {{\fontfamily{pcr}\selectfont 4}};
% Text Node
\draw (495.33,70.83) node [anchor=north west][inner sep=0.75pt]   [align=left] {Columns are items};
% Text Node
\draw (392,165) node [anchor=north west][inner sep=0.75pt]   [align=left] {Rows are users};
% Text Node
\draw (299,70.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{{\fontfamily{pcr}\selectfont ?}}};
% Text Node
\draw (255.83,70.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{{\fontfamily{pcr}\selectfont ?}}};
% Text Node
\draw (220.17,107.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{{\fontfamily{pcr}\selectfont ?}}};
% Text Node
\draw (220.17,140.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{{\fontfamily{pcr}\selectfont ?}}};
% Text Node
\draw (299,140.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{{\fontfamily{pcr}\selectfont ?}}};
% Text Node
\draw (337.5,107.33) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{{\fontfamily{pcr}\selectfont ?}}};
% Text Node
\draw (337.5,177) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{{\fontfamily{pcr}\selectfont ?}}};
% Text Node
\draw (255.83,177) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textcolor[rgb]{0.82,0.01,0.11}{{\fontfamily{pcr}\selectfont ?}}};


\end{tikzpicture}


}

The basic goal is to \textbf{fill in the matrix} as best we can. In terms of data structures, this is usually stored as a sparse matrix.

\subsubsection{Approaches}

There are three main ways to approach Collaborative Filtering. \textbf{User-user}, \textbf{Item-item}, and \textbf{Matrix factorization}.

\begin{itemize}
	\item \textbf{Item-Based CF} $\rightarrow$ Here, we want to calculate the similarity between two items and then decide that users will rate it based on that. \textbf{We have done this before in a different context.} First, we should \textbf{only consider users who have rated both items}. For each user in that list, we should calculate the difference in ratings for the two items, and then take the average of this difference over the users. We \textbf{can also find the cosine similarity of these items}.
	\item \textbf{User-Based CF} $\rightarrow$ This approach is very similar to the previous one, we just do the converse of it.
	\item \textbf{Matrix Factorization} $\rightarrow$ This one's a little more formal in nature. We \textbf{want} all the entries $i, j$ of the ratings matrix $X \in \mathbb{R}^{m \times n}$.\\
	\begin{myproof}
	\textbf{Idea:} Approximate this as $u_i^T \cdot v_j$, where $u_i \in \mathbb{R}^k$ represents the user-specific weights (we're using the transpose of this matrix), and $u_j \in \mathbb{R}^k$ represents the item-specific weights.\\\\
	\textbf{Hypothesis function:} We want this to produce a matrix built to our specifications, i.e. $h_{\theta}(i,j)=u_i^T \cdot v_j$\\\\
	\textbf{Loss function:} Least squares on observed entries, i.e. $l(h_{\theta}(i,j),X_{ij})=(h_{\theta}(i,j)-X_{ij})^2$\\\\
	\textbf{Optimization problem:} Where $S$ is the observed entries, we need to minimize $\theta$ on $$\sum_{i,j \in S}{l(h_{\theta}(i,j),X_{ij})}$$
	\end{myproof}
	
\end{itemize}

Matrix factorization as a problem is known to not be a convex function, i.e. there's no one optimal solution that we're going to nail every time. Therefore, we won't go for perfect results.\\

Essentially, we are factorizing the ratings matrix $X$ as $X = UV$ where $U \in \mathbb{R}^{m \times k}$ and $V \in \mathbb{R}^{k \times n}$ but only penalizing mismatches between $UV$ and $X$ at the observed entries in $X$.

\subsection{Association Rules}

We can also find rules that \textbf{associate} the presence of one set of items with that of another set of item.\\

\begin{myproof}
An excellent example of this is the Amazon 'Customers also bought' section.
\end{myproof}

Association rules are usually represented in the form $X \rightarrow Y$, where $X$ (rule antecedent) and $Y$ (rule consequent) are disjoint conjunctions of features. Rule quality is usually measured by rule support and confidence.\\

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Rule support} $\rightarrow$ The percentage of records containing both $X$ and $Y$.\\\\
\textbf{Rule confidence} $\rightarrow$ The conditional probability of finding $Y$ given $X$, i.e. the conditional probability that a transaction will contain $Y$ given that it contains $X$. It describes the strength of the implication, and is given by $$c(X \rightarrow Y) = \frac{s(X \rightarrow Y)}{s(X)}$$ 
\end{tcolorbox}

\subsubsection{Examples}
\begin{itemize}
	\item buys(x, bread) $\land$ buys(x, eggs) $\rightarrow$ buys(x, milk) <support: 0.6\%, confidence: 65\%>
	\item age="30-45", income="50k-75k" $\rightarrow$ car="SUV"
\end{itemize}

\subsubsection{Basic Concepts of Association Rules}

To explain these concepts, we'll make use of the following table.\\

{
\centering

\begin{table}[]
\begin{tabular}{|l|l|}
\hline
\textbf{Transaction ID} & \textbf{Items Purchased}  \\ \hline
1                       & Bread, Milk               \\ \hline
2                       & Bread, Diaper, Beer, Eggs \\ \hline
3                       & Milk, Diaper, Beer, Coke  \\ \hline
4                       & Bread, Milk, Diaper, Beer \\ \hline
5                       & Bread, Milk, Diaper, Coke \\ \hline
\end{tabular}
\end{table}

}


\begin{itemize}
	\item \textbf{Itemset} $\rightarrow$ A set of one or more items. E.g. \{Milk, Bread, Diaper\}. 
	\item \textbf{K-itemset} $\rightarrow$ An itemset that contains $k$ items.
	\item \textbf{Support Count ($\sigma$)} $\rightarrow$ Frequency of occurrence of an itemset- the number of transactions in which it appears. E.g. $\sigma$(\{Milk, Bread, Diaper\}) = 2
	\item \textbf{Support} $\rightarrow$ Fraction of the transactions in which an itemset appears. E.g. s(\{Milk, Bread, Diaper\}) = $\frac{2}{5}$
	\item \textbf{Frequent Itemset} $\rightarrow$ An itemset whose support is greater than or equal to a minsup threshold.
\end{itemize}

\subsubsection{Miscellaneous}

There are other considerations of how 'interesting' a rule is, including \textbf{lift}, \textbf{conviction}, etc. 

\subsubsection{In Practice}

\texttt{Orange3} is a GUI and Python API that enumerates frequent itemsets and performs association mining. It plays nicely with \texttt{sklearn}.\\

Generally, we can make use of Association Rules for interpretable, fast data mining. However, this sort of analysis typically doesn't consider order or scalability issues.

\section{Big Data and MapReduce (L24)}

\subsection{Big Data}

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Big Data} $\rightarrow$ extremely large data sets that may be analyzed computationally to reveal patterns, trends, and associations, especially relating to human behavior and interactions.
\end{tcolorbox}

\hfill \break There is no formal point where we say, 'oh, that is now \textbf{big data}.' However, we can look at a few notable inflection points in terms of how large your actual data is. \\

\textit{Note how we're going based on disk and RAM sizes. This is because the definition of big data will inevitably vary with the amount of data we can easily manage.}

\begin{itemize}
	\item[1.] Your data fits in RAM on a single machine.
	\item[2.] Your data fits in disk on a single machine.
	\item[3.] Your data fits in RAM or disk on a small cluster of machines.
	\item[4.] Your data fits in RAM or disk on a large cluster of machines.
\end{itemize}

It's probably reasonable to refer to 3 and 4 as big data, but many would only consider 4 as big data. Again, the definition varies.

\textbf{Distributed Computing} rose to prominence in the 70s and 80s, and was made possible in part by innovations in computer architecture like the \textbf{Message Passing Interface}, a key technology that allowed multiple computer systems to work together on problems.\\

\textbf{MPI} is extremely powerful but is complicated and brittle- it's imperfect.

\subsection{MapReduce}

\subsubsection{Overview}

When Google first started building their datacenters, they used clusters of off-the-shelf commodity hardware. Machines had different speeds and failures were common given cluster sizes.\\

Data itself was distributed redundantly over many machines, they \textbf{wanted as much as possible to perform the computation on the machine where the data was stored}.\\

This led to the development of the \textbf{MapReduce} framework at Google, which was made popular through the Apache Hadoop open source implementation.\\

\subsubsection{Hadoop}

\begin{myproof}
Hadoop (owned by Apache) is an open-source framework designed for distributed storage and processing of very large data sets across clusters of computers. Apache Hadoop consists of components including:

\begin{itemize}
	\item \textbf{Hadoop Distributed File System} (HDFS), the bottom layer component for storage. HDFS breaks up files into chunks and distributes them across the nodes of the cluster.
	\item \textbf{Yarn} for job scheduling and cluster resource management.
	\item \textbf{MapReduce} for parallel processing.
	\item Other libraries.
\end{itemize}

\end{myproof}


\subsubsection{MapReduce Details}


Initially used for analyzing search results, \textbf{MapReduce gained popularity due to its ability to split and process terabytes of data in parallel, achieving quicker results}.\\

At the core of MapReduce are two functions: Map and Reduce. Here are their definitions in relation to this algorithm.\\

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Map} $\rightarrow$ The mapper reads in data from the filesystem and outputs modified data.\\\\
\textbf{Reduce} $\rightarrow$ The reducer collects all of the mappers' outputs on the keys, and then outputs 'reduced' data.
\end{tcolorbox}

\hfill \break Mappers and reducers are the Hadoop servers that run the Map and Reduce functions respectively.\\

\subsubsection{How MapReduce Works}

\includegraphics[scale=0.47]{img/mapreduce.jpg} \\

\begin{itemize}
	\item \textbf{Map} $\rightarrow$ The input data is first split into smaller blocks, and then each block is assigned to a \textbf{mapper} for processing. Mappers read in a chunk of data provided by the splitter (typically 64MB), and then emit a set of data derived from the input.
	\item \textbf{Combine} $\rightarrow$ This is an optional process. The \textbf{combiner} is a reducer that runs individually on each mapper server. It reduces the data on each mapper further to a more simplified form before passing it downstream.
	\item \textbf{Partition} $\rightarrow$ This is the process that translates the kv pairs from mappers to another set of kv pairs to feed into the reducer. It decides how the data has to be presented to the reducer and assigns it to a particular reducer.
	\item \textbf{Reduce} $\rightarrow$ After all the mappers complete processing, the framework shuffles and sorts the results before passing them onto the reducers. \textit{Note that a reducer cannot start while a mapper is still in progress}. This is the last step in the process, and the dataset that reducers produce is typically written to disk.
\end{itemize}

\subsubsection{Key Notes about Mappers/Reducers}

\begin{itemize}
	\item Mappers and reducers are typically single threaded and deterministic. Determinism allows for the restarting of failed jobs, or \textit{speculative execution}.
	\item If you need to handle more data, you can just add more mappers or reducers.
	\begin{itemize}
		\item No need to handle multi-threaded code.
		\item Since these systems are all independent of each other, you can run a nearly arbitrary number of nodes.
		\item A machine typically has a few nodes that can be allotted to mappers and reducers, it's usually one mapper or reducer per processor core.
	\end{itemize}
	\item Mappers/reducers run entirely independently of each other. (in the case of Hadoop, they run in separate JVMs)
\end{itemize}

\subsubsection{MapReduce's Pros and Cons}

\begin{itemize}
	\item \textbf{Pros}
	\begin{itemize}
		\item All you need to do is write a mapper and reducer; they can then be scaled as much as you need.
		\item Can get away with not exposing any of the internals if you're using a premade engine.
	\end{itemize}
	\item \textbf{Cons}
	\begin{itemize}
		\item Lots of reading and writing from disk
		\item Sometimes, communication between processes is necessary
	\end{itemize}
\end{itemize}

\subsubsection{MapReduce in Python}

Python's \texttt{mrjob} library allows you to write mappers and reducers in Python, which you can then deploy on Hadoop systems, Amazon Elastic MR, and Google Cloud.

\section{More Probability + Statistics (L25)}

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Statistical Inference} $\rightarrow$ The discipline that concerns itself with the development of procedures, methods, and theorems that allow us to extract meaning from data taht has been generated by stochastic (random) processes.
\end{tcolorbox}

\hfill \break It's like going from real life, to data, and then back to real life again. The goal is often \textbf{to develop a statistical model of the real world}.\\

Usually, we'll end up with one of the following.
\begin{itemize}
	\item An estimate
	\item A Confidence Interval
	\item Rejection of a hypothesis
	\item Clustering or Classification of data points into groups
\end{itemize}

\subsection{Review of Probability}

\textbf{Probability} is concerned with the outcome of a \textit{trial}. (i.e. experiment, observation)\\

\subsubsection{Basics}

\begin{tcolorbox}[title=Definitions:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Sample space} $\rightarrow$ Set of all possible outcomes of a trial.\\\\
\textbf{Event} $\rightarrow$ specification of the outcome of a trial.
\end{tcolorbox}

\hfill \break As such, the probability of the sample space will always be 1. \textit{(Recall that probability is measured on a float scale from 0 to 1)}\\

Example:\\

\begin{myproof}
Trial = Tossing a coin; Sample space = \{Heads, Tails\}; Event = Heads
\end{myproof}

\hfill \break If two events $E$ and $F$ are \textbf{independent} then the probability of $E$ does not change if $F$ has already happened. I.e. for \textbf{two independent events $E$ and $F$:} $$ P(E|F) = P(E) $$

Additionally, if two events $E$ and $F$ are \textbf{mutually exclusive}, then: $$P(E \union F) = P(E) + P(F)$$

\subsubsection{Bayes Theorem}

\textbf{Bayes Theorem} is a very simple equation, but it's fundamental to \textbf{Bayesian Inference}.\\

\begin{tcolorbox}[title=Aside: Bayes' Theorem,colframe=black,colback=white,arc=0pt,fonttitle=\bfseries]
\textbf{Bayes' Theorem} describes the probability of an event, based on prior knowledge of conditions that might be related to the event.

$$ P(A|B) = \frac{P(B|A) \cdot P(A)}{P(B)} $$
\end{tcolorbox}

\hfill \break \textbf{Conditional Independence} is the idea that events $A$ and $B$ are conditionally independent given $C$ if the probability of $A$ and $B$ happening given that $C$ happened is the same as the probability that $A$ happened given that $C$ happened, times the probability that $B$ happened given that $C$ happened. Formally: $$P(A \land B | C) = P(A|C) \cdot P(B|C)$$\\

\textbf{Entropy} is a measure of the uncertainty in a probability distribution.

\subsection{Hypothesis Testing}

This is the process in which we \textbf{accept} or \textbf{reject} a statistical hypothesis about a population.\\

We will refer to $H_0$ as the null hypothesis, and $H_1$ as the alternative hypothesis.\\

\subsubsection{Background Information}

\begin{tcolorbox}[title=Definition:,colframe=red!75!black,colback=red!5!white,arc=0pt,fonttitle=\bfseries]
\textbf{Null Hypothesis} $\rightarrow$ A statement about a population parameter. We can test the likelihood of this statement being true in order to decide whether to accept or reject our alternative hypothesis. \textit{Can include $=, \leq, or \geq$ sign.}\\\\
\textbf{Alternative Hypothesis} $\rightarrow$ A statement that directly contradicts the null hypothesis. We determine whether or not to accept or reject this statement based on the likelihood of the null hypothesis being true. \textit{Can include $=, <, or >$ sign.}\\\\
\textbf{P-value} $\rightarrow$ The probability of obtaining the observed results of a test, assuming that the null hypothesis is correct; a smaller p-value means that there is stronger evidence in favor of the alternative hypothesis.\\\\
\textbf{Alpha} $\rightarrow$ The significance level; the probability of rejecting the null hypothesis when it is true. Also known as the \textit{Type 1 Error}.
\end{tcolorbox}

\hfill \break To elaborate, the \textbf{null hypothesis} is the hypothesis that sample observations result purely from chance. The null hypothesis tends to state that there is no change. The \textbf{Alternative hypothesis}, on the other hand, is the hypothesis that sample observations are influenced by some non-random cause.\\

We can map these definitions to the following example from \textit{towardsdatascience} for better understanding.\\

\begin{myproof}
Imagine that you and your friend play a game. If a coin lands on heads, you win \$5 and if it lands on tails he wins \$5.\\

Let’s say the first two coin tosses landed on tails, meaning your friend won \$10. Should you be worried that he’s using a rigged coin? Well, the probability of the coin landing on tails two times in a row is 25\% (see above) which is not unlikely.\\

What if the coin landed on tails six times in a row? The probability of that occurring is approximately 1.56\% (see above), which is highly unlikely. At this point, it would be fair to assume that the coin is rigged. Typically, one would set a threshold, usually 5\%, to determine if an event occurred by chance or not (if you learned this before, this is known as the alpha!)\\

In this case:
\begin{itemize}
	\item The \textbf{null hypothesis} in our example is that the coin is a fair coin and that the observations result purely from chance
	\item The \textbf{alternative hypothesis} would then be that the coin is not fair, and that the scenario did not happen by chance.
	\item The \textbf{p-value} in the scenario of flipping tails 2 times in a row is 25\%, (it's 1.56\% when you're flipping tails 6 times in a row)
	\item The \textbf{alpha}, or level of significance would be 5\%.
\end{itemize}
\end{myproof}

\subsubsection{Accepting and Rejecting}

Given some data $x$, we want to accept or reject the null hypothesis in favor of the alternative hypothesis. If $H_0$ is true, but you reject it, you have a Type I error (false positive). If $H_1$ is true, but you accept $H_0$, you have a Type II error (false negative).\\

\subsubsection{Rejecting vs. Not Rejecting the Null}

The main rule in deciding whether or not you reject the null hypothesis is \textbf{PGADRN}.\\

If the \textbf{P} value is \textbf{G}reater than the \textbf{A}lpha, \textbf{D}o not \textbf{R}eject the \textbf{N}ull.\\

\begin{myproof}
In the case of flipping tails 2 times in a row, we \textbf{would not} reject the null, since 25\% (the chance of that case happening) is greater than 5\% (our $\alpha$ value).\\

In the case of flipping tails 6 times in a row, we \textbf{would} reject the null hypothesis, since 1.56\% < 5\% (our $\alpha$ value).
\end{myproof}

\subsubsection{What's the Point?}

Now we understand the terminology and the use through an example, so what's the point of this process?\\

It's used \textbf{to determine how likely or unlikely a hypothesis is for a given sample of data}. That last part is key, because more often than not, you won't be able to get an infinite amount of data, or even data that represents the \textit{entire} population.

\subsubsection{General Steps}

Here are the steps for performing a hypothesis test in a general sense.
\begin{itemize}
	\item[1.] State your null and alternative hypotheses. \textit{Usually, null states that everything is as normally was; that nothing has changed; alternative states that something has indeed changed.}
	\item[2.] Set your significance level, the $\alpha$. \textit{This is at which point you say "okay, the probability that this event occurred is a bit abnormal".} (Usually set to 5\%)
	\item[3.] Collect sample data and calculate sample statistics.
	\item[4.] Calculate the p-value given those sample statistics. Once you get the sample statistics, you can determine the p-value through different methods. Most commonly, we use T-score and Z-score given a normal distribution of data.
	\item[5.] Reject or do not reject the null hypothesis. (Using \textbf{PGADRN})
\end{itemize}

\subsection{Sample Statistics}

To be more consistent with standard statistics notation, we will introduce the notion of a \textit{population} and a \textit{sample}.\\

{
\centering


\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        

\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Ellipse [id:dp1941426265750391] 
\draw  [fill={rgb, 255:red, 74; green, 144; blue, 226 }  ,fill opacity=1 ] (90.67,58.67) .. controls (90.67,29.58) and (193.57,6) .. (320.5,6) .. controls (447.43,6) and (550.33,29.58) .. (550.33,58.67) .. controls (550.33,87.75) and (447.43,111.33) .. (320.5,111.33) .. controls (193.57,111.33) and (90.67,87.75) .. (90.67,58.67) -- cycle ;
%Shape: Ellipse [id:dp8575198252885649] 
\draw  [fill={rgb, 255:red, 80; green, 227; blue, 194 }  ,fill opacity=1 ] (342.67,55.67) .. controls (342.67,41.49) and (358.34,30) .. (377.67,30) .. controls (397,30) and (412.67,41.49) .. (412.67,55.67) .. controls (412.67,69.84) and (397,81.33) .. (377.67,81.33) .. controls (358.34,81.33) and (342.67,69.84) .. (342.67,55.67) -- cycle ;
%Straight Lines [id:da6374615822579148] 
\draw    (181,152.67) -- (189.96,105.96) ;
\draw [shift={(190.33,104)}, rotate = 460.86] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da7589563988427739] 
\draw    (452.33,152.67) -- (395.55,78.26) ;
\draw [shift={(394.33,76.67)}, rotate = 412.65] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;

% Text Node
\draw (138,156.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textbf{Population}};
% Text Node
\draw (425.33,156.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textbf{Sample}};
% Text Node
\draw (144.67,195) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle \mu =\mathbf{E}[ X]$};
% Text Node
\draw (409.33,178.5) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle \overline{x} =\frac{1}{m}\sum\limits ^{m}_{i=1} x^{( i)}$};
% Text Node
\draw (63.33,196) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textbf{Mean}};
% Text Node
\draw (42,259.67) node [anchor=north west][inner sep=0.75pt]   [align=left] {\textbf{Variance}};
% Text Node
\draw (121.33,257.17) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle \sigma =\mathbf{E}\left[( X-\mu )^{2}\right]$};
% Text Node
\draw (367.33,242.17) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle s^{2} =\frac{1}{m-1}\sum\limits ^{m}_{i=1}\left( x^{( i)} -\overline{x}\right)^{2}$};


\end{tikzpicture}


}


\subsection{Statistics + Hypothesis Testing: Big Picture}

Using these basic techniques, we can devise some tests to determine whether certain data gives evidence that some effect 'really' occurs in the real world.\\

Fundamentally, we are deciding whether \textbf{things are likely to be true about the population, given a sample to analyze}.\\

There are lots of caveats about the precise meaning of these terms, to the point that \textbf{many people debate the usefulness of hypothesis testing at all}.\\

Despite this debate, hypothesis testing is still very common in practice, and is important to understand.

\subsection{A/B Testing}

When we're \textbf{comparing two samples}, we want to compare the values of sampled individuals in Group A with the values of samples individuals in Group B.\\

Here, we ask: \textbf{Do the two sets of values come from the same underlying distribution?} Do Groups A and B point us in the same direction, statistically?\\

Answering this question by performing a statistical test is called \textbf{A/B Testing}.\\

Assume the following example:\\

\begin{myproof}
Assume we have a random sample of mothers and their newborns. Let us compare:
\begin{itemize}
	\item \textbf{Group A:} Birth weights of babies of mothers who smoked during pregnancy
	\item \textbf{Group B:} Birth weights of babies of mothers who didn't smoke
\end{itemize}

\textbf{Question:} Can we say that the difference is due to chance alone.
\end{myproof}

\hfill \break From here, let's assume some hypotheses as we approach this statistically.\\

\begin{myproof}
\textbf{Null Hypothesis ($H_0$):} In the population, the distributions of the birth weights of the babies in the two groups are the same. (They are different in the samples only due to chance)\\\\
\textbf{Alternative Hypothesis ($H_1$):} In the population, the babies of the mothers who didn't smoke were heavier, on average, than the babies of the smokers.
\end{myproof}

\hfill \break Now, let's establish a test statistic. Here, we will use the difference between the average weights of the babies. In other words: $$ Avg_{B} - Avg_{A} $$ The larger this statistic is, the more we favor the alternative hypothesis.\\

\subsubsection{Idea: Simulating Under the Null}

If the null hypothesis is true, we can just \textbf{shuffle all the birth weights around, between group B and A, and the test statistic should remain the same}.\\

Let's \textbf{first} find the test statistic using the given data for A and B, then let's \textbf{shuffle and try again}.\\


\section{Footnotes}

Taken and compiled by Akilesh Praveen.\\

I don't claim ownership to any of the material in this notesheet. It is merely a compilation of resources I used to understand materials presented in this course. If a resource needed simplifying in my mind, I tried to do so to the best of my abilities. If not, I left it as it was.\\

References:
\begin{itemize}
	\item CMSC320 - John Dickerson - UMD
	\item DATA 8 - Ani Adhikari, John Denero - UC Berkeley
	\item Geeksforgeeks.com
	\item Wikipedia.org
	\item Hackerearth.com
	\item Softwaretestinghelp.com
	\item CSE 591 - Stonybrook University
	\item Setosa.io
	\item Talend.com
	\item Mapr.com
	\item Towardsdatascience.com
\end{itemize}

\end{document}