Google BigQuery link:
https://console.cloud.google.com/bigquery?sq=132225947884:8de6f1dba3de42a8a1d94088c7217297
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/*
ASSIGNMENT #2 - SQL BOOTCAMP (AUG 2023)
AD-HOC ANALYSIS EXERCISE FOR THE BIGQUERY PUBLIC DATASET: covid19_italy
BY: GINO FREUD HOBAYAN
*/
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/*
GUIDES:
Regular data = SUM
Cumulative data = MAX
Don't use "recovered" column for queries about vaccine effectivity.
`bigquery-public-data.covid19_italy.data_by_province` AS province
`bigquery-public-data.covid19_italy.data_by_region` AS region
`bigquery-public-data.covid19_italy.national_trends` AS nat_trends
*/
-- 1. WHAT is the region with the MOST confirmed cases?
SELECT
region.region_name,
MAX(region.total_confirmed_cases) AS max_confirmed_cases
FROM
`bigquery-public-data.covid19_italy.data_by_region` AS region
GROUP BY
region.region_name
ORDER BY
max_confirmed_cases DESC
LIMIT
1
/*
ANSWER:
Region name: Lombardia
max_confirmed_cases: 4,170,894
*/
-- 2. In which province is the difference between [the number of confirmed cases] and [the number of recovered patients] the greatest?
-- Using region TABLE
SELECT
region.region_name,
region.total_confirmed_cases AS num_of_confirmed_cases,
region.recovered AS num_of_recovered_patients,
region.total_confirmed_cases - region.recovered AS difference
FROM
`bigquery-public-data.covid19_italy.data_by_region` AS region
ORDER BY
difference DESC
/*
ANSWER: Lombardia
difference - 613,919
num_of_confirmed_cases - 1,681,066
num_of_recovered_patients - 1,067,147
*/
-- 3. WHAT are the regions with the most recovered patients?
-- using "recovered" column for this specific query. Although not recommended when measuring vaccine effectivity, since a person can recover without a vaccine.
SELECT
DISTINCT(region.region_name) AS regions_with_the_most_recovered,
MAX(region.recovered) AS count_of_total_recovered
FROM
`bigquery-public-data.covid19_italy.data_by_region` AS region
GROUP BY
region.region_name
ORDER BY
count_of_total_recovered DESC
/*
ANSWER:
1.)Lombardia - 4,123,903
2.)Veneto - 2,701,259
3.)Campania - 2,444,627
4.)Lazio - 2,390,707
5.)Emilia-Romagna - 2,137,865
6.)Sicilia - 1,812,771
7.)Piemonte - 1,696,057
8.)Puglia - 1,632,818
*/
-- 4. WHAT is the total number of confirmed cases in italy
-- total_confirmed_cases column is Cumulative, we use MAX not SUM
SELECT
MAX(nat_trends.total_confirmed_cases) AS total_num_of_confirmed_cases_in_italy
FROM
`bigquery-public-data.covid19_italy.national_trends` AS nat_trends
/*
ANSWER:
total_num_of_confirmed_cases_in_italy: 25,929,238
*/
-- 5. WHAT is the total number of deaths in italy
SELECT
MAX(nat_trends.deaths) AS total_num_of_deaths_in_italy -- deaths column is Cumulative, we use MAX not SUM
FROM
`bigquery-public-data.covid19_italy.national_trends` AS nat_trends
/*
ANSWER:
total_num_of_deaths_in_italy: 191,167
*/
/*
-- used this to figure out if a column is cumulative.
SELECT
date,
deaths
FROM
`bigquery-public-data.covid19_italy.national_trends` AS nat_trends
ORDER BY
date
*/
-- 6. WHAT is the ratio of [the number of deaths] to [the number of confirmed cases nationwide?]
SELECT
MAX(nat_trends.deaths) AS num_of_deaths,
MAX(nat_trends.total_confirmed_cases) AS num_of_confirmed_cases_in_italy,
MAX(nat_trends.deaths) / MAX(nat_trends.total_confirmed_cases) AS multiply_100_for_the_ratio
FROM
`bigquery-public-data.covid19_italy.national_trends` AS nat_trends
/*
ANSWER:
multiply_100_for_the_ratio: 0.00737
num_of_deaths: 191,167
num_of_confirmed_cases_in_italy: 25,929,238
Multiply 0.00737 by 100 = 0.74% RATIO
EXPLANATION:
For every 100 people or confirmed cases, there are around 0.74 deaths
For every 1,000 people or confirmed cases, there are around 7.4 deaths
7.4 people - died
992.6 people - recovered
*/
-- 7. WHAT is the ratio of [the number of hospitalized patients] to [the number of confirmed cases in Italy?]
SELECT
MAX(nat_trends.total_hospitalized_patients) AS num_of_hospitalized_patients,
MAX(nat_trends.total_confirmed_cases) AS num_of_confirmed_cases_in_italy,
MAX(nat_trends.total_hospitalized_patients) / MAX(nat_trends.total_confirmed_cases) AS multiply_100_for_the_ratio
FROM
`bigquery-public-data.covid19_italy.national_trends` AS nat_trends
/*
ANSWER:
multiply_100_for_the_ratio: 0.001485
num_of_hospitalized_patients: 38,507
num_of_confirmed_cases_in_italy: 25,929,238
Multiply 0.001485 by 100 = 0.15% RATIO
EXPLAINED:
For every 100 people or confirmed cases in italy, 0.15% of those are hospitalized patients
For every 1,000 people or confirmed cases in italy, 1.5% of those are hospitalized patients
1.5 - hospitalized patients
998.5 - managing the illness at home
This ratio helps experts understand how serious the illness is and how many people are getting severely affected by it.
It's like a simple math tool that helps them keep track of what's happening with the illness in Italy.
*/
/*
---------- used this to figure out if it's cumulative. (total_hospitalized_patients IS CUMULATIVE)
SELECT
date,
total_hospitalized_patients
FROM
`bigquery-public-data.covid19_italy.national_trends` AS nat_trends
ORDER BY
date
--------- used this to figure out if it's cumulative. (total_confirmed_cases IS CUMULATIVE)
SELECT
date,
total_confirmed_cases
FROM
`bigquery-public-data.covid19_italy.national_trends` AS nat_trends
ORDER BY
date
*/
-- 8. WHAT are the top 5 provinces with the most confirmed cases?
SELECT
DISTINCT(province.province_name) AS distinct_province_name,
MAX(province.confirmed_cases) AS count_of_confirmed_cases
FROM
`bigquery-public-data.covid19_italy.data_by_province` AS province
GROUP BY
distinct_province_name
ORDER BY
count_of_confirmed_cases DESC
LIMIT
5
/*
ANSWER:
1.) Roma - 1,792,437
2.) Milano - 1,336,821
3.) Napoli - 1,303,985
4.) Torino - 923,618
5.) Padova - 540,031
*/
-- END