-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathR Script - Hypothesis Testing.R
151 lines (92 loc) · 3.33 KB
/
R Script - Hypothesis Testing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#MODULE 3 - R PRACTICE
#installing packages
install.packages("psych")
#importing libraries
library(FSA)
library(FSAdata)
library(magrittr)
library(dplyr)
library(tidyr)
library(plyr)
library(ggplot2)
library(scales)
library(psych)
# Load the data ----
# The CSV path is relative, so it is resolved against the working directory.
dataset_cities <- read.csv(
  file = "cities_air_quality_water_pollution.18-10-2021.csv"
)
print(dataset_cities)

# Descriptive analysis ----
# Mean and summary() of each of the two measures, air quality first.
print(mean(dataset_cities$AirQuality))
print(summary(dataset_cities$AirQuality))
print(mean(dataset_cities$WaterPollution))
print(summary(dataset_cities$WaterPollution))

# Stand-alone copies of the two columns ----
# Wrapping an assignment in parentheses assigns and prints in one step.
# Air quality
(AirQuality <- c(dataset_cities$AirQuality))
# Water pollution
(WaterPollution <- c(dataset_cities$WaterPollution))
# One-sample t-test: air quality ----
# H0: the true mean of AirQuality equals 62 (two-sided alternative).
print(t.test(AirQuality, mu = 62)) # fail to reject the null hypothesis
# Output recorded for mu = 62:
#   t statistic:              0.51561
#   degrees of freedom:       3962
#   p-value:                  0.6062
#   95% confidence interval:  [61.28972, 63.21718]
#   sample mean:              62.25345
# The p-value (0.6062) is not less than 0.05, so we fail to reject the null
# hypothesis.

# Repeat the test with other hypothesised means. 30, 60 and 80 all fall
# outside the 95% interval recorded above, so the null hypothesis is rejected
# for each of them.
invisible(lapply(c(30, 60, 80), function(mu0) {
  # print() is needed because nothing is auto-printed inside a function.
  print(t.test(AirQuality, mu = mu0))
}))
# One-sample t-test: water pollution ----
# H0: the true mean of WaterPollution equals 100 (two-sided alternative).
print(t.test(WaterPollution, mu = 100)) # reject the null hypothesis
# Output recorded for mu = 100:
#   t statistic:              -135.81
#   degrees of freedom:       3962
#   p-value:                  < 2.2e-16
#   95% confidence interval:  [43.83611, 45.43464]
#   sample mean:              44.63537 (water pollution)
# The p-value (< 2.2e-16) is less than 0.05, so we reject the null hypothesis.

# Repeat the test with other hypothesised means. 30 and 60 fall outside the
# 95% interval recorded above (reject the null hypothesis); 44 falls inside
# it (fail to reject the null hypothesis).
invisible(lapply(c(30, 44, 60), function(mu0) {
  # print() is needed because nothing is auto-printed inside a function.
  print(t.test(WaterPollution, mu = mu0))
}))
# One-sided variants of the one-sample t-test ----
# alternative = "greater": H1 is that the true mean of AirQuality exceeds 60.
# Outcome noted for this run: reject the null hypothesis.
t.test(
  AirQuality,
  alternative = "greater",
  mu = 60
)

# alternative = "less": H1 is that the true mean of WaterPollution is below
# 30. The sample mean recorded for WaterPollution (44.63537) lies above 30,
# so the data give no support to H1: fail to reject the null hypothesis.
t.test(
  WaterPollution,
  alternative = "less",
  mu = 30
)
# Normal Q-Q plots ----
# One plot per measure, each with a dashed (lty = 2) reference line added by
# qqline(). local() keeps the temporary vectors out of the global environment.
local({
  # Plot 1: air quality
  air_values <- dataset_cities$AirQuality
  qqnorm(air_values)
  qqline(air_values, lty = 2)

  # Plot 2: water pollution
  water_values <- dataset_cities$WaterPollution
  qqnorm(water_values)
  qqline(water_values, lty = 2)
})
# Two-sample t-test ----
# Used to test for a difference between the means of two populations.
# With two vectors and no further arguments, t.test() uses its defaults
# (paired = FALSE, var.equal = FALSE), i.e. an unpaired Welch test.
# NOTE(review): both vectors come from dataset_cities, so the observations
# look paired by row - confirm whether paired = TRUE is what is wanted.
t.test(x = AirQuality, y = WaterPollution)
# Calculating the p-value by hand ----
# Column-wise overview of the whole data frame first.
print(summary(dataset_cities))

# Air quality: one-sample t-test against a hypothesised mean of 62 ----
print(length(AirQuality))
print(summary(AirQuality))

# Wrapping an assignment in parentheses assigns and prints in one step.
# t statistic: (sample mean - hypothesised mean) / standard error.
(t <- (mean(AirQuality) - 62) / (sd(AirQuality) / sqrt(length(AirQuality))))

# Mean of the differences from the hypothesised value.
(meanofairquality <- mean(AirQuality - 62))

# Standard error of the mean.
(res1 <- sd(AirQuality) / sqrt(length(AirQuality)))

# Two-sided p-value from the t distribution with n - 1 degrees of freedom.
(pvalue1 <- 2 * pt(-abs(t), df = length(AirQuality) - 1))

# One-column matrix: mean difference, standard error, t statistic, p-value.
print(matrix(c(meanofairquality, res1, t, pvalue1)))
# Water pollution: one-sample t-test against a hypothesised mean of 44 ----
# Reproduces t.test(WaterPollution, mu = 44) step by step.
length(WaterPollution)
summary(WaterPollution)

# t statistic: (sample mean - hypothesised mean) / standard error.
# Note: this overwrites any earlier value stored in `t`.
t <- (mean(WaterPollution) - 44) /
  (sd(WaterPollution) / sqrt(length(WaterPollution)))
t

# Mean of the differences from the hypothesised value.
# Fix: this previously used AirQuality, so the reported mean difference did
# not belong to the water pollution test.
meanofwaterpollution <- mean(WaterPollution - 44)
meanofwaterpollution

# Standard error of the mean.
res2 <- sd(WaterPollution) / sqrt(length(WaterPollution))
res2

# Two-sided p-value from the t distribution with n - 1 degrees of freedom.
pvalue2 <- 2 * pt(-abs(t), df = length(WaterPollution) - 1)
pvalue2

# One-column matrix: mean difference, standard error, t statistic, p-value.
matrix(c(meanofwaterpollution, res2, t, pvalue2))