-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path09-MFA_STATIS.Rmd
executable file
·320 lines (240 loc) · 14.2 KB
/
09-MFA_STATIS.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
# MFA/STATIS
Multiple Factor Analysis (MFA) analyzes observations described by several blocks or sets of variables. MFA is performed in two steps. First a principal component analysis is performed on each data set which is then normalized by dividing all its elements by the square root of the first eigenvalue obtained from its PCA. Second, the normalized dataset are merged to form a unique matrix and a global PCA is performed on this matrix. The individual data sets are then projected onto the global analysis to analyze communalities and discrepancies[@MFA].
To illustrate MFA, we use the `World Health` dataset. The heatmap of the dataset is shown in figure \@ref(fig:WorldCorPlot)
## Computation
For our exaple we have four sub-tables. Let's call them $\mathbf{X}_{[1]}$, $\mathbf{X}_{[2]}$, $\mathbf{X}_{[3]}$ and $\mathbf{X}_{[4]}$. The first singular value is the normalizing factor used to divide the elements of the data table. For example, the PCA of the first group gives a first eigenvalue $\varphi_1$. This gives the first normalized data matrix $\mathbf{Z}_{[1]}$: $$\mathbf{Z}_{[1]} = \varphi_1^{-1} \times \mathbf{X}_{[1]}$$. Similarly matrices $\mathbf{Z}_{[2]}$, $\mathbf{Z}_{[3]}$ and $\mathbf{Z}_{[4]}$ are computed. The Scree plot of our given dataset is shown in figure \@ref(fig:MFAScree)
```{r include=FALSE, warning=FALSE, message=FALSE}
# Clean Start
rm(list = ls())
graphics.off()
library(TInPosition)
require(ggplot2)
# Loading Dataset
WorldHealth <- read.csv("Data/WorldHealth.csv", row.names = 1)
# Cleaning Data
names(WorldHealth) <- c("Fires", "Drownings", "Poisonings", "Falls",
"T.Chole_F", "T.Chole_M",
"BP_F", "BP_M",
"BMI_F","BMI_M",
"GenGovExp", "TotalExp_GDP",
"PerCapitaExp_XchangeRate","PerCapita_PPP",
"IMM_MCV", "IMM_DTP3")
# A function for plotting multiple plots
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
Healthtable <- read.csv("Data/worldhealthtabledesign.csv")
Healthtable2 <- read.csv("Data/worldhealthtabledesign2.csv")
designMatrix <- read.csv("Data/design.csv")
library(MExPosition)
require(factoextra)
require(ggplot2)
designforpartial <- c(rep("RiskFactors", 175), rep("Spendings", 175), rep("Immunization", 175))
designregion <- read.csv("Data/designpartial.csv")
WorldHealth <- WorldHealth[,c(5:16)]
Healthtable2 <- Healthtable2[,c(5:16)]
Healthtable <- Healthtable[c(5:16),]
```
```{r include=FALSE, warning=F, message=FALSE}
resMFA <- mpMFA(WorldHealth, column.design = Healthtable2, DESIGN = designMatrix$Region, make.design.nominal = T, graphs = F, make.columndesign.nominal = T)
```
```{r plotScreeMFA, include=FALSE}
# Create a function to plot the scree
# ev: the eigen values to plot. no default
# max.ev the max eigen value
# needed because ExPosition does not return all ev
# but only the requested one. but return all tau
# so if max.ev is specified, it is used to recompute
# all eigenvalues
# p.ep: the probabilities associated to the ev
# alpha: threshold for significance. Default = .05
# col.ns = color for ns ev. Default is Green
# col.sig = color for significant ev. Default is Violet
PlotScree <- function(ev,p.ev=NULL,max.vp=NULL,
alpha=.05,
col.ns = '#006D2C',col.sig='#54278F',
title = "Explained Variance per Dimension"
){
# percentage of inertia
val.tau = (100*ev/sum(ev))
Top.y = ceiling(max(val.tau)*.1)*10
# if ev is already a percentage convert it back
if (!is.null(max.vp)){ev = ev*(max.vp/ev[1])}
p <- ggplot(ev, aes(x = c(1:12), y= ev))
p +
geom_line() + ylab("Inertia Extracted by the Components") +
scale_x_continuous(name='Dimensions', breaks = c(1,2,3,4,5,6,7,8,9,10,11,12)) +
geom_line() +
scale_y_continuous(sec.axis = sec_axis(~./sum(ev)*100,name="Percentage of Explained Variance")) +
geom_point( size=2) +theme_get()
} # end of function PlotScree
```
```{r MFAScree, out.width="60%", fig.align="center", fig.cap="Scree Plot for MFA", fig.pos='H', fig.width=7.5, fig.height=4.5, echo=FALSE}
PlotScree(as.data.frame(resMFA$mexPosition.Data$Table$eigs))
```
These normalized matrices are concatenated into an $I \times T$ matrix called the global data matrix dentoed as $\mathbf{Z}$. $$\mathbf{Z} = [ \mathbf{Z_{[1]}} \ \ \mathbf{Z_{[2]}} \ \ \mathbf{Z}_{[3]} \ \ \mathbf{Z}_{[4]} ]$$
## Computing the global PCA
To analyze the global matrix, we use standard PCA. This amounts to computing the singular value decomposition of the global data matrix: $$\mathbf{Z=U \Delta V^T}$$ and the global factor scores are obtained as: $$\mathbf{F = M^{-\frac{1}{2}}U \Delta} $$
The factoe scores of the given data table is shown in figure \@ref(fig:MFAFMap1) (Components 1 and 2) and \@ref(fig:MFAFMap2) (Components 3 and 4)
```{r MFAFMap1, out.width="70%", fig.align="center", fig.pos="H", warning=FALSE, message=FALSE, fig.cap="MFA World Health Characteristics. Factor scores of the observations plotted on the first two components", echo=FALSE}
require(tibble)
# Converting table into data frame for passing it to ggplot
FactorScore <- as.data.frame(resMFA$mexPosition.Data$Table$fi)[c(1,2)]
FactorScore$labels <- labels(resMFA$mexPosition.Data$Table$fi[,1])
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "darkslategray1", "blueviolet")
# Functions to plot Factor Scores
p <- ggplot() # Plots the base
png(filename = "MFAFactor.png", width = 1000, height = 800, units = "px")
p +
# geom_point () plots the data points
geom_point(aes(x=FactorScore[,1],y=FactorScore[,2],
color=designMatrix$Region),
position = "dodge", size=4) + geom_text_repel(aes(x=FactorScore[,1],y=FactorScore[,2]), label=FactorScore$labels)+
# scale_x_continuous() scales the plotting constraints of x axis
scale_x_continuous(limits = c(-0.3,0.3) ,name = paste0("Component 1 Inertia: ",
round(resMFA$mexPosition.Data$Table$t[1],3), "%")) +
# scale_y_continuous() scales the plotting constraints of y axis
scale_y_continuous(limits = c(-0.2,0.2),name = paste0("Component 2 Inertia: ",
round(resMFA$mexPosition.Data$Table$t[2],3), "%")) +
# geom_hline() prints a horizontal line at the specified y intercept
geom_hline(yintercept = 0)+
# geom_vline() prints a vertical line at the specified y intercept
geom_vline(xintercept = 0)+
# theme() is used for tweaking the final appearance of the plot
theme(legend.position = "bottom")+ scale_colour_manual(values=cbbPalette, name="Regions")+ theme_minimal()+
# theme() is used for tweaking the final appearance of the plot
theme(legend.position = "bottom") + theme(text = element_text(size=25)) + ggtitle("Factor Scores")
dev.off()
```
From the above figure we can see that component 1 is seperating Africa from Europe and component 2 is seperating Asia from Europe
```{r MFAFMap2, out.width="70%", fig.align="center", fig.pos="H", warning=FALSE, message=FALSE, fig.cap="MFA World Health Characteristics. Factor scores of the observations plotted on components three and four", echo=FALSE}
require(tibble)
# Converting table into data frame for passing it to ggplot
FactorScore <- as.data.frame(resMFA$mexPosition.Data$Table$fi)[c(3,4)]
FactorScore$labels <- labels(resMFA$mexPosition.Data$Table$fi[,1])
# Functions to plot Factor Scores
p <- ggplot() # Plots the base
p +
# geom_point () plots the data points
geom_point(aes(x=FactorScore[,1],y=FactorScore[,2],
color=designMatrix$Region),
position = "dodge") +
# scale_x_continuous() scales the plotting constraints of x axis
scale_x_continuous(name = paste0("Component 3 Inertia: ",
round(resMFA$mexPosition.Data$Table$t[3],3), "%")) +
scale_color_discrete(name= "Regions")+
# scale_y_continuous() scales the plotting constraints of y axis
scale_y_continuous(name = paste0("Component 4 Inertia: ",
round(resMFA$mexPosition.Data$Table$t[4],3), "%")) +
# geom_hline() prints a horizontal line at the specified y intercept
geom_hline(yintercept = 0)+
# geom_vline() prints a vertical line at the specified y intercept
geom_vline(xintercept = 0)+
# theme() is used for tweaking the final appearance of the plot
theme(legend.position = "bottom")
```
## Partial Factor Scores
```{r MFAPartialFMap1, out.width="70%", fig.align="center", fig.pos="H", warning=FALSE, message=FALSE, fig.cap="MFA World Health Characteristics. Partial Factor scores of the observations plotted on the first two components", echo=FALSE}
require(tibble)
# Converting table into data frame for passing it to ggplot
FactorScore <- as.data.frame(resMFA$mexPosition.Data$Table$partial.fi)[c(1,2)]
FactorScore$labels <- labels(resMFA$mexPosition.Data$Table$partial.fi[,1])
# Functions to plot Factor Scores
p <- ggplot() # Plots the base
p +
# geom_point () plots the data points
geom_point(aes(x=FactorScore[,1],y=FactorScore[,2], color=designregion$Region, shape=designforpartial)) +
# scale_x_continuous() scales the plotting constraints of x axis
scale_x_continuous(name = paste0("Component 1 Inertia: ",
round(resMFA$mexPosition.Data$Table$t[1],3), "%")) +
scale_color_discrete(name= "Regions")+
# scale_y_continuous() scales the plotting constraints of y axis
scale_y_continuous(name = paste0("Component 2 Inertia: ",
round(resMFA$mexPosition.Data$Table$t[2],3), "%")) +
# geom_hline() prints a horizontal line at the specified y intercept
geom_hline(yintercept = 0)+
# geom_vline() prints a vertical line at the specified y intercept
geom_vline(xintercept = 0)+
# theme() is used for tweaking the final appearance of the plot
theme(legend.position = "bottom")
```
```{r MFAPartialFMap2, out.width="70%", fig.align="center", fig.pos="H", warning=FALSE, message=FALSE, fig.cap="MFA World Health Characteristics. Partial Factor scores of the observations plotted on components three and four", echo=FALSE}
require(tibble)
# Converting table into data frame for passing it to ggplot
FactorScore <- as.data.frame(resMFA$mexPosition.Data$Table$partial.fi)[c(3,4)]
FactorScore$labels <- labels(resMFA$mexPosition.Data$Table$partial.fi[,1])
# Functions to plot Factor Scores
p <- ggplot() # Plots the base
p +
# geom_point () plots the data points
geom_point(aes(x=FactorScore[,1],y=FactorScore[,2], color=designregion$Region, shape=designforpartial)) +
# scale_x_continuous() scales the plotting constraints of x axis
scale_x_continuous(name = paste0("Component 3 Inertia: ",
round(resMFA$mexPosition.Data$Table$t[3],3), "%")) +
scale_color_discrete(name= "Regions")+
# scale_y_continuous() scales the plotting constraints of y axis
scale_y_continuous(name = paste0("Component 4 Inertia: ",
round(resMFA$mexPosition.Data$Table$t[4],3), "%")) +
# geom_hline() prints a horizontal line at the specified y intercept
geom_hline(yintercept = 0)+
# geom_vline() prints a vertical line at the specified y intercept
geom_vline(xintercept = 0)+
# theme() is used for tweaking the final appearance of the plot
theme(legend.position = "bottom")
```
## Loadings
```{r MFALoading, out.width="70%", fig.align="center", fig.pos="H", warning=FALSE, message=FALSE, fig.cap="MFA World Health Characteristics. Loadings of the observations plotted on the first two components", echo=FALSE}
require(tibble)
require(ggrepel)
# Converting table into data frame for passing it to ggplot
FactorScore <- as.data.frame(resMFA$mexPosition.Data$Table$Q)[c(1,2)]
FactorScore$labels <- labels(resMFA$mexPosition.Data$Table$Q[,1])
# Functions to plot Factor Scores
p <- ggplot() # Plots the base
p +
# geom_point () plots the data points
geom_point(aes(x=FactorScore[,1],y=FactorScore[,2], color=Healthtable$Grp),
position = "dodge") +
geom_text_repel(data = FactorScore,
mapping = aes(x=FactorScore[,1],y=FactorScore[,2], label = FactorScore$labels)) +
scale_color_discrete(name="Sub Tables")+
# scale_x_continuous() scales the plotting constraints of x axis
scale_x_continuous(name = paste0("Component 1 Inertia: ",
round(resMFA$mexPosition.Data$Table$t[1],3), "%"))+
# scale_y_continuous() scales the plotting constraints of y axis
scale_y_continuous(name = paste0("Component 2 Inertia: ",
round(resMFA$mexPosition.Data$Table$t[2],3), "%")) +
# geom_hline() prints a horizontal line at the specified y intercept
geom_hline(yintercept = 0)+
# geom_vline() prints a vertical line at the specified y intercept
geom_vline(xintercept = 0)+
# theme() is used for tweaking the final appearance of the plot
theme(legend.position = "bottom")
```
From the above figure we can see that variables within each sub-table is very well correlated as they are very near to each other in the Loadings map.
`r if (knitr:::is_html_output()) '# References {-}'`