-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPCA_python_BiPlot_sample_Code.qmd
123 lines (83 loc) · 2.33 KB
/
PCA_python_BiPlot_sample_Code.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
---
title: "PCA With Python"
author: "Habib Ezatabadi"
format: gfm
editor: visual
---
## Import the required libraries
```{python}
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
```
## Create a PCA model
```{python}
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plyer
## Ask the user to pick the Excel workbook through a native file dialog
selection = plyer.filechooser.open_file()
## A cancelled dialog may yield no selection (None or an empty list, depending
## on the platform backend); fail with a clear message instead of an opaque
## TypeError/IndexError from the subscript below
if not selection:
    raise FileNotFoundError("No data file was selected in the file dialog.")
pathh = selection[0]
```
## Implement the model
```{python}
## Read the workbook located at the path stored in `pathh`
dat = pd.read_excel(pathh)
dat.shape

## Positional indices of the columns kept for the analysis: 2, 3 and 8 to 16
ind = [2, 3, *range(8, 17)]

## Row labels are taken from the 'RTO_Name' column
index = dat['RTO_Name'].values

## Sub-table with the selected columns, labelled by RTO name
dat2 = pd.DataFrame(dat.iloc[:, ind])
dat2.index = index
dat2

## Standardizer applied to the data in the next chunk
scaler = StandardScaler()
```
```{python}
## Remember the labels: fit_transform output is re-wrapped in a DataFrame
## below, and these labels are re-attached to it
colNames, Index = dat2.columns, dat2.index

## Fit the standardizer on the selected columns and transform them
scaled_values = scaler.fit_transform(dat2)
dat_scaled = pd.DataFrame(scaled_values, index = Index, columns = colNames)
dat_scaled
```
## Obtaining the principal components
```{python}
from sklearn.preprocessing import MinMaxScaler

## Min-max scaler mapping each column to [0, 1]; the biplot chunk applies it
## to the component scores
scaler2 = MinMaxScaler(feature_range=(0, 1))

## A fractional n_components asks PCA to keep as many components as are
## needed to explain that share of the variance (0.8 -> 80%)
pca = PCA(0.8)
pca.fit(dat_scaled)
pca.n_components_ ## The number of components that cover 80% of the variance

## Rows of components_ are the components; transpose so variables are rows
Loading = pca.components_

## Build one column name per retained component. The count is chosen by PCA
## from the data, so it must not be hard-coded: a fixed list of four names
## makes the DataFrame constructor fail whenever PCA keeps a different number
temp3 = ["PC" + str(k) for k in range(1, pca.n_components_ + 1)]
df_loading = pd.DataFrame(Loading.T, index = colNames, columns = temp3)
df_loading
```
## Create the biplot
```{python}
#| fig-width: 9
#| fig-height: 9
## NOTE: the `#|` cell options above must stay on the first lines of the
## chunk; Quarto does not read them once any code precedes them
from adjustText import adjust_text

## Coordinates of every observation on the fitted components
Scores = pca.transform(dat_scaled)

## Keep the first two components and rescale each to [0, 1] with scaler2
df_score = pd.DataFrame(scaler2.fit_transform(Scores[:, 0:2]),
                        index = index, columns = ["PC1", "PC2"])
def abbreviate(strings, length = 4):
    """Shorten a label to at most ``length`` characters.

    A label longer than ``length`` is reduced to its leading characters
    followed by its trailing characters, ``length`` characters in total.
    When ``length`` is odd the extra character is taken from the front.
    A label that already fits is returned unchanged.

    Parameters
    ----------
    strings : str
        The label to shorten.
    length : int, default 4
        Maximum number of characters in the result.

    Returns
    -------
    str
        The original label, or its head and tail joined together.
    """
    if len(strings) <= length:
        return strings
    tail = length // 2
    head = length - tail
    ## Slice the tail from an explicit start index: ``strings[-tail:]`` would
    ## return the whole string when ``tail`` is 0
    return strings[:head] + strings[len(strings) - tail:]
## Canvas for the biplot
fig, ax = plt.subplots(figsize=(14, 9))

## Loadings of every variable on the first two columns of df_loading
pc1_loadings = df_loading.iloc[:, 0]
pc2_loadings = df_loading.iloc[:, 1]

## One arrow per variable, from the origin to its (PC1, PC2) loading
for tip_x, tip_y in zip(pc1_loadings, pc2_loadings):
    ax.arrow(0, 0, tip_x, tip_y)

## Label each arrow just beyond its tip with the abbreviated variable name
Texts = []
for name, tip_x, tip_y in zip(df_loading.index, pc1_loadings, pc2_loadings):
    label = ax.text(tip_x * 1.01, tip_y * 1.01, abbreviate(name),
                    fontsize=18, color = "purple", style = "italic")
    Texts.append(label)

## Hand the labels to adjust_text for repositioning
adjust_text(Texts)

## Observations, plotted at their rescaled PC1/PC2 scores
ax.scatter(df_score["PC1"], df_score["PC2"])

ax.set_xlabel('PC1', fontsize=20)
ax.set_ylabel('PC2', fontsize=20)
ax.set_title('Figure 1', fontsize=20)
plt.show()
```