-
Notifications
You must be signed in to change notification settings - Fork 0
/
5_5_feature_importances.py
46 lines (37 loc) · 1.65 KB
/
5_5_feature_importances.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
# Fit a random forest on the training table and export one row per input
# feature (group / var / type / offset / month labels plus the fitted
# feature importance) as a semicolon-separated CSV.

# Load training table (here for sigma 4.0)
training_table = np.load('training_table_sigma_4.0.npy')
output_path = '4_0_general_outputs/feature_importances.csv'

# Split into targets (crn values, 1st column) and inputs (all remaining columns)
targets = training_table[:, 0]
inputs = np.delete(training_table, 0, axis=1)
n_features = inputs.shape[1]

# Label layout of the input columns. The meteo columns are labelled with the
# nesting order (outermost -> innermost): offset, type, var, month.
meteo_vars = ['tmin', 'tmax', 'tavg', 'prec']
meteo_types = ['value', 'moving_avg', 'deviation']
meteo_offsets = [0, -1]
months = np.arange(1, 13)
soil_vars = ['clay', 'sand', 'silt', 'soc', 'nitrogen']
coord_vars = ['lat', 'lon']

n_meteo = len(meteo_offsets) * len(meteo_types) * len(meteo_vars) * len(months)  # 288
soil_start = n_meteo                              # 288
year_idx = soil_start + len(soil_vars)            # 293
coords_start = year_idx + 1                       # 294
one_hot_start = coords_start + len(coord_vars)    # 296; everything from here on is 'one_hot'

# Fail fast (before the expensive fit) if the table cannot match the layout.
if n_features < one_hot_start:
    raise ValueError(
        f'training table has {n_features} input columns, '
        f'but the feature layout needs at least {one_hot_start}'
    )

# Make sure the output directory exists so the results of the fit are not
# lost to a missing-folder error at the very end.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Fit random forest
rf = RandomForestRegressor(n_jobs=-1, n_estimators=500, random_state=42)
rf.fit(inputs, targets)

# Build every column with its final dtype up front. Writing strings into a
# float64 NaN frame via .iloc relies on silent upcasting, which pandas has
# deprecated (FutureWarning in 2.x, error afterwards).

# group: label for every feature
group = np.full(n_features, 'one_hot', dtype=object)
group[:n_meteo] = 'meteo'
group[soil_start:year_idx] = 'soil'
group[year_idx] = 'year'
group[coords_start:one_hot_start] = 'coords'

# var: variable name; stays NaN for the year and one-hot features
var = np.full(n_features, np.nan, dtype=object)
var[:n_meteo] = np.tile(
    np.repeat(meteo_vars, len(months)),
    len(meteo_types) * len(meteo_offsets),
)
var[soil_start:year_idx] = soil_vars
var[coords_start:one_hot_start] = coord_vars

# type / offset / month: only defined for the meteo features, NaN elsewhere
kind = np.full(n_features, np.nan, dtype=object)
kind[:n_meteo] = np.tile(
    np.repeat(meteo_types, len(meteo_vars) * len(months)),
    len(meteo_offsets),
)

# offset and month are kept as float columns (NaN-padded) so the CSV output
# keeps the same number formatting as before (e.g. "0.0", "-1.0", "1.0").
offset = np.full(n_features, np.nan)
offset[:n_meteo] = np.repeat(
    meteo_offsets, len(meteo_types) * len(meteo_vars) * len(months)
)

month = np.full(n_features, np.nan)
month[:n_meteo] = np.tile(
    months, len(meteo_offsets) * len(meteo_types) * len(meteo_vars)
)

out = pd.DataFrame({
    'group': group,
    'var': var,
    'type': kind,
    'offset': offset,
    'month': month,
    'importance': rf.feature_importances_,
})
out.to_csv(output_path, sep=';', index=False)