xgboost-dask.py
import dask.array as da
from dask_ml.model_selection import train_test_split
import xgboost as xgb
import optuna
from sklearn.datasets import make_regression  # provides the placeholder data used below
from dask.distributed import Client, LocalCluster
cluster = LocalCluster()  # create a local Dask cluster
client = Client(cluster)
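# Optional (an assumption, not part of the original pipeline): the Dask dashboard
# is handy for watching task progress while the trials run.
print("Dask dashboard:", client.dashboard_link)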
# Generate synthetic placeholder data; we will plug in real data here once the
# pre-processing pipeline is finished.
X, y = make_regression(n_samples=100000, n_features=5, noise=0.1, random_state=42)
# Convert the NumPy arrays to Dask arrays
X_dask = da.from_array(X, chunks=10000) # Chunk size can be adjusted based on available memory
y_dask = da.from_array(y, chunks=10000)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_dask, y_dask, test_size=0.2)
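# Optional sketch (an assumption, not in the original script): persisting the split
# arrays in cluster memory avoids recomputing them on every Optuna trial.
X_train, X_test, y_train, y_test = client.persist([X_train, X_test, y_train, y_test])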
def objective(trial):
    # XGBoost hyperparameters to tune
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 100.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 100.0, log=True),
    }
    # Train the XGBoost model on the Dask cluster
    dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)
    bst = xgb.dask.train(client=client, params=params, dtrain=dtrain, num_boost_round=100)
    # Make predictions on the held-out test set
    y_pred = xgb.dask.predict(client=client, model=bst['booster'], data=X_test)
    # Performance metric: RMSE
    rmse = ((y_pred - y_test) ** 2).mean().compute() ** 0.5
    return rmse
# Run the Optuna study; no pruner is configured because the objective never
# reports intermediate values, so a pruner would have nothing to act on.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)
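# A possible extension (assumes Optuna >= 3.1, not part of the original script):
# optuna.integration.DaskStorage keeps the study on the Dask scheduler so several
# client processes can share one study, e.g.:
#   storage = optuna.integration.DaskStorage(client=client)
#   study = optuna.create_study(storage=storage, direction='minimize')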
print("Best parameters:", study.best_params)
print("Best RMSE:", study.best_value)
# Close the Dask client and cluster
client.close()
cluster.close()