-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest predictions
150 lines (129 loc) · 4.67 KB
/
test predictions
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# Required packages
library(forecast)
library(ggplot2)
library(tidyr)
library(dplyr)
library(zoo)
library(gridExtra)
# Create simulated data
set.seed(123)
dates <- seq(as.yearmon("Jan 2020"), length.out = 48, by = 1/12)
mydf <- data.frame(
date = dates,
var1 = rnorm(48, mean = 100, sd = 15),
var2 = rnorm(48, mean = 50, sd = 10),
var3 = rnorm(48, mean = 150, sd = 20),
var4 = rnorm(48, mean = 75, sd = 12)
)
# Function to fit model on training data and compare with test data
fit_and_validate <- function(data, variable) {
n_total <- nrow(data)
n_test <- 6
n_train <- n_total - n_test
train_data <- data[1:n_train, ]
test_data <- data[(n_train + 1):n_total, ]
ts_train <- ts(train_data[[variable]], frequency = 12)
model <- auto.arima(ts_train)
forecast_result <- forecast(model, h = n_test)
# Create forecast dataframe with overlap point
result_df <- data.frame(
date = data$date,
actual = data[[variable]],
fitted = c(model$fitted, rep(NA, n_test)),
forecast = c(rep(NA, n_train - 1), # One less NA to make room for overlap
train_data[[variable]][n_train], # Add overlap point
forecast_result$mean),
lower = c(rep(NA, n_train - 1),
train_data[[variable]][n_train],
forecast_result$lower[, 2]),
upper = c(rep(NA, n_train - 1),
train_data[[variable]][n_train],
forecast_result$upper[, 2])
)
# Calculate metrics (excluding overlap point from calculations)
test_actual <- test_data[[variable]]
test_forecast <- forecast_result$mean
sum_actual <- sum(test_actual)
sum_forecast <- sum(test_forecast)
sum_pct_error <- ((sum_forecast - sum_actual) / sum_actual) * 100
accuracy_metrics <- data.frame(
RMSE = sqrt(mean((test_actual - test_forecast)^2)),
MAPE = mean(abs((test_actual - test_forecast) / test_actual)) * 100,
Sum_Pct_Error = sum_pct_error
)
return(list(
data = result_df,
metrics = accuracy_metrics,
model = model
))
}
# Create plots for all variables
analyze_and_plot <- function(mydf) {
variables <- c("var1", "var2", "var3", "var4")
results <- lapply(variables, function(var) fit_and_validate(mydf, var))
# Function to create individual plot
create_plot <- function(result, var, include_intervals = TRUE) {
p <- ggplot(result$data, aes(x = date)) +
geom_line(aes(y = actual, color = "Actual"), size = 1) +
geom_line(aes(y = forecast, color = "Forecast"),
size = 1, linetype = "dashed")
if(include_intervals) {
p <- p + geom_ribbon(aes(ymin = lower, ymax = upper),
fill = "blue", alpha = 0.1)
}
p <- p +
theme_minimal() +
labs(title = paste(var),
subtitle = sprintf("RMSE: %.1f | MAPE: %.1f%% | Sum Error: %.1f%%",
result$metrics$RMSE,
result$metrics$MAPE,
result$metrics$Sum_Pct_Error),
x = "Date",
y = "Value") +
scale_color_manual(values = c("Actual" = "black", "Forecast" = "blue"),
name = "") +
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
panel.grid.minor = element_blank())
return(p)
}
# Create both sets of plots
plots_with_intervals <- lapply(seq_along(variables), function(i) {
create_plot(results[[i]], variables[i], TRUE)
})
plots_without_intervals <- lapply(seq_along(variables), function(i) {
create_plot(results[[i]], variables[i], FALSE)
})
# Arrange plots in 2x2 grids
plot_with_intervals <- grid.arrange(
plots_with_intervals[[1]], plots_with_intervals[[2]],
plots_with_intervals[[3]], plots_with_intervals[[4]],
top = "Forecasts with Confidence Intervals",
ncol = 2
)
plot_without_intervals <- grid.arrange(
plots_without_intervals[[1]], plots_without_intervals[[2]],
plots_without_intervals[[3]], plots_without_intervals[[4]],
top = "Forecasts without Confidence Intervals",
ncol = 2
)
# Print model summaries and accuracy metrics
for(i in seq_along(variables)) {
cat("\nResults for", variables[i], ":\n")
cat("\nModel Summary:\n")
print(summary(results[[i]]$model))
cat("\nForecast Accuracy Metrics:\n")
print(results[[i]]$metrics)
cat("\n-------------------\n")
}
return(list(
with_intervals = plot_with_intervals,
without_intervals = plot_without_intervals
))
}
# Run the analysis and create both sets of plots
plots <- analyze_and_plot(mydf)
# Display both sets of plots
print(plots$with_intervals)
print(plots$without_intervals)