# .Rhistory

library(tidyr)
library(purrr)
library(syuzhet)
# Load the data: utterance-level and usefulness tables for the train,
# validation and test splits (paths are relative to the working directory).
utterance_train <- read.csv(file = "dialogue_utterance_train.csv")
usefulness_train <- read.csv(file = "dialogue_usefulness_train.csv")
utterance_validation <- read.csv(file = "dialogue_utterance_validation.csv")
usefulness_validation <- read.csv(file = "dialogue_usefulness_validation.csv")
utterance_test <- read.csv(file = "dialogue_utterance_test.csv")
usefulness_test <- read.csv(file = "dialogue_usefulness_test.csv")
# Ensure consistent column names: in every utterance table, column 1 becomes
# Dialogue_ID and column 3 becomes Interlocutor.
names(utterance_train)[c(1, 3)] <- c("Dialogue_ID", "Interlocutor")
names(utterance_validation)[c(1, 3)] <- c("Dialogue_ID", "Interlocutor")
names(utterance_test)[c(1, 3)] <- c("Dialogue_ID", "Interlocutor")
# Feature Engineering: Sentiment Analysis
# Scores `text` and returns a data frame with two columns:
#   polarity     - get_sentiment() score using the "syuzhet" method
#   subjectivity - NRC positive count minus NRC negative count (despite the
#                  name, this is a net positive/negative balance)
compute_sentiment <- function(text) {
  polarity <- get_sentiment(text, method = "syuzhet")
  # Run the NRC lookup once and reuse it; it was previously evaluated twice
  # for every text, doubling the cost of the slowest step.
  nrc_scores <- get_nrc_sentiment(text)
  data.frame(
    polarity = polarity,
    subjectivity = nrc_scores$positive - nrc_scores$negative
  )
}
# Score every training utterance: the per-utterance result is held in a list
# column and then widened into one column per sentiment measure.
utterance_train_sentiment <- unnest_wider(
  mutate(
    utterance_train,
    sentiment = map(Utterance_text, function(txt) compute_sentiment(txt))
  ),
  sentiment
)
# Same treatment for the validation utterances.
utterance_validation_sentiment <- unnest_wider(
  mutate(
    utterance_validation,
    sentiment = map(Utterance_text, function(txt) compute_sentiment(txt))
  ),
  sentiment
)
# Aggregate features for each dialogue
# Collapses utterance rows to one row per Dialogue_ID and attaches the
# usefulness table.
#   df         - utterance rows providing Dialogue_ID, Utterance_text,
#                Interlocutor, polarity and subjectivity
#   usefulness - table joined on Dialogue_ID; the inner join drops dialogues
#                that are missing from either side
# Returns the per-dialogue summary columns plus the columns of `usefulness`.
extract_features <- function(df, usefulness) {
  by_dialogue <- group_by(df, Dialogue_ID)
  dialogue_summary <- summarise(
    by_dialogue,
    total_utterances = n(),
    avg_utterance_length = mean(nchar(Utterance_text)),
    student_interactions = sum(Interlocutor == "Student"),
    chatbot_interactions = sum(Interlocutor == "Chatbot"),
    avg_polarity = mean(polarity, na.rm = TRUE),
    avg_subjectivity = mean(subjectivity, na.rm = TRUE),
    max_polarity = max(polarity, na.rm = TRUE),
    min_polarity = min(polarity, na.rm = TRUE),
    polarity_range = max(polarity, na.rm = TRUE) - min(polarity, na.rm = TRUE)
  )
  inner_join(dialogue_summary, usefulness, by = "Dialogue_ID")
}
# Build the per-dialogue feature tables for the training and validation splits.
features_train <- extract_features(utterance_train_sentiment, usefulness_train)
features_validation <- extract_features(utterance_validation_sentiment, usefulness_validation)
# Create Usefulness Group for Boxplot Visualization
# The labels must be exactly the strings that the boxplot code filters on and
# maps to fill colours. The short "Low"/"High"/"Medium" labels matched none of
# them, which left both boxplots with no data.
features_train <- features_train %>%
  mutate(Usefulness_group = case_when(
    Usefulness_score %in% c(1, 2) ~ "Low Usefulness (Score 1-2)",
    Usefulness_score %in% c(4, 5) ~ "High Usefulness (Score 4-5)",
    TRUE ~ "Medium Usefulness (Score 3)"
  ))
# Boxplot visualization for Total Utterances and Average Polarity
# Both plots keep only rows whose Usefulness_group is one of the two labels
# listed in the filter.
# Total Utterances Boxplot
features_train %>%
  filter(
    Usefulness_group %in%
      c("Low Usefulness (Score 1-2)", "High Usefulness (Score 4-5)")
  ) %>%
  ggplot(aes(x = Usefulness_group, y = total_utterances)) +
  geom_boxplot(aes(fill = Usefulness_group)) +
  labs(
    title = "Total Utterances by Usefulness Group",
    x = "Usefulness Group",
    y = "Total Utterances"
  ) +
  theme_minimal() +
  scale_fill_manual(
    values = c(
      "Low Usefulness (Score 1-2)" = "red",
      "High Usefulness (Score 4-5)" = "green"
    )
  )
# Average Polarity Boxplot
features_train %>%
  filter(
    Usefulness_group %in%
      c("Low Usefulness (Score 1-2)", "High Usefulness (Score 4-5)")
  ) %>%
  ggplot(aes(x = Usefulness_group, y = avg_polarity)) +
  geom_boxplot(aes(fill = Usefulness_group)) +
  labs(
    title = "Average Polarity by Usefulness Group",
    x = "Usefulness Group",
    y = "Average Polarity"
  ) +
  theme_minimal() +
  scale_fill_manual(
    values = c(
      "Low Usefulness (Score 1-2)" = "red",
      "High Usefulness (Score 4-5)" = "green"
    )
  )
# Section 1: Feature Engineering and Visualization
# Install any package from the list that is not already available.
necessary_packages <- c(
  "dplyr", "ggplot2", "caret", "randomForest",
  "textclean", "tidyr", "purrr", "syuzhet"
)
for (pkg in necessary_packages) {
  if (requireNamespace(pkg, quietly = TRUE)) {
    next
  }
  install.packages(pkg)
}
# Load the libraries
library(dplyr)
library(ggplot2)
library(caret)
library(randomForest)
library(textclean)
library(tidyr)
library(purrr)
library(syuzhet)
# Load the data: utterance-level and usefulness tables for the train,
# validation and test splits (paths are relative to the working directory).
utterance_train <- read.csv(file = "dialogue_utterance_train.csv")
usefulness_train <- read.csv(file = "dialogue_usefulness_train.csv")
utterance_validation <- read.csv(file = "dialogue_utterance_validation.csv")
usefulness_validation <- read.csv(file = "dialogue_usefulness_validation.csv")
utterance_test <- read.csv(file = "dialogue_utterance_test.csv")
usefulness_test <- read.csv(file = "dialogue_usefulness_test.csv")
# Ensure consistent column names: in every utterance table, column 1 becomes
# Dialogue_ID and column 3 becomes Interlocutor.
names(utterance_train)[c(1, 3)] <- c("Dialogue_ID", "Interlocutor")
names(utterance_validation)[c(1, 3)] <- c("Dialogue_ID", "Interlocutor")
names(utterance_test)[c(1, 3)] <- c("Dialogue_ID", "Interlocutor")
# Feature Engineering: Sentiment Analysis
# Scores `text` and returns a data frame with two columns:
#   polarity     - get_sentiment() score using the "syuzhet" method
#   subjectivity - NRC positive count minus NRC negative count (despite the
#                  name, this is a net positive/negative balance)
compute_sentiment <- function(text) {
  polarity <- get_sentiment(text, method = "syuzhet")
  # Run the NRC lookup once and reuse it; it was previously evaluated twice
  # for every text, doubling the cost of the slowest step.
  nrc_scores <- get_nrc_sentiment(text)
  data.frame(
    polarity = polarity,
    subjectivity = nrc_scores$positive - nrc_scores$negative
  )
}
# Score every training utterance: the per-utterance result is held in a list
# column and then widened into one column per sentiment measure.
utterance_train_sentiment <- unnest_wider(
  mutate(
    utterance_train,
    sentiment = map(Utterance_text, function(txt) compute_sentiment(txt))
  ),
  sentiment
)
# Same treatment for the validation utterances.
utterance_validation_sentiment <- unnest_wider(
  mutate(
    utterance_validation,
    sentiment = map(Utterance_text, function(txt) compute_sentiment(txt))
  ),
  sentiment
)
# Aggregate features for each dialogue
# Collapses utterance rows to one row per Dialogue_ID and attaches the
# usefulness table.
#   df         - utterance rows providing Dialogue_ID, Utterance_text,
#                Interlocutor, polarity and subjectivity
#   usefulness - table joined on Dialogue_ID; the inner join drops dialogues
#                that are missing from either side
# Returns the per-dialogue summary columns plus the columns of `usefulness`.
extract_features <- function(df, usefulness) {
  by_dialogue <- group_by(df, Dialogue_ID)
  dialogue_summary <- summarise(
    by_dialogue,
    total_utterances = n(),
    avg_utterance_length = mean(nchar(Utterance_text)),
    student_interactions = sum(Interlocutor == "Student"),
    chatbot_interactions = sum(Interlocutor == "Chatbot"),
    avg_polarity = mean(polarity, na.rm = TRUE),
    avg_subjectivity = mean(subjectivity, na.rm = TRUE),
    max_polarity = max(polarity, na.rm = TRUE),
    min_polarity = min(polarity, na.rm = TRUE),
    polarity_range = max(polarity, na.rm = TRUE) - min(polarity, na.rm = TRUE)
  )
  inner_join(dialogue_summary, usefulness, by = "Dialogue_ID")
}
# Build the per-dialogue feature tables for the training and validation splits.
features_train <- extract_features(
  df = utterance_train_sentiment,
  usefulness = usefulness_train
)
features_validation <- extract_features(
  df = utterance_validation_sentiment,
  usefulness = usefulness_validation
)
# Create Usefulness Group for Boxplot Visualization
# Scores 1-2 are labelled low, 4-5 high, and everything else medium.
features_train <- mutate(
  features_train,
  Usefulness_group = case_when(
    Usefulness_score %in% c(1, 2) ~ "Low Usefulness (Score 1-2)",
    Usefulness_score %in% c(4, 5) ~ "High Usefulness (Score 4-5)",
    TRUE ~ "Medium Usefulness (Score 3)"
  )
)
# Boxplot visualization for Total Utterances and Average Polarity
# Both plots keep only rows whose Usefulness_group is one of the two labels
# listed in the filter.
# Total Utterances Boxplot
features_train %>%
  filter(
    Usefulness_group %in%
      c("Low Usefulness (Score 1-2)", "High Usefulness (Score 4-5)")
  ) %>%
  ggplot(aes(x = Usefulness_group, y = total_utterances)) +
  geom_boxplot(aes(fill = Usefulness_group)) +
  labs(
    title = "Total Utterances by Usefulness Group",
    x = "Usefulness Group",
    y = "Total Utterances"
  ) +
  theme_minimal() +
  scale_fill_manual(
    values = c(
      "Low Usefulness (Score 1-2)" = "red",
      "High Usefulness (Score 4-5)" = "green"
    )
  )
# Average Polarity Boxplot
features_train %>%
  filter(
    Usefulness_group %in%
      c("Low Usefulness (Score 1-2)", "High Usefulness (Score 4-5)")
  ) %>%
  ggplot(aes(x = Usefulness_group, y = avg_polarity)) +
  geom_boxplot(aes(fill = Usefulness_group)) +
  labs(
    title = "Average Polarity by Usefulness Group",
    x = "Usefulness Group",
    y = "Average Polarity"
  ) +
  theme_minimal() +
  scale_fill_manual(
    values = c(
      "Low Usefulness (Score 1-2)" = "red",
      "High Usefulness (Score 4-5)" = "green"
    )
  )
# Section 1: Feature Engineering and Visualization
# Install any package from the list that is not already available.
necessary_packages <- c(
  "dplyr", "ggplot2", "caret", "randomForest",
  "textclean", "tidyr", "purrr", "syuzhet"
)
for (pkg in necessary_packages) {
  if (requireNamespace(pkg, quietly = TRUE)) {
    next
  }
  install.packages(pkg)
}
# Load the libraries
library(dplyr)
library(ggplot2)
library(caret)
library(randomForest)
library(textclean)
library(tidyr)
library(purrr)
library(syuzhet)
# Load the data: utterance-level and usefulness tables for the train,
# validation and test splits (paths are relative to the working directory).
utterance_train <- read.csv(file = "dialogue_utterance_train.csv")
usefulness_train <- read.csv(file = "dialogue_usefulness_train.csv")
utterance_validation <- read.csv(file = "dialogue_utterance_validation.csv")
usefulness_validation <- read.csv(file = "dialogue_usefulness_validation.csv")
utterance_test <- read.csv(file = "dialogue_utterance_test.csv")
usefulness_test <- read.csv(file = "dialogue_usefulness_test.csv")
# Ensure consistent column names: in every utterance table, column 1 becomes
# Dialogue_ID and column 3 becomes Interlocutor.
names(utterance_train)[c(1, 3)] <- c("Dialogue_ID", "Interlocutor")
names(utterance_validation)[c(1, 3)] <- c("Dialogue_ID", "Interlocutor")
names(utterance_test)[c(1, 3)] <- c("Dialogue_ID", "Interlocutor")
# Feature Engineering: Sentiment Analysis
# Scores `text` and returns a data frame with two columns:
#   polarity     - get_sentiment() score using the "syuzhet" method
#   subjectivity - NRC positive count minus NRC negative count (despite the
#                  name, this is a net positive/negative balance)
compute_sentiment <- function(text) {
  polarity <- get_sentiment(text, method = "syuzhet")
  # Run the NRC lookup once and reuse it; it was previously evaluated twice
  # for every text, doubling the cost of the slowest step.
  nrc_scores <- get_nrc_sentiment(text)
  data.frame(
    polarity = polarity,
    subjectivity = nrc_scores$positive - nrc_scores$negative
  )
}
# Score every training utterance: the per-utterance result is held in a list
# column and then widened into one column per sentiment measure.
utterance_train_sentiment <- unnest_wider(
  mutate(
    utterance_train,
    sentiment = map(Utterance_text, function(txt) compute_sentiment(txt))
  ),
  sentiment
)
# Same treatment for the validation utterances.
utterance_validation_sentiment <- unnest_wider(
  mutate(
    utterance_validation,
    sentiment = map(Utterance_text, function(txt) compute_sentiment(txt))
  ),
  sentiment
)
# Aggregate features for each dialogue
# Collapses utterance rows to one row per Dialogue_ID and attaches the
# usefulness table.
#   df         - utterance rows providing Dialogue_ID, Utterance_text,
#                Interlocutor, polarity and subjectivity
#   usefulness - table joined on Dialogue_ID; the inner join drops dialogues
#                that are missing from either side
# Returns the per-dialogue summary columns plus the columns of `usefulness`.
extract_features <- function(df, usefulness) {
  by_dialogue <- group_by(df, Dialogue_ID)
  dialogue_summary <- summarise(
    by_dialogue,
    total_utterances = n(),
    avg_utterance_length = mean(nchar(Utterance_text)),
    student_interactions = sum(Interlocutor == "Student"),
    chatbot_interactions = sum(Interlocutor == "Chatbot"),
    avg_polarity = mean(polarity, na.rm = TRUE),
    avg_subjectivity = mean(subjectivity, na.rm = TRUE),
    max_polarity = max(polarity, na.rm = TRUE),
    min_polarity = min(polarity, na.rm = TRUE),
    polarity_range = max(polarity, na.rm = TRUE) - min(polarity, na.rm = TRUE)
  )
  inner_join(dialogue_summary, usefulness, by = "Dialogue_ID")
}
# Build the per-dialogue feature tables for the training and validation splits.
features_train <- extract_features(
  df = utterance_train_sentiment,
  usefulness = usefulness_train
)
features_validation <- extract_features(
  df = utterance_validation_sentiment,
  usefulness = usefulness_validation
)
# Create Usefulness Group for Boxplot Visualization
# Scores 1-2 are labelled low, 4-5 high, and everything else medium.
features_train <- mutate(
  features_train,
  Usefulness_group = case_when(
    Usefulness_score %in% c(1, 2) ~ "Low Usefulness (Score 1-2)",
    Usefulness_score %in% c(4, 5) ~ "High Usefulness (Score 4-5)",
    TRUE ~ "Medium Usefulness (Score 3)"
  )
)
# Boxplot visualization for Total Utterances and Average Polarity
# Both plots keep only rows whose Usefulness_group is one of the two labels
# listed in the filter.
# Total Utterances Boxplot
features_train %>%
  filter(
    Usefulness_group %in%
      c("Low Usefulness (Score 1-2)", "High Usefulness (Score 4-5)")
  ) %>%
  ggplot(aes(x = Usefulness_group, y = total_utterances)) +
  geom_boxplot(aes(fill = Usefulness_group)) +
  labs(
    title = "Total Utterances by Usefulness Group",
    x = "Usefulness Group",
    y = "Total Utterances"
  ) +
  theme_minimal() +
  scale_fill_manual(
    values = c(
      "Low Usefulness (Score 1-2)" = "red",
      "Medium Usefulness (Score 3)" = "yellow",
      "High Usefulness (Score 4-5)" = "green"
    )
  )
# Average Polarity Boxplot
features_train %>%
  filter(
    Usefulness_group %in%
      c("Low Usefulness (Score 1-2)", "High Usefulness (Score 4-5)")
  ) %>%
  ggplot(aes(x = Usefulness_group, y = avg_polarity)) +
  geom_boxplot(aes(fill = Usefulness_group)) +
  labs(
    title = "Average Polarity by Usefulness Group",
    x = "Usefulness Group",
    y = "Average Polarity"
  ) +
  theme_minimal() +
  scale_fill_manual(
    values = c(
      "Low Usefulness (Score 1-2)" = "red",
      "High Usefulness (Score 4-5)" = "green"
    )
  )
# Section 1: Feature Engineering and Visualization
# Install any package from the list that is not already available.
necessary_packages <- c(
  "dplyr", "ggplot2", "caret", "randomForest",
  "textclean", "tidyr", "purrr", "syuzhet"
)
for (pkg in necessary_packages) {
  if (requireNamespace(pkg, quietly = TRUE)) {
    next
  }
  install.packages(pkg)
}
# Load the libraries
library(dplyr)
library(ggplot2)
library(caret)
library(randomForest)
library(textclean)
library(tidyr)
library(purrr)
library(syuzhet)
# Load the data: utterance-level and usefulness tables for the train,
# validation and test splits (paths are relative to the working directory).
utterance_train <- read.csv(file = "dialogue_utterance_train.csv")
usefulness_train <- read.csv(file = "dialogue_usefulness_train.csv")
utterance_validation <- read.csv(file = "dialogue_utterance_validation.csv")
usefulness_validation <- read.csv(file = "dialogue_usefulness_validation.csv")
utterance_test <- read.csv(file = "dialogue_utterance_test.csv")
usefulness_test <- read.csv(file = "dialogue_usefulness_test.csv")
# Ensure consistent column names: in every utterance table, column 1 becomes
# Dialogue_ID and column 3 becomes Interlocutor.
names(utterance_train)[c(1, 3)] <- c("Dialogue_ID", "Interlocutor")
names(utterance_validation)[c(1, 3)] <- c("Dialogue_ID", "Interlocutor")
names(utterance_test)[c(1, 3)] <- c("Dialogue_ID", "Interlocutor")
# Feature Engineering: Sentiment Analysis
# Scores `text` and returns a data frame with two columns:
#   polarity     - get_sentiment() score using the "syuzhet" method
#   subjectivity - NRC positive count minus NRC negative count (despite the
#                  name, this is a net positive/negative balance)
compute_sentiment <- function(text) {
  polarity <- get_sentiment(text, method = "syuzhet")
  # Run the NRC lookup once and reuse it; it was previously evaluated twice
  # for every text, doubling the cost of the slowest step.
  nrc_scores <- get_nrc_sentiment(text)
  data.frame(
    polarity = polarity,
    subjectivity = nrc_scores$positive - nrc_scores$negative
  )
}
# Score every training utterance: the per-utterance result is held in a list
# column and then widened into one column per sentiment measure.
utterance_train_sentiment <- unnest_wider(
  mutate(
    utterance_train,
    sentiment = map(Utterance_text, function(txt) compute_sentiment(txt))
  ),
  sentiment
)
# Same treatment for the validation utterances.
utterance_validation_sentiment <- unnest_wider(
  mutate(
    utterance_validation,
    sentiment = map(Utterance_text, function(txt) compute_sentiment(txt))
  ),
  sentiment
)
# Aggregate features for each dialogue
# Collapses utterance rows to one row per Dialogue_ID and attaches the
# usefulness table.
#   df         - utterance rows providing Dialogue_ID, Utterance_text,
#                Interlocutor, polarity and subjectivity
#   usefulness - table joined on Dialogue_ID; the inner join drops dialogues
#                that are missing from either side
# Returns the per-dialogue summary columns plus the columns of `usefulness`.
extract_features <- function(df, usefulness) {
  by_dialogue <- group_by(df, Dialogue_ID)
  dialogue_summary <- summarise(
    by_dialogue,
    total_utterances = n(),
    avg_utterance_length = mean(nchar(Utterance_text)),
    student_interactions = sum(Interlocutor == "Student"),
    chatbot_interactions = sum(Interlocutor == "Chatbot"),
    avg_polarity = mean(polarity, na.rm = TRUE),
    avg_subjectivity = mean(subjectivity, na.rm = TRUE),
    max_polarity = max(polarity, na.rm = TRUE),
    min_polarity = min(polarity, na.rm = TRUE),
    polarity_range = max(polarity, na.rm = TRUE) - min(polarity, na.rm = TRUE)
  )
  inner_join(dialogue_summary, usefulness, by = "Dialogue_ID")
}
# Build the per-dialogue feature tables for the training and validation splits.
features_train <- extract_features(
  df = utterance_train_sentiment,
  usefulness = usefulness_train
)
features_validation <- extract_features(
  df = utterance_validation_sentiment,
  usefulness = usefulness_validation
)
# Create Usefulness Group for Boxplot Visualization
# Scores 1-2 are labelled low and 4-5 high; any other score has no matching
# case and therefore gets NA.
features_train <- mutate(
  features_train,
  Usefulness_group = case_when(
    Usefulness_score %in% c(1, 2) ~ "Low Usefulness (Score 1-2)",
    Usefulness_score %in% c(4, 5) ~ "High Usefulness (Score 4-5)"
  )
)
# Boxplot visualization for Total Utterances and Average Polarity
# Both plots keep only rows whose Usefulness_group is one of the two labels
# listed in the filter.
# Total Utterances Boxplot
features_train %>%
  filter(
    Usefulness_group %in%
      c("Low Usefulness (Score 1-2)", "High Usefulness (Score 4-5)")
  ) %>%
  ggplot(aes(x = Usefulness_group, y = total_utterances)) +
  geom_boxplot(aes(fill = Usefulness_group)) +
  labs(
    title = "Total Utterances by Usefulness Group",
    x = "Usefulness Group",
    y = "Total Utterances"
  ) +
  theme_minimal() +
  scale_fill_manual(
    values = c(
      "Low Usefulness (Score 1-2)" = "red",
      "High Usefulness (Score 4-5)" = "green"
    )
  )
# Average Polarity Boxplot
features_train %>%
  filter(
    Usefulness_group %in%
      c("Low Usefulness (Score 1-2)", "High Usefulness (Score 4-5)")
  ) %>%
  ggplot(aes(x = Usefulness_group, y = avg_polarity)) +
  geom_boxplot(aes(fill = Usefulness_group)) +
  labs(
    title = "Average Polarity by Usefulness Group",
    x = "Usefulness Group",
    y = "Average Polarity"
  ) +
  theme_minimal() +
  scale_fill_manual(
    values = c(
      "Low Usefulness (Score 1-2)" = "red",
      "High Usefulness (Score 4-5)" = "green"
    )
  )
# Section 2: Build Initial Machine Learning Model
# Remove Usefulness_group before training: it is derived from the target and
# exists only for the boxplots.
features_train <- features_train %>% select(-Usefulness_group)
# Train a Random Forest model.
# Dialogue_ID is a row identifier, not a property of the dialogue, so it is
# excluded from the predictors; `Usefulness_score ~ .` silently included it.
set.seed(42)
rf_model <- suppressWarnings(
  randomForest(
    Usefulness_score ~ . - Dialogue_ID,
    data = features_train,
    importance = TRUE
  )
)
print(rf_model)
# Predict on the validation set and report the root mean squared error
predictions <- suppressWarnings(predict(rf_model, features_validation))
rmse <- sqrt(mean((predictions - features_validation$Usefulness_score)^2))
print(paste("RMSE on validation set:", rmse))
# Feature importance with colors and rounded values
# Rows are ordered from the largest IncNodePurity to the smallest, and the
# row names are copied into a Feature column for plotting.
importance_df <- as.data.frame(importance(rf_model))
importance_df <- importance_df[order(-importance_df$IncNodePurity), ]
importance_df$Feature <- rownames(importance_df)
importance_df %>%
  ggplot(
    aes(
      x = reorder(Feature, IncNodePurity),
      y = round(IncNodePurity, 2),
      fill = IncNodePurity
    )
  ) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Feature Importance in Random Forest Model",
    x = "Features",
    y = "IncNodePurity"
  ) +
  scale_fill_gradient(low = "blue", high = "red") +
  theme_minimal()
# Section 3: Improve the Model
# Tune the Random Forest model using caret with 5-fold cross-validation.
train_control <- trainControl(method = "cv", number = 5)
# Dialogue_ID is a row identifier, not a property of the dialogue, so it is
# excluded from the predictors; `Usefulness_score ~ .` silently included it.
tuned_rf <- suppressWarnings(train(
  Usefulness_score ~ . - Dialogue_ID,
  data = features_train,
  method = "rf",
  trControl = train_control,
  tuneLength = 5
))
print(tuned_rf)
# Predict using the tuned model and report the validation RMSE
tuned_predictions <- predict(tuned_rf, features_validation)
tuned_rmse <- sqrt(mean((tuned_predictions - features_validation$Usefulness_score)^2))
print(paste("Tuned RMSE on validation set:", tuned_rmse))
# Section 4: Analyze Generated Dialogue
# Show the utterance text recorded for Dialogue_ID 1111
dialogue_1111_data <- filter(utterance_train, Dialogue_ID == 1111)
cat("Dialogue text for Dialogue_ID 1111:\n")
print(dialogue_1111_data$Utterance_text)
# Score the validation set with the tuned model and report its RMSE
tuned_predictions <- predict(tuned_rf, newdata = features_validation)
tuned_rmse <- sqrt(
  mean((features_validation$Usefulness_score - tuned_predictions)^2)
)
print(paste("Tuned RMSE on validation set:", tuned_rmse))
# Predict the usefulness score for Dialogue_ID 1111 from its feature row
dialogue_1111 <- features_train %>% filter(Dialogue_ID == 1111)
pred_1111 <- predict(tuned_rf, newdata = dialogue_1111)
print(paste("Predicted Usefulness Score for Dialogue_ID 1111:", pred_1111))
# Section 1: Feature Engineering and Visualization
# Installing any package from the list that is not already available
necessary_packages <- c(
  "dplyr", "ggplot2", "caret", "randomForest",
  "textclean", "tidyr", "purrr", "syuzhet"
)
for (pkg in necessary_packages) {
  if (requireNamespace(pkg, quietly = TRUE)) {
    next
  }
  install.packages(pkg)
}
# Loading the libraries
library(dplyr)
library(ggplot2)
library(caret)
library(randomForest)
library(textclean)
library(tidyr)
library(purrr)
library(syuzhet)
# Loading the data: utterance-level and usefulness tables for the train,
# validation and test splits (paths are relative to the working directory)
utterance_train <- read.csv(file = "dialogue_utterance_train.csv")
usefulness_train <- read.csv(file = "dialogue_usefulness_train.csv")
utterance_validation <- read.csv(file = "dialogue_utterance_validation.csv")
usefulness_validation <- read.csv(file = "dialogue_usefulness_validation.csv")
utterance_test <- read.csv(file = "dialogue_utterance_test.csv")
usefulness_test <- read.csv(file = "dialogue_usefulness_test.csv")
# Ensuring consistent column names: in every utterance table, column 1
# becomes Dialogue_ID and column 3 becomes Interlocutor
names(utterance_train)[c(1, 3)] <- c("Dialogue_ID", "Interlocutor")
names(utterance_validation)[c(1, 3)] <- c("Dialogue_ID", "Interlocutor")
names(utterance_test)[c(1, 3)] <- c("Dialogue_ID", "Interlocutor")
# Feature Engineering: Sentiment Analysis
# Scores `text` and returns a data frame with two columns:
#   polarity     - get_sentiment() score using the "syuzhet" method
#   subjectivity - NRC positive count minus NRC negative count (despite the
#                  name, this is a net positive/negative balance)
compute_sentiment <- function(text) {
  # Computing polarity using syuzhet
  polarity <- get_sentiment(text, method = "syuzhet")
  # Running the NRC lookup once and reusing it; it was previously evaluated
  # twice for every text, doubling the cost of the slowest step
  nrc_scores <- get_nrc_sentiment(text)
  # Computing subjectivity as the difference between positive and negative
  # sentiments
  data.frame(
    polarity = polarity,
    subjectivity = nrc_scores$positive - nrc_scores$negative
  )
}
# Applying sentiment analysis to the training data: the per-utterance result
# is held in a list column and then widened into one column per measure
utterance_train_sentiment <- unnest_wider(
  mutate(
    utterance_train,
    sentiment = map(Utterance_text, function(txt) compute_sentiment(txt))
  ),
  sentiment
)
# Applying sentiment analysis to the validation data in the same way
utterance_validation_sentiment <- unnest_wider(
  mutate(
    utterance_validation,
    sentiment = map(Utterance_text, function(txt) compute_sentiment(txt))
  ),
  sentiment
)
# Aggregating features for each dialogue
# Collapses utterance rows to one row per Dialogue_ID and attaches the
# usefulness table.
#   df         - utterance rows providing Dialogue_ID, Utterance_text,
#                Interlocutor, polarity and subjectivity
#   usefulness - table joined on Dialogue_ID; the inner join drops dialogues
#                that are missing from either side
# Returns the per-dialogue summary columns plus the columns of `usefulness`.
extract_features <- function(df, usefulness) {
  by_dialogue <- group_by(df, Dialogue_ID)
  dialogue_summary <- summarise(
    by_dialogue,
    total_utterances = n(),
    avg_utterance_length = mean(nchar(Utterance_text)),
    student_interactions = sum(Interlocutor == "Student"),
    chatbot_interactions = sum(Interlocutor == "Chatbot"),
    avg_polarity = mean(polarity, na.rm = TRUE),
    avg_subjectivity = mean(subjectivity, na.rm = TRUE),
    max_polarity = max(polarity, na.rm = TRUE),
    min_polarity = min(polarity, na.rm = TRUE),
    polarity_range = max(polarity, na.rm = TRUE) - min(polarity, na.rm = TRUE)
  )
  inner_join(dialogue_summary, usefulness, by = "Dialogue_ID")
}
# Extracting features for the training data
features_train <- extract_features(
  df = utterance_train_sentiment,
  usefulness = usefulness_train
)
# Extracting features for the validation data
features_validation <- extract_features(
  df = utterance_validation_sentiment,
  usefulness = usefulness_validation
)
# Creating Usefulness Group for Boxplot Visualization
# Scores 1-2 are labelled low and 4-5 high; any other score has no matching
# case and therefore gets NA.
features_train <- mutate(
  features_train,
  Usefulness_group = case_when(
    Usefulness_score %in% c(1, 2) ~ "Low Usefulness (Score 1-2)",
    Usefulness_score %in% c(4, 5) ~ "High Usefulness (Score 4-5)"
  )
)
# Boxplot visualization for Total Utterances and Average Polarity
# Both plots keep only rows whose Usefulness_group is one of the two labels
# listed in the filter.
# Total Utterances Boxplot
features_train %>%
  filter(
    Usefulness_group %in%
      c("Low Usefulness (Score 1-2)", "High Usefulness (Score 4-5)")
  ) %>%
  ggplot(aes(x = Usefulness_group, y = total_utterances)) +
  geom_boxplot(aes(fill = Usefulness_group)) +
  labs(
    title = "Total Utterances by Usefulness Group",
    x = "Usefulness Group",
    y = "Total Utterances"
  ) +
  theme_minimal() +
  scale_fill_manual(
    values = c(
      "Low Usefulness (Score 1-2)" = "red",
      "High Usefulness (Score 4-5)" = "green"
    )
  )
# Average Polarity Boxplot
features_train %>%
  filter(
    Usefulness_group %in%
      c("Low Usefulness (Score 1-2)", "High Usefulness (Score 4-5)")
  ) %>%
  ggplot(aes(x = Usefulness_group, y = avg_polarity)) +
  geom_boxplot(aes(fill = Usefulness_group)) +
  labs(
    title = "Average Polarity by Usefulness Group",
    x = "Usefulness Group",
    y = "Average Polarity"
  ) +
  theme_minimal() +
  scale_fill_manual(
    values = c(
      "Low Usefulness (Score 1-2)" = "red",
      "High Usefulness (Score 4-5)" = "green"
    )
  )
# Section 2: Build Initial Machine Learning Model
# Removing Usefulness_group before training: it is derived from the target
# and exists only for the boxplots
features_train <- features_train %>% select(-Usefulness_group)
# Training a Random Forest model
# Dialogue_ID is a row identifier, not a property of the dialogue, so it is
# excluded from the predictors; `Usefulness_score ~ .` silently included it
set.seed(42)
rf_model <- suppressWarnings(
  randomForest(
    Usefulness_score ~ . - Dialogue_ID,
    data = features_train,
    importance = TRUE
  )
)
print(rf_model)
# Predicting on the validation set
predictions <- suppressWarnings(predict(rf_model, features_validation))
# Calculating RMSE (Root Mean Square Error) on the validation set
rmse <- sqrt(mean((predictions - features_validation$Usefulness_score)^2))
print(paste("RMSE on validation set:", rmse))
# Feature importance with colors and rounded values
importance_df <- as.data.frame(importance(rf_model))
# Sorting features from the largest IncNodePurity to the smallest
importance_df <- importance_df[order(-importance_df$IncNodePurity), ]
# Copying the row names into a Feature column for plotting
importance_df$Feature <- rownames(importance_df)
# Plotting feature importance
importance_df %>%
  ggplot(
    aes(
      x = reorder(Feature, IncNodePurity),
      y = round(IncNodePurity, 2),
      fill = IncNodePurity
    )
  ) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Feature Importance in Random Forest Model",
    x = "Features",
    y = "IncNodePurity"
  ) +
  scale_fill_gradient(low = "blue", high = "red") +
  theme_minimal()
# Section 3: Improve the Model
# Tuning the Random Forest model using caret
# Performing cross-validation with 5 folds
train_control <- trainControl(method = "cv", number = 5)
# Dialogue_ID is a row identifier, not a property of the dialogue, so it is
# excluded from the predictors; `Usefulness_score ~ .` silently included it
tuned_rf <- suppressWarnings(train(
  Usefulness_score ~ . - Dialogue_ID,
  data = features_train,
  method = "rf",
  trControl = train_control,
  tuneLength = 5
))
print(tuned_rf)
# Predicting using the tuned model
tuned_predictions <- predict(tuned_rf, features_validation)
# Calculating RMSE for the tuned model on the validation set
tuned_rmse <- sqrt(mean((tuned_predictions - features_validation$Usefulness_score)^2))
print(paste("Tuned RMSE on validation set:", tuned_rmse))
# Section 4: Analyze Generated Dialogue
# Printing the utterance text recorded for Dialogue_ID 1111
dialogue_1111_data <- filter(utterance_train, Dialogue_ID == 1111)
cat("Dialogue text for Dialogue_ID 1111:\n")
print(dialogue_1111_data$Utterance_text)
# Scoring the validation set with the tuned model and reporting its RMSE
tuned_predictions <- predict(tuned_rf, newdata = features_validation)
tuned_rmse <- sqrt(
  mean((features_validation$Usefulness_score - tuned_predictions)^2)
)
print(paste("Tuned RMSE on validation set:", tuned_rmse))
# Predicting the usefulness score for Dialogue_ID 1111 from its feature row
dialogue_1111 <- features_train %>% filter(Dialogue_ID == 1111)
pred_1111 <- predict(tuned_rf, newdata = dialogue_1111)
print(paste("Predicted Usefulness Score for Dialogue_ID 1111:", pred_1111))
# Section 5: Predict Test Set Usefulness
# Feature Engineering: Sentiment Analysis for Test Set
utterance_test_sentiment <- utterance_test %>%
  mutate(sentiment = map(Utterance_text, compute_sentiment)) %>%
  unnest_wider(sentiment)
# Aggregating features for each dialogue in the test set
features_test <- extract_features(utterance_test_sentiment, usefulness_test)
# Predicting on the test set using the tuned model
test_predictions <- predict(tuned_rf, features_test)
# Each prediction must correspond to exactly one row of features_test;
# otherwise the ID lookup below would attach scores to the wrong dialogues.
stopifnot(length(test_predictions) == nrow(features_test))
# features_test comes from a grouped summarise plus an inner join, so its row
# order and row count are not guaranteed to match usefulness_test. Looking
# each dialogue up by Dialogue_ID replaces the earlier positional assignment.
prediction_row <- match(usefulness_test$Dialogue_ID, features_test$Dialogue_ID)
# Failing loudly if any test dialogue has no feature row
stopifnot(!anyNA(prediction_row))
# Preparing the csv file with rounded usefulness scores
usefulness_test$Usefulness_score <- round(unname(test_predictions[prediction_row]), 2)
write.csv(usefulness_test, "Adhikary_34053387_dialogue_usefulness_test.csv", row.names = FALSE)
# Switches the working directory to a user-specific absolute path. Every
# read.csv()/write.csv() call above uses a relative path and has already run
# by this point, so this call does not affect them.
# NOTE(review): presumably a leftover from an interactive session - confirm
# before relying on it; the path will not exist on another machine.
setwd("/Users/abhishekadhikary/Monash/FIT5147/DVP/AbhishekAdhikary_34053387_Code")