-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcorrelationwith%changemeansentiment.R
98 lines (80 loc) · 4.12 KB
/
correlationwith%changemeansentiment.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
library(dplyr)
library(ggplot2)
# Assume 'combined_data' is already loaded with appropriate sentiment and emotion word data
# Calculate the total emotion ratio and absolute change in mean sentiment
combined_data <- combined_data %>%
mutate(
mean_sentiment_shifted = abs(mean_sentiment) + 0.000000000000001, # Shift by a small constant to avoid division by zero
percent_change_mean_sentiment = (abs(mean_sentiment) - abs(lag(mean_sentiment, default = first(mean_sentiment)))) / mean_sentiment_shifted * 100
) %>%
filter(!is.na(percent_change_mean_sentiment))
# Removing any potential infinite or missing values from the ratio for safety
combined_data <- combined_data %>%
mutate(total_emotion_ratio = ifelse(is.infinite(total_emotion_ratio) | is.na(total_emotion_ratio), 0, total_emotion_ratio))
# Plotting the relationship using ggplot2
ggplot(combined_data, aes(x = scaled_buffer_ratio*scaled_neutral_ratio, y = mean_sentiment_shifted)) +
geom_point(alpha = 0.5) + # Plot points with partial transparency
geom_smooth(method = "lm", color = "blue", se = FALSE) + # Add a linear regression line without a confidence envelope
labs(title = "Relationship Between Absolute Change in Mean Sentiment and Total Emotion Ratio",
x = "Product of Scaled Positive & Negative Ratio",
y = "Absolute Change in Mean Sentiment") +
theme_minimal() # Use a minimal theme for a clean look
##########################correlation##########
library(dplyr)
library(ggplot2)
# Assuming 'combined_data' is already loaded and contains all the variables
# Calculate the absolute change in mean sentiment (as an example)
combined_data <- combined_data %>%
mutate(
mean_sentiment_lag = lag(mean_sentiment, default = first(mean_sentiment)),
absolute_change_in_sentiment = abs(mean_sentiment - mean_sentiment_lag)
) %>%
filter(!is.na(absolute_change_in_sentiment))
# Create a data frame to store correlation coefficients
correlations <- data.frame(
variable = character(),
correlation_with_y = numeric(),
stringsAsFactors = FALSE
)
# Assuming 'combined_data' has been preprocessed appropriately
variable_names <- setdiff(names(combined_data), c("absolute_change_in_sentiment", "mean_sentiment_lag", "mean_sentiment"))
# Initialize a data frame to store correlation coefficients
correlations <- data.frame(variable = character(), correlation_with_y = numeric(), stringsAsFactors = FALSE)
# Calculate correlation for each numeric variable with 'absolute_change_in_sentiment'
for(var in variable_names) {
# Check if the variable is numeric
if(is.numeric(combined_data[[var]])) {
cor_value <- cor(combined_data[[var]], combined_data$absolute_change_in_sentiment, use = "complete.obs")
correlations <- rbind(correlations, data.frame(variable = var, correlation_with_y = cor_value))
} else {
# Optionally, print a message if the variable is not numeric
message("Variable '", var, "' is not numeric and will be skipped.")
}
}
# View the correlations data frame
print(correlations)
# Sort by absolute value of correlation to identify the strongest relationships
correlations <- correlations %>%
arrange(desc(abs(correlation_with_y)))
# View correlations
print(correlations)
# Now let's create a function to plot correlation graphs for the top n variables with the highest absolute correlation
plot_correlation <- function(data, variable_name, response_variable) {
ggplot(data, aes_string(x = variable_name, y = response_variable)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "blue", se = FALSE) +
labs(title = paste("Relationship between", response_variable, "and", variable_name),
x = variable_name,
y = response_variable) +
theme_minimal()
}
# Apply the function to plot for the top n variables
# For example, to plot for the top 5 variables
top_n <- 5
plots_list <- lapply(head(correlations$variable, top_n), function(var) {
plot_correlation(combined_data, var, "absolute_change_in_sentiment")
})
# Display the plots (they will appear in your R plotting window)
for (p in plots_list) {
print(p)
}