# Cleaning_Of_2021_Twitter_Info_Ops.R
# 116 lines (93 loc) · 4.17 KB
# (Repository-viewer page text captured with this file -- "Notifications",
# "You must be signed in to change notification settings", "Fork 0",
# "Copy path" -- is not part of the script.)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
######Package load########
# Author: Cody Wilson
# Loads packages using the groundhog package. Groundhog enables reproducible
# analysis by recording the date the packages were used. It then downloads those
# exact package version upon later analysis, ensuring scripts run as intended.
library("groundhog")
pkgs <- c("tidyverse", "skimr")
groundhog.library(pkgs, "2022-07-11")
# Disabling scientific notation to improve readability of certain variables.
# options(scipen=0, digits=7) # To return to default setting
options(scipen = 999)
# Setting seed for results replication
set.seed(12345)
######Data Load##########
# Load the three datasets of removed tweets acquired from Twitter. Feb 2021 removal
# of Russian IRA accounts, Feb 2021 removal of Iranian accounts, and Dec 2021
# removal of PRC Xinjiang-focused accounts.
russia_2021 <- read_csv("Russia 2021/Feb 2021/hashed_2020_12_IRA_202012_IRA_202012_tweets_csv_hashed.csv")
iran_2021<- read_csv("Iran 2021/hashed_2020_12_iran_202012_iran_202012_tweets_csv_hashed.csv")
china_2021 <- read_csv("China 2021/Peoples_republic_of_China_Xinjiang_DEC2021_tweets/CNHU_0621_tweets_csv_hashed_2021.csv")
## Examine raw data
skim_without_charts(russia_2021)
skim_without_charts(iran_2021)
skim_without_charts(china_2021)
###### Data cleaning ######
# The same cleaning steps apply to all three datasets, so they live in one
# helper instead of being repeated per dataset.

# Tweet and account identifier columns. They are stored as character so they
# are treated as identifiers rather than summarised as numbers.
# NOTE(review): as.character() cannot restore digits that were lost if a column
# was parsed as a double on read; IDs above 2^53 must be read as character in
# the first place -- confirm the column types reported by skim.
id_columns <- c(
  "tweetid",
  "in_reply_to_userid",
  "in_reply_to_tweetid",
  "quoted_tweet_tweetid",
  "retweet_userid",
  "retweet_tweetid",
  "user_mentions"
)

# Columns whose values are wrapped in square brackets and single quotes.
list_columns <- c("user_mentions", "hashtags")

# Clean one tweet dataset.
#
# tweets: a data frame containing every column named in `id_columns` and
#         `list_columns`.
# Returns `tweets` with:
#   * the identifier columns converted to character,
#   * `hashtags` lower-cased so hashtag usage is counted case-insensitively,
#   * square brackets and single quotes stripped from the list columns,
#   * strings left empty by that stripping replaced with NA.
clean_tweets <- function(tweets) {
  tweets %>%
    mutate(across(all_of(id_columns), as.character)) %>%
    mutate(hashtags = tolower(hashtags)) %>%
    mutate(across(all_of(list_columns), function(x) gsub("\\[|\\]", "", x))) %>%
    mutate(across(all_of(list_columns), function(x) gsub("'", "", x))) %>%
    mutate(across(all_of(list_columns), function(x) na_if(x, "")))
}

russia_2021 <- clean_tweets(russia_2021)
iran_2021 <- clean_tweets(iran_2021)
china_2021 <- clean_tweets(china_2021)

## Examine cleaned data
skim_without_charts(russia_2021)
skim_without_charts(iran_2021)
skim_without_charts(china_2021)
# no missing values were noted in any variables of interest, cleaning is completed
###### Writing data ######
# Write each cleaned dataset to its own CSV file for later analysis.
# The list is keyed by output path, so iwalk() hands every dataset to
# write_csv() together with the file it belongs in.
cleaned_outputs <- list(
  "russia_2021_cleaned.csv" = russia_2021,
  "iran_2021_cleaned.csv" = iran_2021,
  "china_2021_cleaned.csv" = china_2021
)
iwalk(cleaned_outputs, function(tweets, path) write_csv(tweets, path))