-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path02_sample_f8.R
85 lines (69 loc) · 2.61 KB
/
02_sample_f8.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#####################################################################################
##
## File Name: 01_vandy_clean_sample.R
## Date: 2020-03-20
## Author: Daniel Weitzel
## Purpose: Clean and sample 5000 stories from Vanderbilt news
## Date Used: 2020-06-07
## Data Used: full_vandy.csv.zip
## Output File: (none)
## Data Output: Sample data sets for F8 coding
## Data Webpage: (none)
## Log File: (none)
## Notes:
##
#####################################################################################
## Setting working directory
setwd(githubdir)
setwd("notwork_news/")
## Libraries
library("tidyverse")
## Loading the data
source("scripts/01_clean_vandy.R")
df_vandy <-
df_vandy %>%
dplyr::select(broadcast_abstract, year, sample_id, time, length, special, n)
## This samples for 1968 to 1977 first. Here we had the specials included. The code oversampled specials due to the longer
## run time compared to normal news segments. From 1978 on we exclude specials in the sampling. This is done further down.
df_vandy_6877 <-
df_vandy %>%
dplyr::select(-special) %>%
group_by(year) %>%
sample_n(100, weight = time, replace = FALSE) %>%
ungroup() %>%
filter(year < 1978)
## Sample for 1978 to 2019 without including any specials
df_vandy_7819 <-
df_vandy %>%
filter(year > 1977) %>%
filter(special == FALSE) %>%
group_by(year) %>%
sample_n(100, weight = time, replace = FALSE) %>%
ungroup() %>%
select(-special)
## Smaller subsets of the 1978-2019 data set
df_vandy_7887 <-
df_vandy_7819 %>%
filter(year > 1977 & year < 1988)
df_vandy_8897 <-
df_vandy_7819 %>%
filter(year > 1987 & year < 1998)
df_vandy_9807 <-
df_vandy_7819 %>%
filter(year > 1997 & year < 2008)
df_vandy_0817 <-
df_vandy_7819 %>%
filter(year > 2007 & year < 2018)
df_vandy_1819 <-
df_vandy_7819 %>%
filter(year > 2017)
## Export the data set
#write_csv(df_vandy_7819, "data/sample_questions_7819.csv")
#write_csv(df_vandy_6877, "data/subsets_10years/sample_1_questions_6877.csv")
#write_csv(df_vandy_7887, "data/subsets_10years/sample_2_questions_7887.csv")
#write_csv(df_vandy_8897, "data/subsets_10years/sample_3_questions_8897.csv")
#write_csv(df_vandy_9807, "data/subsets_10years/sample_4_questions_9807.csv")
#write_csv(df_vandy_0817, "data/subsets_10years/sample_5_questions_0817.csv")
#write_csv(df_vandy_1819, "data/subsets_10years/sample_6_questions_1819.csv")
rm(df_vandy, df_vandy_6877, df_vandy_7819, df_vandy_7887,
df_vandy_8897, df_vandy_9807, df_vandy_0817, df_vandy_1819)