forked from PYannick/HighFrequencyChecks
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathisinterviewtooshortforthehouseholdsize.R
128 lines (112 loc) · 7.32 KB
/
isinterviewtooshortforthehouseholdsize.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# WARNING - Generated by {fusen} from dev/function_documentation.Rmd: do not edit by hand
#' @name isInterviewTooShortForTheHouseholdSize
#' @rdname isInterviewTooShortForTheHouseholdSize
#' @title Check that the duration relative to the household size of each interview is more than a threshold
#' @description This function check that the duration relative to the household size of each interview is more than a specified threshold.
#' There is an option to automatically mark for deletion the surveys which are under the threshold.
#' Warning: If there are uncorrected mistakes in the survey dates, it can lead to have the length of the survey in seconds and this check will not performed well
#' @param ds dataset containing the survey (from kobo): labelled data.frame
#' @param checkperiod if not null number of day before today when the check should be made
#' @param surveyConsent name of the field in the dataset where the survey consent is stored: string
#' @param consentForValidSurvey value defined in the kobo form to acknowledge the surveyed person gave his consent: string
#' @param dates name of the fields where the information about the start and end date of the survey is stored: list of string (c('start_date','end_date'))
#' @param householdSize name of the field in the dataset where the household size is stored: string
#' @param minimumSurveyDurationByIndividual minimum acceptable survey duration for one individual in minutes: integer
#' @param uniquerespondantID name of the field where the survey unique ID is stored: string
#' @param enumeratorID name of the field where the enumerator ID is stored: string
#' @param reportingColumns (Optional, by default it is built from the enumeratorID and the uniquerespondantID) name of the columns from the dataset you want in the result: list of string (c('col1','col2',...))
#' @param deleteIsInterviewTooShortForTheHouseholdSize (Optional, by default set as FALSE) if TRUE, the survey in error will be marked as 'deletedIsInterviewTooShortForTheHouseholdSize': boolean (TRUE/FALSE)
#'
#' @return result a list that includes:
#' * dst same dataset as the inputed one but with survey marked for deletion if errors are found and delete=TRUE (or NULL)
#' * ret_log list of the errors found (or NULL)
#' * var a list of value (or NULL)
#' * graph graphical representation of the results (or NULL)
#'
#' @export isInterviewTooShortForTheHouseholdSize
#' @examples
#' load(system.file("sample_dataset.RData", package = "HighFrequencyChecks"))
#' ds <- sample_dataset
#' surveyConsent <- "survey_consent"
#' dates <- c("survey_start","end_survey")
#' householdSize <-"consent_received.respondent_info.hh_size"
#' uniquerespondantID <- "X_uuid"
#' enumeratorID <- "enumerator_id"
#' minimumSurveyDurationByIndividual <- 10
#' reportingColumns <- c(enumeratorID, uniquerespondantID)
#'
#' result <- isInterviewTooShortForTheHouseholdSize(ds = ds,
#' surveyConsent=surveyConsent,
#' dates=dates,
#' householdSize=householdSize,
#' minimumSurveyDurationByIndividual=minimumSurveyDurationByIndividual,
#' reportingColumns=reportingColumns,
#' deleteIsInterviewTooShortForTheHouseholdSize=FALSE)
#' knitr::kable(head(result[["ret_log"]], 10))
#' print(result[["graph"]])
isInterviewTooShortForTheHouseholdSize <- function(ds=NULL,
surveyConsent=NULL,
dates=NULL,
householdSize=NULL,
minimumSurveyDurationByIndividual=10,
reportingColumns=c(enumeratorID, uniquerespondantID),
deleteIsInterviewTooShortForTheHouseholdSize=FALSE){
if(is.null(ds) | nrow(ds)==0 | !is.data.frame(ds)){
stop("Please provide the dataset")
}
if(is.null(surveyConsent) | !is.character(surveyConsent)){
stop("Please provide the field where the survey consent is stored")
}
if(is.null(dates) | !is.character(dates) | length(dates)!=2){
stop("Please provide the fields where the survey start and end date is stored (c('start_date','end_date'))")
}
if(is.null(householdSize) | !is.character(householdSize)){
stop("Please provide the field where the HH size is stored")
}
if(is.null(reportingColumns) | !is.character(reportingColumns)){
stop("Please provide the columns you want in the result (include the enumerator id column if you want to check by enumerator)")
}
if(is.null(minimumSurveyDurationByIndividual) | !is.numeric(minimumSurveyDurationByIndividual)){
stop("Please provide the minimum survey time to check against")
}
if(is.null(deleteIsInterviewTooShortForTheHouseholdSize) | !is.logical(deleteIsInterviewTooShortForTheHouseholdSize)){
stop("Please provide the delete action to be done (TRUE/FALSE)")
}
tmp <- data.frame(ds[reportingColumns], HHSize=ds[,householdSize],
SurveyLength=as.double.difftime((readr::parse_datetime(ds[[dates[[2]]]]) -
readr::parse_datetime(ds[[dates[[1]]]])),
units = "secs") / 60)
# tmp<-data.frame(ds[reportingColumns], HHSize=ds[,HouseholdSize], SurveyLength=as.double.difftime((strptime(ds[[dates[[2]]]],"%Y-%m-%dT%R") - strptime(ds[[dates[[1]]]],"%Y-%m-%dT%R")), units = "secs")/60)
if(deleteIsInterviewTooShortForTheHouseholdSize){
ds[,surveyConsent][(tmp$SurveyLength/tmp$HHSize)<minimumSurveyDurationByIndividual] <- "deletedIsInterviewTooShortForTheHouseholdSize"
}
ret_log <- subset(tmp, (SurveyLength/HHSize) < minimumSurveyDurationByIndividual)
check <- data.frame(categories=c("OK", "NOK"),
Nb = c( length(ds[,1])-length(ret_log[,1]),
length(ret_log[,1])))
t1 <- check
t1$fraction <- t1$Nb / sum(t1$Nb)
t1 <- t1[order(t1$fraction), ]
t1$ymax <- cumsum(t1$fraction)
t1$ymin <- c(0, utils::head(t1$ymax, n=-1))
graph <- ggplot2::ggplot(t1, ggplot2::aes(fill=categories,
ymax=ymax,
ymin=ymin,
xmax=4,
xmin=3)) +
ggplot2::geom_rect(colour="grey30") +
ggplot2::coord_polar(theta="y") +
ggplot2::xlim(c(1, 4)) +
unhcrthemes::theme_unhcr(font_size = 14) +
ggplot2::theme(panel.grid=ggplot2::element_blank()) +
ggplot2::theme(axis.text=ggplot2::element_blank()) +
ggplot2::theme(axis.ticks=ggplot2::element_blank()) +
ggplot2::theme(legend.position='none') +
ggplot2::labs(title= "Are there interviews appearing too short based on household Size?",
subtitle = paste0("Errors: ", round(t1$fraction[t1$categories == 'NOK']*100,2), "%"))
result <- list( dst = ds, # same dataset as the inputed one but with survey marked for deletion if errors are found and delete=TRUE (or NULL)
ret_log = ret_log , # list of the errors found (or NULL)
var = NULL, # a list of value (or NULL)
graph = graph) # graphical representation of the results (or NULL)
return(result)
}