Merge pull request #10 from adibender/devel

Fix scrape functions
adibender · Mar 16, 2017 · a14aac7 · a14aac7
2 parents d6b0dce + 1aa49bd
commit a14aac7
Show file tree

Hide file tree

Showing 16 changed files with 240 additions and 32 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: coalitions
 Type: Package
 Title: Coalition probabilities in multi-party democracies
-Version: 0.0.422
+Version: 0.0.535
 Authors@R: person("Andreas", "Bender", , "andreas.bender@stat.uni-muenchen.de", role = c("aut", "cre"))
 Description: An Implementation of a Monte Carlo method to calculate
     probabilities for a coalition majority based on survey results.
@@ -11,17 +11,18 @@ Imports:
     magrittr,
     checkmate,
     reshape2,
-    gtools, 
+    gtools,
     lubridate,
     stringr,
-    dplyr, 
+    dplyr,
     tidyr,
-    rvest, 
+    rvest,
     xml2,
     ggplot2,
     plotly
 Suggests:
     testthat,
     covr
+Encoding: UTF-8
 License: MIT + file LICENSE
 RoxygenNote: 6.0.1
diff --git a/NAMESPACE b/NAMESPACE
@@ -5,10 +5,12 @@ export(collapse_parties)
 export(cprob_tab)
 export(dHondt)
 export(draw_from_posterior)
+export(draw_posterior)
 export(get_coalition_probabilities)
 export(get_coalition_probability)
 export(get_entryprobability)
 export(get_seat_distribution)
+export(get_seats)
 export(redistribute)
 export(scrape_wahlrecht)
 export(scrape_wahlumfragen)
@@ -18,7 +20,9 @@ import(dplyr)
 import(magrittr)
 import(parallel)
 import(rvest)
+importFrom(dplyr,bind_rows)
 importFrom(dplyr,left_join)
+importFrom(dplyr,tbl_df)
 importFrom(gtools,rdirichlet)
 importFrom(lubridate,dmy)
 importFrom(lubridate,month)
@@ -27,4 +31,5 @@ importFrom(reshape2,melt)
 importFrom(stats,setNames)
 importFrom(stringr,str_sub)
 importFrom(tidyr,gather)
+importFrom(tidyr,nest)
 importFrom(xml2,read_html)
diff --git a/R/SaintLagueScheppers.R b/R/SaintLagueScheppers.R
@@ -44,4 +44,35 @@ sls <- function(survey, seats = 598, hurdle = 0.05, epsilon = 10e-6) {
 
     survey
 
+}
+
+#' @rdname sls 
+#' @inheritParams sls
+sls2 <- function(survey, seats = 598, hurdle = 0.05, epsilon = 10e-6) {
+
+    #get votes.in.perc after excluding parties with votes.in.perc < 0.05 and "others"
+    survey <- redistribute2(survey, hurdle = hurdle)
+
+    # check for data validity
+    if( abs(sum(survey$PERCENT) - 1) > epsilon  ) 
+        stop("wrong percentages provided in sls() function")
+
+    divisor.mat <- sum(survey$VOTES)/vapply(survey$VOTES, "/", numeric(599),
+        seq(0.5, 598.5, by = 1))
+    colnames(divisor.mat) <- survey$PARTY
+
+    m.mat <- melt(divisor.mat, id.vars = "party")
+    m.mat <- m.mat[rank(m.mat$value, ties.method = "random") <= seats, ]
+    rle.seats <- rle(as.character(m.mat$Var2))
+    seat.mat <- bind_cols(list(PARTY = rle.seats$values, SEATS = rle.seats$lengths))
+
+    if( nrow(seat.mat) != nrow(survey) ) 
+        stop ("Wrong number of parties after seat distribution")
+    if( sum(seat.mat$SEATS) != seats ) 
+        stop(paste("Number of seats distributed not equal to", seats))
+
+    survey <- left_join(survey, seat.mat, by = "PARTY")
+
+    survey
+
 }
diff --git a/R/drawElectionsFromPosterior.R b/R/drawElectionsFromPosterior.R
@@ -15,22 +15,48 @@
 draw_from_posterior <- function(survey, nsim, seed = NULL, prior = NULL) {
 
     ## calculate posteriori
-    if( is.null(prior) ) {
-        prior <- rep(0.5, nrow(survey))
-    }
+  if(is.null(prior)) {
+    prior <- rep(0.5, nrow(survey))
+  } else {
+    if(length(prior) != nrow(survey))
+      stop("length of prior weights and number of observations differ")
+  }
 
-    else {
-        if( length(prior) != nrow(survey) )
-            stop("length of prior weights and number of observations differ")
-    }
-
-    alpha <- survey$votes + prior
+  alpha <- survey$votes + prior
 
     ## draw n.sim random dirichlet numbers/vectors with concentration weights alpha
-    if( !is.null(seed) ) set.seed(seed)
+  if(!is.null(seed)) set.seed(seed)
     rn <- as.data.frame(rdirichlet(nsim, alpha = alpha))
-    colnames(rn) <- survey$party
+  colnames(rn) <- survey$party
 
-    rn
+  rn
 
 }
+
+#' @rdname draw_from_posterior
+#' @inheritParams draw_from_posterior
+#' @importFrom dplyr tbl_df
+#' @export
+draw_posterior <- function(
+  survey, 
+  nsim, 
+  seed = NULL, 
+  prior = NULL) {
+
+  ## set seed if provided
+  if(!is.null(seed)) set.seed(seed)
+    ## calculate posteriori
+  if(is.null(prior)) {
+    prior <- rep(0.5, nrow(survey))
+  } else {
+    if(length(prior) != nrow(survey))
+      stop("length of prior weights and number of observations differ")
+  }
+
+  ## draw n.sim random dirichlet numbers/vectors with concentration weights alpha
+  draws <- rdirichlet(nsim, alpha = survey$VOTES + prior)
+  colnames(draws) <- survey$PARTY
+
+  return(tbl_df(draws))
+
+}
diff --git a/R/getCoalitionProbability.R b/R/getCoalitionProbability.R
@@ -38,6 +38,37 @@ get_coalition_probability <- function(seat.tab, coalition, superior = NULL,
 
 }
 
+#' @rdname get_coalition_probability
+get_probability <- function(
+  seat.tab, 
+  coalition, 
+  superior = NULL,
+  majority = 300) {
+
+  ind.coalition <- sapply(seat.tab, function(z) {
+    sum(z$SEATS[z$PARTY %in% coalition]) >= majority
+  })
+
+  if( !any(is.null(superior)) ) {
+
+    ind.sup.list <- lapply(superior, function(superior.coalition) {
+      sapply(seat.tab, function(z) {
+        sum(z$SEATS[z$PARTY %in% superior.coalition]) >=
+        majority
+      })
+    })
+  }
+  else{
+    ind.sup.list <- list(rep(FALSE, length(ind.coalition)))
+  }
+
+  ind.sup <- Reduce("|", ind.sup.list)
+
+  mean(ind.coalition & !(ind.sup))
+
+}
+
+
 #' Calculate coalition probabilities
 #' 
 #' Given a list containing simulated seats obtained per party
@@ -70,6 +101,25 @@ get_coalition_probabilities <- function(
 
 }
 
+#' @rdname get_coalition_probabilities
+get_probabilities <- function(
+  seat.tabs, 
+  coalitions,
+  superior.coalitions, 
+  majority = 300) {
+
+  coal.probs <- sapply(seq_along(coalitions), function(z) {
+    get_probability(seat.tabs, coalitions[[z]],
+      superior.coalitions[[z]], majority = majority)
+  })
+  tibble(
+    COALITION   = sapply(coalitions, paste0, collapse          = "-"),
+    SUPERIOR    = sapply(superior.coalitions, paste0, collapse = "-"),
+    PROBABILITY = coal.probs)
+
+}
+
+
 
 # #' @importFrom reshape2 melt
 #

diff --git a/R/getSeatDistribution.R b/R/getSeatDistribution.R
@@ -13,7 +13,6 @@
 #' @keywords seat distribution
 #' @seealso \code{\link{draw_from_posterior}}, \code{\link{sls}}, 
 #' \code{\link{dHondt}}
-
 get_seat_distribution <- function(dirichlet.draws, survey, distrib.fun = sls, 
   samplesize = NULL, ...) {
 
@@ -32,4 +31,34 @@ get_seat_distribution <- function(dirichlet.draws, survey, distrib.fun = sls,
     ## return results
   sim.results
 
+}
+
+
+#' @rdname get_seat_distribution
+#' @param mc.cores Number of cores to be used in parallel.
+#' See \code{\link[parallel]{mclapply}}.
+#' @importFrom dplyr bind_rows
+#' @export 
+get_seats <- function(
+  dirichlet.draws, 
+  survey, 
+  mc.cores    = 1,
+  distrib.fun = sls,
+  samplesize  = NULL, ...) {
+
+  if( is.null(samplesize) ) samplesize <- sum(survey$VOTES)
+
+    sim.surveys <- mclapply(seq_len(nrow(dirichlet.draws)), function(z) {
+      survey$PERCENT <- as.numeric(dirichlet.draws[z, ])
+      survey$VOTES <- survey$PERCENT * samplesize
+      survey
+
+    }, mc.cores=mc.cores)
+
+  ## calculate seat distribution for each simulation
+  sim.results <- mclapply(sim.surveys, distrib.fun, ..., mc.cores=mc.cores)
+
+  ## return results
+  bind_rows(sim.results)
+
 }
diff --git a/R/redistribute.R b/R/redistribute.R
@@ -17,3 +17,14 @@ redistribute <- function(survey, hurdle = 0.05) {
 	survey
 
 }
+
+#' @rdname redistribute
+#' @inheritParams redistribute
+redistribute2 <- function(survey, hurdle = 0.05) {
+
+	survey <- survey[survey$PERCENT >= hurdle & survey$PARTY != "SONSTIGE", ]
+	survey$PERCENT <- survey$VOTES/sum(survey$VOTES)
+
+	survey
+
+}
diff --git a/R/scrappers.R b/R/scrappers.R
@@ -14,16 +14,35 @@ sanitize_percent <- function(vec) {
 
 }
 
+#' Sanitze character vector 
+#' 
+#' Substitute all german "Umlaute"
+#' @param x A character vector. 
+sanitize_strings <- function(x) {
+
+	# lower case letters 
+	x <- gsub("\u00f6", "oe", x)
+	x <- gsub("\u00fc", "ue", x)
+	x <- gsub("\u00e4", "ae", x)
+	# upper case letters 
+	x <- gsub("\u00d6", "Oe", x)
+	x <- gsub("\u00dc", "Ue", x)
+	x <- gsub("\u00c4", "Ae", x)
+
+	return(x)
+
+}
+
 #' Sanitize column names 
 #' 
 #' @param df A data frame with party names with special characters that need 
 #' to be sanitized.
 sanitize_colnames <- function(df) {
 
-	cdf <- toupper(colnames(df))
+	cdf <- colnames(df)
+	cdf <- toupper(sanitize_strings(cdf))
 	cdf <- sub("CDU/CSU", "CDU", cdf)
-	cdf <- sub("GRÜNE", "GRUENE", cdf)
-	cdf <- sub("Grünen", "GRUENE", cdf)
+	cdf <- sub("GRUENEN", "GRUENE", cdf)
 
 	colnames(df) <- cdf
 
@@ -71,16 +90,17 @@ scrape_wahlrecht <- function(
 		ind.row.remove <- -1:-3
 	}
 
-	atab           <- atab[ind.row.remove,]
+	atab           <- atab[ind.row.remove, ]
 	atab           <- atab[-nrow(atab), ]
+	colnames(atab) <- c("Datum", colnames(atab)[-1])
 	ind.empty      <- sapply(atab, function(z) all(z==""))
 	atab           <- atab[, !ind.empty]
-	colnames(atab) <- c("Datum", colnames(atab)[-1])
+
 	atab <- sanitize_colnames(atab)
 	parties <- colnames(atab)[colnames(atab) %in% parties]
 	# transform percentage string to numerics
-	atab[, parties] <- apply(atab[, parties], 2,  gsub,  pattern=" %",replacement="", fixed=TRUE)
-	atab[, parties] <- apply(atab[, parties], 2,  gsub,  pattern="," ,replacement=".", fixed=TRUE)
+	atab[, parties] <- apply(atab[, parties], 2,  gsub,  pattern=" %", replacement="", fixed=TRUE)
+	atab[, parties] <- apply(atab[, parties], 2,  gsub,  pattern="," , replacement=".", fixed=TRUE)
 	atab[, parties] <- apply(atab[, parties], 2,  as.numeric)
 
 	atab <-  mutate(atab, DATUM = dmy(DATUM))
@@ -94,7 +114,7 @@ scrape_wahlrecht <- function(
 	} else {
 		## remove special characters from BEFRAGTE column, transform to numeric
 		atab$BEFRAGTE <- gsub("?", "", atab$BEFRAGTE, fixed=TRUE)
-		atab$BEFRAGTE <- gsub("≈", "", atab$BEFRAGTE, fixed=TRUE)
+		atab$BEFRAGTE <- gsub("\u2248", "", atab$BEFRAGTE, fixed=TRUE)
 		atab$BEFRAGTE <- gsub(".", "", atab$BEFRAGTE, fixed=TRUE)
 		atab$BEFRAGTE <- as.numeric(atab$BEFRAGTE)
 	}
@@ -134,9 +154,7 @@ scrape_wahlumfragen <- function(
 		html_nodes("table") %>% .[[5]] %>%
 		html_table(fill=TRUE) %>%
 		select(-Kommentar)
-	colnames(atab) <- gsub("ö", "oe", colnames(atab))
-	colnames(atab) <- gsub("ä", "ae", colnames(atab))
-	colnames(atab) <- gsub("ü", "ue", colnames(atab))
+	colnames(atab) <- sanitize_strings(colnames(atab))
 	atab <- rename(atab, Datum=Veroeffentlichung)
 
 	# transform percentage string to numerics
@@ -158,8 +176,8 @@ scrape_wahlumfragen <- function(
 #' the data into long format with one row per party. 
 #' @inheritParams scrape_wahlrecht
 #' @param surveys A data frame with one survey per row.  
-#' @import checkmate
-#' @importFrom tidyr gather
+#' @import checkmate magrittr
+#' @importFrom tidyr gather nest
 #' @return Data frame in long format
 #' @export
 #' @examples
@@ -177,6 +195,7 @@ collapse_parties <- function(
 	surveys <- gather(surveys, PARTY, PERCENT, one_of(av.parties)) %>% 
 		arrange(desc(DATUM))
 
-	return(mutate(surveys, PARTY = factor(PARTY, levels=parties)))
+	surveys %<>% mutate(VOTES = PERCENT/100 * BEFRAGTE) %>% 
+		nest(PARTY:VOTES, .key="survey")
 
 }
diff --git a/R/zzz.R b/R/zzz.R
@@ -3,7 +3,7 @@
 	if(getRversion() >= "2.5.1") {
 		utils::globalVariables(
 			c("Datum", "Kommentar", ",", ".", "Veroeffentlichung", "BEFRAGTE", "DATUM", 
-				"PARTY", "PERCENT", "V1", "V11", "V12", "ZEITRAUM", "total"))
+				"PARTY", "PERCENT", "V1", "V11", "V12", "ZEITRAUM", "total", "VOTES"))
 	}
 
 	invisible()