Skip to content

Commit

Permalink
Merge pull request #10 from adibender/devel
Browse files Browse the repository at this point in the history
Fix scrape functions
  • Loading branch information
adibender authored Mar 16, 2017
2 parents d6b0dce + 1aa49bd commit a14aac7
Show file tree
Hide file tree
Showing 16 changed files with 240 additions and 32 deletions.
9 changes: 5 additions & 4 deletions DESCRIPTION
100755 → 100644
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: coalitions
Type: Package
Title: Coalition probabilities in multi-party democracies
Version: 0.0.422
Version: 0.0.535
Authors@R: person("Andreas", "Bender", , "andreas.bender@stat.uni-muenchen.de", role = c("aut", "cre"))
Description: An Implementation of a Monte Carlo method to calculate
probabilities for a coalition majority based on survey results.
Expand All @@ -11,17 +11,18 @@ Imports:
magrittr,
checkmate,
reshape2,
gtools,
gtools,
lubridate,
stringr,
dplyr,
dplyr,
tidyr,
rvest,
rvest,
xml2,
ggplot2,
plotly
Suggests:
testthat,
covr
Encoding: UTF-8
License: MIT + file LICENSE
RoxygenNote: 6.0.1
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ export(collapse_parties)
export(cprob_tab)
export(dHondt)
export(draw_from_posterior)
export(draw_posterior)
export(get_coalition_probabilities)
export(get_coalition_probability)
export(get_entryprobability)
export(get_seat_distribution)
export(get_seats)
export(redistribute)
export(scrape_wahlrecht)
export(scrape_wahlumfragen)
Expand All @@ -18,7 +20,9 @@ import(dplyr)
import(magrittr)
import(parallel)
import(rvest)
importFrom(dplyr,bind_rows)
importFrom(dplyr,left_join)
importFrom(dplyr,tbl_df)
importFrom(gtools,rdirichlet)
importFrom(lubridate,dmy)
importFrom(lubridate,month)
Expand All @@ -27,4 +31,5 @@ importFrom(reshape2,melt)
importFrom(stats,setNames)
importFrom(stringr,str_sub)
importFrom(tidyr,gather)
importFrom(tidyr,nest)
importFrom(xml2,read_html)
31 changes: 31 additions & 0 deletions R/SaintLagueScheppers.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,35 @@ sls <- function(survey, seats = 598, hurdle = 0.05, epsilon = 10e-6) {

survey

}

#' @rdname sls
#' @inheritParams sls
sls2 <- function(survey, seats = 598, hurdle = 0.05, epsilon = 10e-6) {

#get votes.in.perc after excluding parties with votes.in.perc < 0.05 and "others"
survey <- redistribute2(survey, hurdle = hurdle)

# check for data validity
if( abs(sum(survey$PERCENT) - 1) > epsilon )
stop("wrong percentages provided in sls() function")

divisor.mat <- sum(survey$VOTES)/vapply(survey$VOTES, "/", numeric(599),
seq(0.5, 598.5, by = 1))
colnames(divisor.mat) <- survey$PARTY

m.mat <- melt(divisor.mat, id.vars = "party")
m.mat <- m.mat[rank(m.mat$value, ties.method = "random") <= seats, ]
rle.seats <- rle(as.character(m.mat$Var2))
seat.mat <- bind_cols(list(PARTY = rle.seats$values, SEATS = rle.seats$lengths))

if( nrow(seat.mat) != nrow(survey) )
stop ("Wrong number of parties after seat distribution")
if( sum(seat.mat$SEATS) != seats )
stop(paste("Number of seats distributed not equal to", seats))

survey <- left_join(survey, seat.mat, by = "PARTY")

survey

}
50 changes: 38 additions & 12 deletions R/drawElectionsFromPosterior.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,48 @@
draw_from_posterior <- function(survey, nsim, seed = NULL, prior = NULL) {

## calculate posteriori
if( is.null(prior) ) {
prior <- rep(0.5, nrow(survey))
}
if(is.null(prior)) {
prior <- rep(0.5, nrow(survey))
} else {
if(length(prior) != nrow(survey))
stop("length of prior weights and number of observations differ")
}

else {
if( length(prior) != nrow(survey) )
stop("length of prior weights and number of observations differ")
}

alpha <- survey$votes + prior
alpha <- survey$votes + prior

## draw n.sim random dirichlet numbers/vectors with concentration weights alpha
if( !is.null(seed) ) set.seed(seed)
if(!is.null(seed)) set.seed(seed)
rn <- as.data.frame(rdirichlet(nsim, alpha = alpha))
colnames(rn) <- survey$party
colnames(rn) <- survey$party

rn
rn

}

#' @rdname draw_from_posterior
#' @inheritParams draw_from_posterior
#' @importFrom dplyr tbl_df
#' @export
draw_posterior <- function(
survey,
nsim,
seed = NULL,
prior = NULL) {

## set seed if provided
if(!is.null(seed)) set.seed(seed)
## calculate posteriori
if(is.null(prior)) {
prior <- rep(0.5, nrow(survey))
} else {
if(length(prior) != nrow(survey))
stop("length of prior weights and number of observations differ")
}

## draw n.sim random dirichlet numbers/vectors with concentration weights alpha
draws <- rdirichlet(nsim, alpha = survey$VOTES + prior)
colnames(draws) <- survey$PARTY

return(tbl_df(draws))

}
50 changes: 50 additions & 0 deletions R/getCoalitionProbability.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,37 @@ get_coalition_probability <- function(seat.tab, coalition, superior = NULL,

}

#' @rdname get_coalition_probability
get_probability <- function(
seat.tab,
coalition,
superior = NULL,
majority = 300) {

ind.coalition <- sapply(seat.tab, function(z) {
sum(z$SEATS[z$PARTY %in% coalition]) >= majority
})

if( !any(is.null(superior)) ) {

ind.sup.list <- lapply(superior, function(superior.coalition) {
sapply(seat.tab, function(z) {
sum(z$SEATS[z$PARTY %in% superior.coalition]) >=
majority
})
})
}
else{
ind.sup.list <- list(rep(FALSE, length(ind.coalition)))
}

ind.sup <- Reduce("|", ind.sup.list)

mean(ind.coalition & !(ind.sup))

}


#' Calculate coalition probabilities
#'
#' Given a list containing simulated seats obtained per party
Expand Down Expand Up @@ -70,6 +101,25 @@ get_coalition_probabilities <- function(

}

#' @rdname get_coalition_probabilities
get_probabilities <- function(
seat.tabs,
coalitions,
superior.coalitions,
majority = 300) {

coal.probs <- sapply(seq_along(coalitions), function(z) {
get_probability(seat.tabs, coalitions[[z]],
superior.coalitions[[z]], majority = majority)
})
tibble(
COALITION = sapply(coalitions, paste0, collapse = "-"),
SUPERIOR = sapply(superior.coalitions, paste0, collapse = "-"),
PROBABILITY = coal.probs)

}



# #' @importFrom reshape2 melt
#
Expand Down
31 changes: 30 additions & 1 deletion R/getSeatDistribution.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
#' @keywords seat distribution
#' @seealso \code{\link{draw_from_posterior}}, \code{\link{sls}},
#' \code{\link{dHondt}}

get_seat_distribution <- function(dirichlet.draws, survey, distrib.fun = sls,
samplesize = NULL, ...) {

Expand All @@ -32,4 +31,34 @@ get_seat_distribution <- function(dirichlet.draws, survey, distrib.fun = sls,
## return results
sim.results

}


#' @rdname get_seat_distribution
#' @param mc.cores Number of cores to be used in parallel.
#' See \code{\link[parallel]{mclapply}}.
#' @importFrom dplyr bind_rows
#' @export
get_seats <- function(
dirichlet.draws,
survey,
mc.cores = 1,
distrib.fun = sls,
samplesize = NULL, ...) {

if( is.null(samplesize) ) samplesize <- sum(survey$VOTES)

sim.surveys <- mclapply(seq_len(nrow(dirichlet.draws)), function(z) {
survey$PERCENT <- as.numeric(dirichlet.draws[z, ])
survey$VOTES <- survey$PERCENT * samplesize
survey

}, mc.cores=mc.cores)

## calculate seat distribution for each simulation
sim.results <- mclapply(sim.surveys, distrib.fun, ..., mc.cores=mc.cores)

## return results
bind_rows(sim.results)

}
11 changes: 11 additions & 0 deletions R/redistribute.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,14 @@ redistribute <- function(survey, hurdle = 0.05) {
survey

}

#' @rdname redistribute
#' @inheritParams redistribute
redistribute2 <- function(survey, hurdle = 0.05) {

survey <- survey[survey$PERCENT >= hurdle & survey$PARTY != "SONSTIGE", ]
survey$PERCENT <- survey$VOTES/sum(survey$VOTES)

survey

}
47 changes: 33 additions & 14 deletions R/scrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,35 @@ sanitize_percent <- function(vec) {

}

#' Sanitze character vector
#'
#' Substitute all german "Umlaute"
#' @param x A character vector.
sanitize_strings <- function(x) {

# lower case letters
x <- gsub("\u00f6", "oe", x)
x <- gsub("\u00fc", "ue", x)
x <- gsub("\u00e4", "ae", x)
# upper case letters
x <- gsub("\u00d6", "Oe", x)
x <- gsub("\u00dc", "Ue", x)
x <- gsub("\u00c4", "Ae", x)

return(x)

}

#' Sanitize column names
#'
#' @param df A data frame with party names with special characters that need
#' to be sanitized.
sanitize_colnames <- function(df) {

cdf <- toupper(colnames(df))
cdf <- colnames(df)
cdf <- toupper(sanitize_strings(cdf))
cdf <- sub("CDU/CSU", "CDU", cdf)
cdf <- sub("GRÜNE", "GRUENE", cdf)
cdf <- sub("Grünen", "GRUENE", cdf)
cdf <- sub("GRUENEN", "GRUENE", cdf)

colnames(df) <- cdf

Expand Down Expand Up @@ -71,16 +90,17 @@ scrape_wahlrecht <- function(
ind.row.remove <- -1:-3
}

atab <- atab[ind.row.remove,]
atab <- atab[ind.row.remove, ]
atab <- atab[-nrow(atab), ]
colnames(atab) <- c("Datum", colnames(atab)[-1])
ind.empty <- sapply(atab, function(z) all(z==""))
atab <- atab[, !ind.empty]
colnames(atab) <- c("Datum", colnames(atab)[-1])

atab <- sanitize_colnames(atab)
parties <- colnames(atab)[colnames(atab) %in% parties]
# transform percentage string to numerics
atab[, parties] <- apply(atab[, parties], 2, gsub, pattern=" %",replacement="", fixed=TRUE)
atab[, parties] <- apply(atab[, parties], 2, gsub, pattern="," ,replacement=".", fixed=TRUE)
atab[, parties] <- apply(atab[, parties], 2, gsub, pattern=" %", replacement="", fixed=TRUE)
atab[, parties] <- apply(atab[, parties], 2, gsub, pattern="," , replacement=".", fixed=TRUE)
atab[, parties] <- apply(atab[, parties], 2, as.numeric)

atab <- mutate(atab, DATUM = dmy(DATUM))
Expand All @@ -94,7 +114,7 @@ scrape_wahlrecht <- function(
} else {
## remove special characters from BEFRAGTE column, transform to numeric
atab$BEFRAGTE <- gsub("?", "", atab$BEFRAGTE, fixed=TRUE)
atab$BEFRAGTE <- gsub("", "", atab$BEFRAGTE, fixed=TRUE)
atab$BEFRAGTE <- gsub("\u2248", "", atab$BEFRAGTE, fixed=TRUE)
atab$BEFRAGTE <- gsub(".", "", atab$BEFRAGTE, fixed=TRUE)
atab$BEFRAGTE <- as.numeric(atab$BEFRAGTE)
}
Expand Down Expand Up @@ -134,9 +154,7 @@ scrape_wahlumfragen <- function(
html_nodes("table") %>% .[[5]] %>%
html_table(fill=TRUE) %>%
select(-Kommentar)
colnames(atab) <- gsub("ö", "oe", colnames(atab))
colnames(atab) <- gsub("ä", "ae", colnames(atab))
colnames(atab) <- gsub("ü", "ue", colnames(atab))
colnames(atab) <- sanitize_strings(colnames(atab))
atab <- rename(atab, Datum=Veroeffentlichung)

# transform percentage string to numerics
Expand All @@ -158,8 +176,8 @@ scrape_wahlumfragen <- function(
#' the data into long format with one row per party.
#' @inheritParams scrape_wahlrecht
#' @param surveys A data frame with one survey per row.
#' @import checkmate
#' @importFrom tidyr gather
#' @import checkmate magrittr
#' @importFrom tidyr gather nest
#' @return Data frame in long format
#' @export
#' @examples
Expand All @@ -177,6 +195,7 @@ collapse_parties <- function(
surveys <- gather(surveys, PARTY, PERCENT, one_of(av.parties)) %>%
arrange(desc(DATUM))

return(mutate(surveys, PARTY = factor(PARTY, levels=parties)))
surveys %<>% mutate(VOTES = PERCENT/100 * BEFRAGTE) %>%
nest(PARTY:VOTES, .key="survey")

}
2 changes: 1 addition & 1 deletion R/zzz.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
if(getRversion() >= "2.5.1") {
utils::globalVariables(
c("Datum", "Kommentar", ",", ".", "Veroeffentlichung", "BEFRAGTE", "DATUM",
"PARTY", "PERCENT", "V1", "V11", "V12", "ZEITRAUM", "total"))
"PARTY", "PERCENT", "V1", "V11", "V12", "ZEITRAUM", "total", "VOTES"))
}

invisible()
Expand Down
Loading

0 comments on commit a14aac7

Please sign in to comment.