diff --git a/model-2pp/model-2pp.R b/model-2pp/model-2pp.R index 2f7eb7d..77bd8a3 100644 --- a/model-2pp/model-2pp.R +++ b/model-2pp/model-2pp.R @@ -31,12 +31,12 @@ sample_sizes <- ozpolls_2016 %>% sample_sizes <- sample_sizes %>% mutate(ss = ifelse(is.na(ss), min(sample_sizes$ss, na.rm = TRUE), ss)) -all_firms <- ozpolls %>% +all_firms_index <- ozpolls %>% filter(!firm %in% c("Election result") & firm %in% firms_today) %>% - pull(firm) %>% - unique() %>% - as.character() + select(firm) %>% + distinct() %>% + mutate(firm_idx = as.numeric(factor(firm))) # 2 party preferred vote - both elections and polls: data_2pp <- ozpolls %>% @@ -64,10 +64,10 @@ elections_2pp <- data_2pp %>% election_days <- c(pull(elections_2pp, n_days), next_election - first_election) election_results <- pull(elections_2pp, intended_vote) -# Data for one pollster as a time, elements in a list for ease of use: -one_pollster <- lapply(all_firms, function(x){ - filter(data_2pp, firm == x) -}) +# Polling data +polls_2pp <- data_2pp %>% + left_join(all_firms_index, by = "firm") %>% + filter(!is.na(firm_idx)) # Put all the data into a single list we can pass to Stan: model_data <- list( @@ -78,40 +78,13 @@ model_data <- list( # This inflator doubles the variance of polls; ie allows for non-sampling error inflator =sqrt(2), - y1_values = one_pollster[[1]]$intended_vote, - y1_days = one_pollster[[1]]$n_days, - y1_n = nrow(one_pollster[[1]]), - y1_se = one_pollster[[1]]$se[1], - - y2_values = one_pollster[[2]]$intended_vote, - y2_days = one_pollster[[2]]$n_days, - y2_n = nrow(one_pollster[[2]]), - y2_se = one_pollster[[2]]$se[1], - - y3_values = one_pollster[[3]]$intended_vote, - y3_days = one_pollster[[3]]$n_days, - y3_n = nrow(one_pollster[[3]]), - y3_se = one_pollster[[3]]$se[1], - - y4_values = one_pollster[[4]]$intended_vote, - y4_days = one_pollster[[4]]$n_days, - y4_n = nrow(one_pollster[[4]]), - y4_se = one_pollster[[4]]$se[1], - - y5_values = one_pollster[[5]]$intended_vote, - y5_days = one_pollster[[5]]$n_days, - y5_n = nrow(one_pollster[[5]]), - y5_se = one_pollster[[5]]$se[1], - - y6_values = one_pollster[[6]]$intended_vote, - y6_days = one_pollster[[6]]$n_days, - y6_n = nrow(one_pollster[[6]]), - y6_se = one_pollster[[6]]$se[1], + n_polling_firms = nrow(all_firms_index), - y7_values = one_pollster[[7]]$intended_vote, - y7_days = one_pollster[[7]]$n_days, - y7_n = nrow(one_pollster[[7]]), - y7_se = one_pollster[[7]]$se[1] + polls_n = nrow(polls_2pp), + polls_firm_idx = polls_2pp$firm_idx, + polls_intended_vote = polls_2pp$intended_vote, + polls_se = polls_2pp$se, + polls_day = polls_2pp$n_days ) diff --git a/model-2pp/model-2pp.stan b/model-2pp/model-2pp.stan index ef1e45b..2f5a729 100644 --- a/model-2pp/model-2pp.stan +++ b/model-2pp/model-2pp.stan @@ -7,48 +7,19 @@ data { real election_results[number_elections - 1]; // historical election results real inflator; // amount by which to multiply the standard error of polls - // note - pollsters are individually hard coded in to avoid having to use some kind of ragged array: + int n_polling_firms; // number of polling firms - int y1_n; // number of polls conducted by pollster 1 - real y1_values[y1_n]; // actual values in polls for pollster 1 - int y1_days[y1_n]; // the number of days since first election each poll was taken - real y1_se; // the standard error for each party from pollster 1 (note sometimes made up) - - int y2_n; - real y2_values[y2_n]; - int y2_days[y2_n]; - real y2_se; - - int y3_n; - real y3_values[y3_n]; - int y3_days[y3_n]; - real y3_se; - - int y4_n; - real y4_values[y4_n]; - int y4_days[y4_n]; - real y4_se; - - int y5_n; - real y5_values[y5_n]; - int y5_days[y5_n]; - real y5_se; - - int y6_n; - real y6_values[y6_n]; - int y6_days[y6_n]; - real y6_se; - - int y7_n; - real y7_values[y7_n]; - int y7_days[y7_n]; - real y7_se; + int polls_n; // number of polls + int polls_firm_idx[polls_n]; // index of firm + real polls_intended_vote[polls_n]; // actual values in polls + real polls_se[polls_n]; // the standard error for each party (note sometimes made up) + int polls_day[polls_n]; // the number of days since first election each poll was taken } parameters { vector[election_days[number_elections]] mu; // - real d[7]; // polling effects + real d[n_polling_firms]; // polling effects real sigma; // sd of innovations } @@ -71,13 +42,9 @@ model { // 2. Polls - y1_values ~ normal(mu[y1_days] + d[1], y1_se * inflator); - y2_values ~ normal(mu[y2_days] + d[2], y2_se * inflator); - y3_values ~ normal(mu[y3_days] + d[3], y3_se * inflator); - y4_values ~ normal(mu[y4_days] + d[4], y4_se * inflator); - y5_values ~ normal(mu[y5_days] + d[5], y5_se * inflator); - y6_values ~ normal(mu[y6_days] + d[6], y6_se * inflator); - y7_values ~ normal(mu[y7_days] + d[7], y7_se * inflator); - + for(ii in 1:polls_n){ + polls_intended_vote[ii] ~ normal(mu[polls_day[ii]] + d[polls_firm_idx[ii]], polls_se[ii] * inflator); + } + } diff --git a/model-2pp/model-interpretation.R b/model-2pp/model-interpretation.R index c3acf9a..0e71673 100644 --- a/model-2pp/model-interpretation.R +++ b/model-2pp/model-interpretation.R @@ -5,9 +5,12 @@ load("output/model_2pp_2019051801014.rdata") # Polling firms d <- as.data.frame(extract(model_2pp, "d")$d) -names(d) <- all_firms + pd <- d %>% - gather(firm, overestimate) %>% + gather(firm_idx, overestimate) %>% + mutate(firm_idx = as.numeric(str_replace_all(firm_idx, "V", ""))) %>% + left_join(all_firms_index, + by = "firm_idx") %>% mutate(firm = fct_reorder(firm, overestimate)) %>% ggplot(aes(x = overestimate, colour = firm, fill = firm)) + # facet_wrap(~firm) +