diff --git a/.lintr b/.lintr new file mode 100644 index 0000000..997b47c --- /dev/null +++ b/.lintr @@ -0,0 +1,5 @@ +linters: with_defaults(line_length_linter(120), object_usage_linter = NULL, closed_curly_linter = NULL, open_curly_linter = NULL, spaces_left_parentheses_linter = NULL) +exclusions: list("R/RcppExports.R") +exclude: "# Exclude Linting" +exclude_start: "# Begin Exclude Linting" +exclude_end: "# End Exclude Linting" diff --git a/DESCRIPTION b/DESCRIPTION index f739244..c3bf5dc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,24 +1,35 @@ Package: wmf Type: Package Title: R Code for Wikimedia Foundation Internal Usage -Version: 0.2.6 -Date: 2017-01-23 +Version: 0.3.0 +Date: 2017-08-04 Authors@R: c( - person("Oliver", "Keyes", role = "aut"), - person("Mikhail", "Popov", email = "mpopov@wikimedia.org", role = c("aut", "cre"))) -Description: More about what it does (maybe more than one line). + person("Mikhail", "Popov", email = "mikhail@wikimedia.org", role = c("aut", "cre")), + person("Oliver", "Keyes", role = "aut", comment = "No longer employed at the Foundation"), + person("Chelsy", "Xie", email = "cxie@wikimedia.org", role = "ctb") + ) +Description: This package contains functions made for Analysts at Wikimedia + Foundation, but can be used by people outside of the Foundation. License: Uhhhhh -LazyData: TRUE +URL: https://phabricator.wikimedia.org/diffusion/1821/ +BugReports: https://phabricator.wikimedia.org/maniphest/task/create/? + projects=Discovery-Analysis Imports: RMySQL, + Rcpp, urltools, pwr, - ggplot2, readr, - ggthemes -URL: https://phabricator.wikimedia.org/diffusion/1821/ -BugReports: https://phabricator.wikimedia.org/maniphest/task/create/? - projects=Discovery-Analysis + ggplot2, + ggthemes, + lubridate, + progress Suggests: + lintr, testthat -RoxygenNote: 5.0.1 +LinkingTo: + Rcpp, + BH +LazyData: TRUE +Roxygen: list(markdown = TRUE) +RoxygenNote: 6.0.1 diff --git a/NAMESPACE b/NAMESPACE index 4a137c4..b2754ea 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,37 +1,34 @@ # Generated by roxygen2: do not edit by hand export(build_query) +export(chisq_test_effect) +export(chisq_test_odds) export(date_clause) +export(exact_binom) export(from_log) export(from_mediawiki) export(get_logfile) export(global_query) +export(interleaved_preference) +export(interleaved_sample_size) export(mysql_close) export(mysql_connect) export(mysql_disconnect) export(mysql_exists) export(mysql_read) export(mysql_write) export(query_hive) export(read_sampled_log) export(rewrite_conditional) -export(sample_size_effect) -export(sample_size_odds) export(set_proxies) export(theme_fivethirtynine) export(to_log) export(to_mediawiki) export(write_conditional) import(ggplot2) import(ggthemes) +importFrom(Rcpp,sourceCpp) importFrom(pwr,pwr.chisq.test) importFrom(readr,read_tsv) importFrom(urltools,url_decode) -importMethodsFrom(RMySQL,dbClearResult) -importMethodsFrom(RMySQL,dbConnect) -importMethodsFrom(RMySQL,dbDisconnect) -importMethodsFrom(RMySQL,dbExistsTable) -importMethodsFrom(RMySQL,dbListResults) -importMethodsFrom(RMySQL,dbSendQuery) -importMethodsFrom(RMySQL,dbWriteTable) -importMethodsFrom(RMySQL,fetch) +useDynLib(wmf) diff --git a/NEWS.md b/NEWS.md index a414f5d..b267be2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,38 +1,46 @@ +wmf 0.3.0 +========= +* C++-based `exact_binomial()` to estimate sample size for exact binomial tests +* Functions for working with interleaved search results experiments; see `?interleaved` for details +* ggplot themes `theme_min()` and `theme_facet()` +* Documentation updates +* Syntax-checking unit test + wmf 0.2.6 ========= * Adds support for more MySQL config filenames since those vary between the different machines * Smarter about choosing a config file wmf 0.2.5 ========= * Fixes Hive query execution to remove messages/warnings. wmf 0.2.4 ========= * Ungroups grouped data frames when rewriting. See [T146422](https://phabricator.wikimedia.org/T146422) for more details. wmf 0.2.3 ========= * Fixes ggplot2 theme margin bug [discovered & fixed](https://github.com/wikimedia/wikimedia-discovery-wmf/pull/1) by Oliver Keyes. wmf 0.2.2 ========= * Updates `query_hive()` to support [JAR path overriding](https://wikitech.wikimedia.org/wiki/Analytics/Cluster/Hive/QueryUsingUDF#Testing_changes_to_existing_udf) * Updates the MySQL config file path so the package can now also be used on stat1003 * Updates maintainer contact info in README wmf 0.2.1 ========= * Adds a Contributor Code of Conduct wmf 0.2.0 ========= * Adds compatibility with RMySQL 0.9.4+ wmf 0.1.1 ========= * Fix a bug in global_query wmf 0.1.0 ========= Initial release diff --git a/R/RcppExports.R b/R/RcppExports.R new file mode 100644 index 0000000..3bb5802 --- /dev/null +++ b/R/RcppExports.R @@ -0,0 +1,27 @@ +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#' @title Sample size for exact, one sample binomial test +#' @description Estimates sample size required to detect difference from a +#' constant proportion. +#' @param constant_prop The proportion under the null hypothesis. +#' @param effect_size Positive size of the difference between your null +#' hypothesis and the alternative hypothesis that you hope to detect. +#' **Heads-up** that values less than 1\% might take a while to calculate. +#' @param alpha Probability of rejecting the null hypothesis even though it is +#' true. +#' @param power Probability of rejecting the null hypothesis (getting a +#' significant result) when the real difference is equal to the minimum +#' effect size. +#' @param two_tail Whether to perform two-tail or one-tail power analysis. +#' `TRUE` (default) tests in both directions of difference. +#' @examples +#' exact_binom(0.75, 0.03) +#' @references [Power analysis](http://www.biostathandbook.com/power.html) and +#' [Exact test of goodness-of-fit](http://www.biostathandbook.com/exactgof.html) from +#' John H. McDonald's [_Handbook of Biological Statistics_](http://www.biostathandbook.com/) +#' @export +exact_binom <- function(constant_prop, effect_size, alpha = 0.05, power = 0.8, two_tail = TRUE) { + .Call('_wmf_exact_binom', PACKAGE = 'wmf', constant_prop, effect_size, alpha, power, two_tail) +} + diff --git a/R/chisq_test.R b/R/chisq_test.R new file mode 100644 index 0000000..90aa442 --- /dev/null +++ b/R/chisq_test.R @@ -0,0 +1,163 @@ +oddsRatio <- function(p_treatment, p_control) { + return((p_treatment / (1 - p_treatment)) / (p_control / (1 - p_control))) +} +pTreatment <- function(p_control, odds_ratio) { + return((odds_ratio * p_control) / ((p_control * (odds_ratio - 1)) + 1)) +} +pControl <- function(p_treatment, odds_ratio) { + return(1 / ((odds_ratio * ((1 / p_treatment) - 1)) + 1)) +} + +#' @title Calculate sample size for chi-squared test of indep. given odds ratio +#' @param odds_ratio The expected odds ratio. That is, the ratio of the odds of +#' the outcome in the test group relative to the control group. Optional, +#' but see *Details*. +#' @param p_control Your guess for prevalence of outcome in the control group. +#' Optional but see **Details**. +#' @param p_treatment Your guess for prevalence of outcome in the test group. +#' Optional but see **Details**. +#' @param power The ability of the test to detect an effect where there is one. +#' Power = 1 - Prob(Type 2 error). Optional. See **Value** for details. +#' @param conf_level Desired confidence level. Defaults to 95\%. +#' @param sample_ratio Ratio of test group to control group. 1 is even split. +#' @param visualize Whether to plot power or prevalence of outcome in the +#' control group vs sample size. Can be used to help make a decision. +#' @details The function only needs to know two of the following three: +#' `odds_ratio`, `p_control`, and `p_treatment`. If given +#' all three, it will check to make sure the odds ratio is correct. It +#' will figure out the missing third value from the other two. +#' @return If `power` was not provided, returns vector containing +#' possible power values and the appropriate sample size for each \%. +#' If all values were provided, returns a single sample size estimate. +#' @section References: +#' Wang, H., Chow, S.-C., & Li, G. (2002). On sample size calculation based on +#' odds ratio in clinical trials. *Journal of Biopharmaceutical +#' Statistics*, **12**(4), 471–483. +#' [doi:10.1081/BIP-120016231](http://doi.org/10.1081/BIP-120016231) +#' @examples +#' chisq_test_odds(p_treatment = 0.4, p_control = 0.25, power = 0.8) +#' chisq_test_odds(odds_ratio = 2, p_control = 0.4, power = c(0.8, 0.9, 0.95)) +#' chisq_test_odds(odds_ratio = 2, p_control = 0.4) +#' chisq_test_odds(odds_ratio = 2, p_control = 0.4, visualize = TRUE) +#' @author Mikhail Popov +#' @seealso [chisq_test_effect()] +#' @export +chisq_test_odds <- function( + odds_ratio = NULL, + p_control = NULL, + p_treatment = NULL, + power = NULL, + conf_level = 0.95, + sample_ratio = 1, + visualize = FALSE +) { + # Begin Exclude Linting + # Checks + power_missing <- is.null(power) + pC_missing <- is.null(p_control) + pT_missing <- is.null(p_treatment) + oR_missing <- is.null(odds_ratio) + if ( (oR_missing + pC_missing + pT_missing) > 1) { + stop("Only one of {odds_ratio, p_control, p_treatment} can be missing.") + } + # Imputations (Part 1) + if (power_missing) { + power <- seq(0.5, 0.99, 0.01) + } + # Imputations (Part 2) + if (pC_missing) { + p_control <- pControl(p_treatment, odds_ratio) + } else if (pT_missing) { + p_treatment <- pTreatment(p_control, odds_ratio) + } else if (oR_missing) { + odds_ratio <- oddsRatio(p_treatment, p_control) + } + # End Exclude Linting + + # Calculations + x <- p_treatment * (1 - p_treatment) * sample_ratio + y <- p_control * (1 - p_control) + z_alpha <- qnorm((1 - conf_level) / 2) + z_beta <- qnorm(1 - power) + n_b <- (1 / x + 1 / y) * (((z_alpha + z_beta) ^ 2) / (log(odds_ratio) ^ 2)) + n_a <- sample_ratio * n_b + n <- ceiling(n_a + n_b) + + # Visualization + if (visualize) { + if (power_missing || length(power) > 1) { + plot( + power, n, type = "l", + main = "Sample size as function of statistical power", + ylab = "N", xlab = "Power to detect effect", + lwd = 2, xaxt = "n" + ) + axis( + side = 1, at = seq(0.5, 1, 0.1), + labels = sprintf("%.0f%%", 100 * seq(0.5, 1, 0.1)) + ) + abline(v = seq(0.5, 1, 0.1), lty = "dotted", col = "lightgray", lwd = par("lwd")) + } else { + warning("All parameters known. Nothing to visualize.") + } + } + + # Output + if (power_missing || length(power) > 1) { + names(n) <- sprintf("%.0f%%", power * 100) + } + return(n) + +} + +#' @title Calculate sample size for chi-squared test of indep. given Cohen's w +#' @description Uses Cohen's w for effect size to calculate sample size for +#' a chi-squared test of independence. +#' @param w Effect size you want the test to be able to detect. (Optional) +#' @param groups Number of groups. Used in degrees of freedom calculation. +#' Defaults to 2 (e.g. control group vs treatment group). +#' @param sig_level Probability of Type 1 error. Usually called alpha. +#' Defaults to 0.05. +#' @param power Ability to detect the effect. (1 - probability of Type 2 error) +#' Defaults to 80\%. +#' @return If `w` was not provided, returns a data frame containing +#' possible values of w and the corresponding sample size estimates. +#' @examples +#' chisq_test_effect() +#' chisq_test_effect(0.1) +#' chisq_test_effect(w = 0.1, groups = 3, sig_level = 0.001, power = 0.9) +#' @importFrom pwr pwr.chisq.test +#' @author Mikhail Popov +#' @seealso [chisq_test_odds()] +#' @export +chisq_test_effect <- function( + w = NULL, + groups = 2, + sig_level = 0.05, + power = 0.8 +) { + # Checks + w_missing <- is.null(w) + if (!w_missing && w <= 0.01) stop("'w' must be > 0.01") + if (power <= 0.1 || power > 1.0) stop("'power' must be in (0.1, 1]") + + # Imputation + if (w_missing) w <- c(0.05, 0.1, 0.3, 0.5) + + # Calculation and output + if (length(w) > 1) { + n <- ceiling(vapply(w, function(ww) { + return(pwr::pwr.chisq.test( + w = ww, N = NULL, df = groups - 1, + sig.level = sig_level, power = power + )$N) + }, 0)) + names(n) <- c("tiny", "small", "medium", "large") + } else { + n <- ceiling(pwr::pwr.chisq.test( + w = w, N = NULL, df = groups - 1, + sig.level = sig_level, power = power + )$N) + } + return(n) +} diff --git a/R/dataviz.R b/R/dataviz.R index e52d156..296d735 100644 --- a/R/dataviz.R +++ b/R/dataviz.R @@ -1,38 +1,83 @@ -#'@title Theme inspired by fivethirtyeight.com plots -#'@description A modification of \code{ggthemes::theme_fivethirtyeight} -#' -#'@param base_size base font size -#'@param base_family base font family -#' -#'@details Basically it adds axis titles (with some modification on the y to -#' allow for long titles) back in and does a small amount of reduction of the -#' overall plot size to avoid an absolute ton of extraneous spacing. -#' -#'@name FiveThirtyNine -#'@rdname FiveThirtyNine -#'@import ggplot2 -#'@import ggthemes -#' -#'@export -#' -theme_fivethirtynine <- function(base_size = 12, base_family = "sans"){ - (theme_foundation(base_size = base_size, base_family = base_family) + - theme(line = element_line(), rect = element_rect(fill = ggthemes:::ggthemes_data$fivethirtyeight["ltgray"], - linetype = 0, colour = NA), - text = element_text(colour = ggthemes:::ggthemes_data$fivethirtyeight["dkgray"], margin = ggplot2::margin(), debug = FALSE), - axis.title.y = element_text(size = rel(2), angle = 90, vjust = 1.5, margin = ggplot2::margin(0, 12), debug = FALSE), - axis.text = element_text(size = rel(1.5)), - axis.title.x = element_text(size = rel(2), margin = ggplot2::margin(12), debug = FALSE), - axis.ticks = element_blank(), axis.line = element_blank(), - legend.background = element_rect(), legend.position = "bottom", - legend.direction = "horizontal", legend.box = "vertical", - panel.grid = element_line(colour = NULL), - panel.grid.major = element_line(colour = ggthemes:::ggthemes_data$fivethirtyeight["medgray"]), - panel.grid.minor = element_blank(), - plot.title = element_text(hjust = 0, size = rel(1.5), face = "bold", margin = ggplot2::margin(), debug = FALSE), - strip.background = element_rect(), - legend.text = element_text(size = 18), legend.title = element_text(size = rel(1.5), margin = ggplot2::margin(4), debug = FALSE), - legend.key.size = unit(1, "in"), - panel.background = element_rect(fill = "transparent", color = NA), - plot.background = element_rect(fill = "transparent", color = NA))) +#' @title Theme inspired by fivethirtyeight.com plots +#' @description A modification of [ggthemes::theme_fivethirtyeight()] +#' @param base_size base font size +#' @param base_family base font family +#' @details Basically it adds axis titles (with some modification on the y to +#' allow for long titles) back in and does a small amount of reduction of the +#' overall plot size to avoid an absolute ton of extraneous spacing. +#' @name FiveThirtyNine +#' @rdname FiveThirtyNine +#' @import ggplot2 +#' @import ggthemes +#' @author Oliver Keyes +#' @export +theme_fivethirtynine <- function(base_size = 12, base_family = "sans") { + theme_foundation(base_size = base_size, base_family = base_family) + + theme( + line = element_line(), + rect = element_rect( + fill = ggthemes:::ggthemes_data$fivethirtyeight["ltgray"], + linetype = 0, colour = NA), + text = element_text( + colour = ggthemes:::ggthemes_data$fivethirtyeight["dkgray"], + margin = ggplot2::margin(), debug = FALSE + ), + axis.title.y = element_text( + size = rel(2), angle = 90, vjust = 1.5, + margin = ggplot2::margin(0, 12), + debug = FALSE + ), + axis.text = element_text(size = rel(1.5)), + axis.title.x = element_text(size = rel(2), margin = ggplot2::margin(12), debug = FALSE), + axis.ticks = element_blank(), axis.line = element_blank(), + legend.background = element_rect(), legend.position = "bottom", + legend.direction = "horizontal", legend.box = "vertical", + panel.grid = element_line(colour = NULL), + panel.grid.major = element_line(colour = ggthemes:::ggthemes_data$fivethirtyeight["medgray"]), + panel.grid.minor = element_blank(), + plot.title = element_text(hjust = 0, size = rel(1.5), face = "bold", margin = ggplot2::margin(), debug = FALSE), + strip.background = element_rect(), + legend.text = element_text(size = 18), + legend.title = element_text(size = rel(1.5), margin = ggplot2::margin(4), debug = FALSE), + legend.key.size = unit(1, "in"), + panel.background = element_rect(fill = "transparent", color = NA), + plot.background = element_rect(fill = "transparent", color = NA) + ) +} + +#' @title Simple theme for ggplots +#' @param base_size font size +#' @param base_family font family +#' @author Mikhail Popov +theme_min <- function(base_size = 12, base_family = "", ...) { + ggplot2::theme_minimal(base_size, base_family) + + ggplot2::theme( + legend.position = "bottom", + strip.placement = "outside", + ... + ) +} + +#' @title Simple theme for facet-ed ggplots +#' @param base_size font size +#' @param base_family font family +#' @param border whether to add a border around facets +#' @param clean_xaxis whether to remove ticks & labels from x-axis +#' @author Mikhail Popov & Chelsy Xie +theme_facet <- function(base_size = 12, base_family = "", border = TRUE, clean_xaxis = FALSE, ...) { + theme <- theme_min(base_size, base_family, ...) + + ggplot2::theme(strip.background = element_rect(fill = "gray90")) + if (border) { + theme <- theme + ggplot2::theme(panel.border = element_rect(color = "gray30", fill = NA)) + } + if (clean_xaxis) { + theme <- theme + + ggplot2::theme( + panel.grid.major.x = element_blank(), + panel.grid.minor.x = element_blank(), + axis.ticks.x = element_blank(), + axis.text.x = element_blank() + ) + } + return(theme) } diff --git a/R/global.R b/R/global.R index 67cda4e..e6f0cd0 100644 --- a/R/global.R +++ b/R/global.R @@ -1,68 +1,43 @@ -#'@title -#'global SQL queries for analytics-store.eqiad.wmnet -#' -#'@description -#'\code{global_query} is a simple wrapper around the mysql queries that allows a useR to send a query to all production -#'dbs on analytics-store.eqiad.wmnet, joining the results from each query into a single object. -#' -#'@param query the SQL query you want to run -#' -#'@param project_type what class of wiki (wikisource, wiktionary..) you want to run against. Set to "all" by default. -#' -#'@author Oliver Keyes -#' -#'@seealso \code{\link{mysql_read}} for querying an individual db, \code{\link{mw_strptime}} -#'for converting MediaWiki timestamps into POSIXlt timestamps, or \code{\link{hive_query}} for -#'accessing the Hive datastore. -#' -#'@export - -global_query <- function(query, project_type = "all"){ - - #Construct the query - if(!project_type == "all"){ - - info_query <- paste("SELECT wiki FROM wiki_info WHERE code = '",project_type,"'", sep = "") - +#' @title Global SQL queries for analytics-store.eqiad.wmnet +#' @description `global_query` is a simple wrapper around the MySQL queries +#' that allows a useR to send a query to all production dbs on +#' analytics-store.eqiad.wmnet, joining the results from each query into a +#' single object. +#' @param query the SQL query you want to run +#' @param project_type what class of wiki (e.g. "wikisource", "wiktionary") +#' you want to run against. Set to "all" by default. +#' @author Oliver Keyes +#' @seealso +#' [mysql_read] for querying an individual db, +#' [mw_strptime] for converting MediaWiki timestamps into `POSIXlt` timestamps, +#' or [hive_query] for accessing the Hive datastore +#' @export +global_query <- function(query, project_type = "all") { + # Construct the query + if (!project_type == "all") { + info_query <- paste("SELECT wiki FROM wiki_info WHERE code = '", project_type, "'", sep = "") } else { - info_query <- "SELECT wiki FROM wiki_info" - } - - #Run query - wikis <- mysql_read(query = info_query, db = "staging")$wiki - - #Instantiate progress bar and note environment - env <- environment() - progress <- txtProgressBar(min = 0, max = (length(wikis)), initial = 0, style = 3) - - #Retrieve data - data <- lapply(wikis, function(x, query){ - - #Retrieve the data - data <- mysql_read(query = query, db = x) - - if(nrow(data) > 0){ - - #Add the wiki - data$project <- x - + # Run query + wikis <- mysql_read(query = info_query, database = "staging")$wiki + # Instantiate progress bar and note environment + pb <- progress::progress_bar$new(total = length(wikis)) + # Retrieve data + data <- lapply(wikis, function(x, query) { + # Retrieve the data + data <- mysql_read(query = query, database = x) + if (nrow(data) > 0) { + data$project <- x # Add the wiki } else { - data <- NULL - } - - #Increment the progress bar - setTxtProgressBar(get("progress",envir = env),(progress$getVal()+1)) - - #Return + # Increment the progress bar + pb$tick() + # Return return(data) - }, query = query) cat("\n") - - #Bind it into a single object and return + # Bind it into a single object and return return(do.call(what = "rbind", args = data)) -} \ No newline at end of file +} diff --git a/R/hive.R b/R/hive.R index 208b945..a6c206b 100644 --- a/R/hive.R +++ b/R/hive.R @@ -1,92 +1,80 @@ -#'@title hive_query -#'@details Hive querying function -#'@description this is the "old" hive querying function - it's deprecated as all hell and waiting -#'until Andrew sticks the hive server on a dedicated and more powerful machine. -#' -#'@param query a Hive query -#'@param override_jars A logical flag indicating whether to override the path. -#' Hive on WMF's analytics machine(s) loads some JARs by default, so if your -#' query uses an updated version of an existing UDF and you want to load the -#' JAR that you built yourself, set this to TRUE. See -#' \href{https://wikitech.wikimedia.org/wiki/Analytics/Cluster/Hive/QueryUsingUDF#Testing_changes_to_existing_udf}{this section} -#' for more details. -#' -#'@section escaping: -#'\code{hive_query} works by running the query you provide through the CLI via a system() call. -#'As a result, single escapes for meaningful characters (such as quotes) within the query will not work: -#'R will interpret them only as escaping that character /within R/. Double escaping (\\\) is thus necessary, -#'in the same way that it is for regular expressions. -#' -#'@return a data.frame containing the results of the query, or a boolean TRUE if the user has chosen -#'to write straight to file. -#' -#'@section handling our hadoop/hive setup: -#' -#'The \code{webrequests} table is documented -#'\href{https://wikitech.wikimedia.org/wiki/Analytics/Cluster/Hive}{on Wikitech}, which also provides -#'\href{https://wikitech.wikimedia.org/wiki/Analytics/Cluster/Hive/Queries}{a set of example -#'queries}. -#' -#'When it comes to manipulating the rows with Java before they get to you, Nuria has written a -#'\href{https://wikitech.wikimedia.org/wiki/Analytics/Cluster/Hive/QueryUsingUDF}{brief tutorial on loading UDFs} -#'which should help if you want to engage in that; the example provided is a user agent parser, allowing you to -#'get the equivalent of \code{\link{ua_parse}}'s output further upstream. -#'@seealso \code{\link{log_strptime}} for converting the "dt" column in the webrequests table to POSIXlt, -#'and \code{\link{mysql_query}} and \code{\link{global_query}} for querying our MySQL databases. -#' -#'@examples -#'\dontrun{ -#'query_hive("USE wmf; DESCRIBE webrequest;") -#'} -#' -#'@export +#' @title Query Hadoop cluster with Hive +#' @description Queries Hive +#' @param query a Hive query +#' @param override_jars A logical flag indicating whether to override the path. +#' Hive on WMF's analytics machine(s) loads some JARs by default, so if your +#' query uses an updated version of an existing UDF and you want to load the +#' JAR that you built yourself, set this to `TRUE`. See +#' [Testing changes to existing UDF](https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive/QueryUsingUDF#Testing_changes_to_existing_udf) +#' for more details. +#' @section escaping: +#' `hive_query` works by running the query you provide through the CLI via a +#' [system()] call. As a result, single escapes for meaningful characters +#' (such as quotes) within the query will not work: R will interpret them +#' only as escaping that character /within R/. Double escaping (\\\) is thus +#' necessary, in the same way that it is for regular expressions. +#' @return a `data.frame` containing the results of the query, or a `TRUE` if +#' the user has chosen to write straight to file. +#' @section Handling our hadoop/hive setup: +#' The `webrequests` table is documented [on Wikitech](https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive), +#' which also provides [a set of example queries](https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive/Queries). When it comes to manipulating the rows with Java before they get to you, Nuria has written a +#' [brief tutorial on loading UDFs](https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive/QueryUsingUDF) +#' which should help if you want to engage in that. +#' @seealso [log_strptime()] or [lubridate::ymd_hms()] for converting the "dt" +#' column in the webrequests table to proper datetime, and [mysql_query()] +#' and [global_query()] for querying our MySQL databases +#' @examples +#' \dontrun{ +#' query_hive("USE wmf; DESCRIBE webrequest;") +#' } +#' @export query_hive <- function(query, override_jars = FALSE) { # Write query out to tempfile and create tempfile for results. query_dump <- tempfile() cat(query, file = query_dump) results_dump <- tempfile() - filters <- paste0(c("", paste("grep -v", c("JAVA_TOOL_OPTIONS", "parquet.hadoop", "WARN:", ":WARN"))), collapse = " | ") + filters <- paste0( + c("", paste("grep -v", c("JAVA_TOOL_OPTIONS", "parquet.hadoop", "WARN:", ":WARN"))), + collapse = " | " + ) # Query and read in the results try({ system( paste0("export HADOOP_HEAPSIZE=1024 && hive -S ", ifelse(override_jars, "--hiveconf hive.aux.jars.path= ", ""), "-f ", query_dump, " 2> /dev/null", filters, " > ", results_dump) ) results <- read.delim(results_dump, sep = "\t", quote = "", as.is = TRUE, header = TRUE) }) # Clean up and return file.remove(query_dump, results_dump) stop_on_empty(results) return(results) } -#'@title Generate a Date Clause for a Hive query -#'@description what it says on the tin; generates a "WHERE year = foo AND month = bar" using lubridate -#'that can then be combined with other elements to form a Hive query. -#' -#'@param date the date to use. If NULL, yesterday will be used. -#' -#'@return a list containing two elements, "date_clause" and "date"; the returning of -#'the date allows you to include it with. -#' -#'@export +#' @title Generate a Date Clause for a Hive query +#' @description What it says on the tin; generates a +#' `WHERE year = foo AND month = bar` +#' that can then be combined with other elements to form a Hive query. +#' @param date if `NULL`, yesterday will be used +#' @return a list containing two elements: "date_clause" and "date"; the +#' returning of the date allows you to include it +#' @export date_clause <- function(date) { if (is.null(date)) { date <- Sys.Date() - 1 } - - split_date <- unlist(strsplit(as.character(date), "-")) - - fragment <- (paste("WHERE year =", split_date[1], - "AND month =",split_date[2], - "AND day =", split_date[3], "")) - + fragment <- sprintf( + "WHERE year = %s AND month = %s AND day = %s ", + lubridate::year(date), + lubridate::month(date), + lubridate::mday(date) + ) output <- list(date_clause = fragment, date = date) return(output) -} \ No newline at end of file +} diff --git a/R/interleaved.R b/R/interleaved.R new file mode 100644 index 0000000..75448c7 --- /dev/null +++ b/R/interleaved.R @@ -0,0 +1,167 @@ +#' @title Interleaved search results +#' @description Tools for analysis of experiments that use interleaved search +#' results wherein users receive results from multiple sets of retrieval +#' functions. +#' - `interleaved_data` is a fake dataset used for testing and examples; +#' refer to **Format** section below +#' - `interleaved_data_a` is a fake dataset used for testing and examples; +#' "A" is preferred over "B" +#' - `interleaved_data_b` is a fake dataset used for testing and examples; +#' "B" is preferred over "A" +#' - `interleaved_preference` returns a test statistic summarizing the +#' interleaving experiment; a positive value indicates that A is better +#' than B, a negative value indicates that B is better than A +#' - `interleaved_sample_size()` estimates the sample size required to detect +#' a particular effect size with a specified power and significance level +#' @references +#' - Chapelle, O., Joachims, T., Radlinski, F., & Yue, Y. (2012). Large-scale +#' validation and analysis of interleaved search evaluation. +#' *ACM Transactions on Information Systems*, **30**(1), 1–41. +#' [doi:10.1145/2094072.2094078](https://doi.org/10.1145/2094072.2094078) +#' - Radlinski, F. and Craswell, N. (2013). [Optimized interleaving for online retrieval evaluation](https://www.microsoft.com/en-us/research/publication/optimized-interleaving-for-online-retrieval-evaluation/). +#' *ACM International Conference on Web Search and Data Mining (WSDM)*. +#' [doi:10.1145/2433396.2433429](https://doi.org/10.1145/2433396.2433429) +#' @name interleaved +NULL + +#' @param sessions vector of session IDs used to group `positions` and +#' `ranking_functions` +#' @param ranking_functions vector that shows which ranking function the +#' clicked search result came from ("A" or "B") +#' @examples +#' # Preference statistic calculation: +#' \dontrun{ +#' ## Data without a clear preference: +#' data("interleaved_data", package = "wmf") +#' x <- interleaved_data[interleaved_data$event == "click", ] +#' x <- x[order(x$session_id, x$timestamp), ] +#' interleaved_preference(x$session_id, x$ranking_function) +#' +#' ## Data where A is preferred over B: +#' data("interleaved_data_a", package = "wmf") +#' x <- interleaved_data_a[interleaved_data_a$event == "click", ] +#' x <- x[order(x$session_id, x$timestamp), ] +#' interleaved_preference(x$session_id, x$ranking_function) +#' +#' ## Data where B is preferred over A: +#' data("interleaved_data_b", package = "wmf") +#' x <- interleaved_data_b[interleaved_data_b$event == "click", ] +#' x <- x[order(x$session_id, x$timestamp), ] +#' interleaved_preference(x$session_id, x$ranking_function) +#' } +#' @rdname interleaved +#' @export +interleaved_preference <- function(sessions, ranking_functions) { + preferences <- table(vapply(split(ranking_functions, sessions), function(clicks) { + counts <- table(clicks) + if (sum(counts == max(counts)) > 1) { + return("tie") + } else { + return(names(counts)[which.max(counts)]) + } + }, "")) + wins <- as.numeric(preferences) + names(wins) <- names(preferences) + if (!"tie" %in% names(wins)) { + wins <- c(wins, c(tie = 0)) + } + numer <- (wins["A"] + wins["tie"] / 2) + denom <- wins["A"] + wins["B"] + wins["tie"] + return(unname((numer / denom) - 0.5)) +} + +#' @param alpha Probability of rejecting the null hypothesis even though it is +#' true. +#' @param power Probability of rejecting the null hypothesis (getting a +#' significant result) when the real difference is equal to the minimum +#' effect size. +#' @examples +#' # Sample size estimation: +#' \dontrun{ +#' data(interleaved_data, package = "wmf") +#' interleaved_sample_size() +#' } +#' @rdname interleaved +#' @export +interleaved_sample_size <- function(alpha = 0.05, power = 0.8) { + return(invisible(NULL)) +} + +fake_interleaved_data <- function(dev = FALSE, n_sessions = 1000, seed = 0) { + set.seed(seed) + fake_timestamps <- function(n) { + return(as.POSIXct( + runif(n, 0, 60 * 10), + origin = "2018-08-01 00:00:00", + tz = "UTC" + )) + } + fake_session <- function(preference = NA) { + n_events <- sample.int(10, 1) + if (n_events == 1) { + return(data.frame( + session_id = paste0(sample(c(letters, 0:9), 10), collapse = ""), + timestamp = fake_timestamps(1), + event = "serp", + position = as.numeric(NA), + ranking_function = as.character(NA), + stringsAsFactors = FALSE + )) + } else { + if (is.na(preference)) { + probability <- c(0.5, 0.5) + } else if (preference == "A") { + probability <- c(0.7, 0.3) + } else { + probability <- c(0.3, 0.7) + } + df <- data.frame( + session_id = rep_len(paste0(sample(c(letters, 0:9), 10), collapse = ""), n_events), + timestamp = sort(fake_timestamps(n_events), decreasing = FALSE), + event = c("serp", rep_len("click", n_events - 1)), + position = c(NA, sample.int(20, n_events - 1, replace = FALSE)), + ranking_function = c(NA, sample(c("A", "B"), n_events - 1, replace = TRUE, prob = probability)), + stringsAsFactors = FALSE + ) + if (n_events %in% c(3, 5, 7, 9) && rbinom(1, 1, 0.005) == 1) { + # 0.5% chance of same number of clicks for A and B + df$ranking_function[df$event == "click"] <- rep_len(c("A", "B"), n_events - 1) + } + return(df) + } + } + message("Generating unbiased data...") + interleaved_data <- do.call(rbind, replicate(n_sessions, fake_session(), simplify = FALSE)) + if (dev) { + devtools::use_data(interleaved_data, overwrite = TRUE) + } + message("Generating A-biased data...") + interleaved_data_a <- do.call(rbind, replicate(n_sessions, fake_session("A"), simplify = FALSE)) + if (dev) { + devtools::use_data(interleaved_data_a, overwrite = TRUE) + } + message("Generating B-biased data...") + interleaved_data_b <- do.call(rbind, replicate(n_sessions, fake_session("B"), simplify = FALSE)) + if (dev) { + devtools::use_data(interleaved_data_b, overwrite = TRUE) + } + if (!dev) { + return(list( + no_preference = interleaved_data, + a_preferred = interleaved_data_a, + b_preferred = interleaved_data_b + )) + } +} + +#' @format `interleaved_data*` are `data.frame`-s of generated search sessions with +#' the following columns: +#' \describe{ +#' \item{session_id}{10-character alphanumeric ID; for grouping events} +#' \item{timestamp}{when the event occurred; uses [POSIXct][base::DateTimeClasses] format} +#' \item{event}{"serp" or "click"} +#' \item{position}{position ("ranking") of the clicked search result} +#' \item{ranking_function}{"A" or "B"} +#' } +#' @rdname interleaved +"interleaved_data" diff --git a/R/logs.R b/R/logs.R index 381512c..21f1309 100644 --- a/R/logs.R +++ b/R/logs.R @@ -1,77 +1,97 @@ -parse_date <- function(date){ +parse_date <- function(date) { return(gsub(x = date, pattern = "-", replacement = "")) } -#'@title retrieve a vector of sampled log files -#'@description Grab sampled log files to be piped into -#'\code{\link{read_sampled_log}}. By default this retrieves all -#'sampled log files; it can be used to retrieve a particular date range of -#'files through the "earliest" and "latest" arguments. -#' -#'@param earliest a "Date" object. Set to NULL by default, which triggers -#'the retrieval of all log file names. -#' -#'@param latest a "Date" object; set to NULL by default. In the event that -#'\code{earliest} is set but \code{latest} is not, the files retrieved -#'will span from \code{earliest} to the current date; in the event that -#'both arguments are set, the retrieved files will be those in that range. -#' -#'@return A vector of filenames that can be passed into \code{\link{read_sampled_log}}. -#'@export +#' @title Retrieve a vector of sampled log files +#' @description Grabs sampled log files to be piped into [read_sampled_log()]. +#' By default this retrieves all sampled log files; it can be used to +#' retrieve a particular date range of files through the "earliest" and +#' "latest" arguments. +#' @param earliest a `Date` object. Set to `NULL` by default, which triggers +#' the retrieval of all log file names. +#' @param latest a `Date` object; set to `NULL` by default. In the event that +#' `earliest` is set but `latest` is not, the files retrieved will span from +#' `earliest` to the current date; in the event that both arguments are set, +#' the retrieved files will be those in that range. +#' @return A vector of filenames that can be passed into [read_sampled_log()] +#' @author Oliver Keyes +#' @export get_logfile <- function(earliest = NULL, latest = NULL){ - files <- list.files("/a/squid/archive/sampled", full.names= TRUE, pattern = "gz$") - if(!is.null(earliest)){ - file_dates <- as.numeric(substring(files,47,55)) - if(!is.null(latest)){ + # Begin Exclude Linting + files <- list.files("/a/squid/archive/sampled", full.names = TRUE, pattern = "gz$") + # End Exclude Linting + if (!is.null(earliest)) { + file_dates <- as.numeric(substring(files, 47, 55)) + if (!is.null(latest)) { files <- files[file_dates >= as.numeric(parse_date(earliest)) & file_dates <= as.numeric(parse_date(latest))] } else { files <- files[file_dates >= as.numeric(parse_date(earliest))] } } return(files) } -#'@title read a sampled log file -#'@description read a sampled log file identified with \code{\link{get_logfile}}. -#'The sampled logs are returned as a data.frame with 16 columns - see -#' the "Value" documentation. -#' -#'@param file a filename, retrieved with \code{\link{get_logfile}} -#'@param transparent a logical flag whether to gunzip the log file explicitly -#' first (default) or read it in directly. -#'@param nrows Number of rows to read in. (Optional) -#' -#'@return a data.frame containing 16 columns - "squid", "sequence_no", "timestamp", -#' "servicetime", "ip_address", "status_code", "reply_size", "request_method", -#' "url", "squid_status", "mime_type", "referer", "x_forwarded", "user_agent", -#' "lang" and "x_analytics". -#' -#'@importFrom urltools url_decode -#'@export +#' @title Read a sampled log file +#' @description Reads a sampled log file identified with [get_logfile()]. +#' The sampled logs are returned as a data.frame with 16 columns - see +#' the **Value** documentation. +#' @param file a filename, retrieved with [get_logfile()] +#' @param transparent a logical flag whether to gunzip the log file explicitly +#' first (default) or read it in directly. +#' @param nrows number of rows to read in; *optional* +#' @return a `data.frame` containing 16 columns: +#' - squid +#' - sequence_no +#' - timestamp +#' - servicetime +#' - ip_address +#' - status_code +#' - reply_size +#' - request_method +#' - url +#' - squid_status +#' - mime_type +#' - referer +#' - x_forwarded +#' - user_agent +#' - lang +#' - x_analytics +#' @importFrom urltools url_decode +#' @author Oliver Keyes +#' @export read_sampled_log <- function(file, transparent = FALSE, nrows = NULL){ is_gzipped <- grepl("gz$", file) - if ( is_gzipped ) { # gzipped log file - if ( transparent ) { # read the file in directly w/o gunzipping first + if (is_gzipped) { # gzipped log file + if (transparent) { # read the file in directly w/o gunzipping first output_file <- file } else { output_file <- tempfile() system(paste("gunzip -c", file, ">", output_file)) } - } else { # an already gunzipped log file + } else { + # an already gunzipped log file output_file <- file } - if ( is.null(nrows) ) nrows = -1 - data <- read.delim(outfile, as.is = TRUE, quote = "", nrows = nrows, - col.names = c("squid","sequence_no", - "timestamp", "servicetime", - "ip_address", "status_code", - "reply_size", "request_method", - "url", "squid_status", - "mime_type", "referer", - "x_forwarded", "user_agent", - "lang", "x_analytics")) - if ( is_gzipped && !transparent ) file.remove(output_file) + if (is.null(nrows)) { + nrows <- -1 + } + data <- read.delim( + output_file, as.is = TRUE, quote = "", nrows = nrows, + col.names = c( + "squid", "sequence_no", + "timestamp", "servicetime", + "ip_address", "status_code", + "reply_size", "request_method", + "url", "squid_status", + "mime_type", "referer", + "x_forwarded", "user_agent", + "lang", "x_analytics" + ) + ) + if (is_gzipped && !transparent) { + file.remove(output_file) + } data$url <- urltools::url_decode(data$url) data$referer <- urltools::url_decode(data$referer) return(data) } diff --git a/R/mysql.R b/R/mysql.R index 0d47e46..1cd72ad 100644 --- a/R/mysql.R +++ b/R/mysql.R @@ -1,182 +1,188 @@ RMySQL_version <- function() { # Returns 93 if the installed version of RMySQL is 0.9.3 return(as.numeric(paste0(unlist(packageVersion("RMySQL")), collapse = ""))) } # Ensure that we recognise and error on 0 rows -stop_on_empty <- function(data){ - if(nrow(data) == 0){ +stop_on_empty <- function(data) { + if (nrow(data) == 0) { stop("No rows were returned from the database") } return(invisible()) } - -#'@title Work with MySQL databases -#'@description Read from, write to, and check data from the MySQL databases and -#' tables in the Wikimedia cluster. Assumes the presence of a validly -#' formatted configuration file. -#' -#'@param query A SQL query. -#' -#'@param database The name of the database to query. -#' -#'@param con A MySQL connection returned by \code{mysql_connect}. -#' Optional -- if not provided, a temporary connection will be opened up. -#' -#'@param table The name of a table to check for the existence of or create, -#' depending on the function. -#' -#'@param ... Further arguments to pass to dbWriteTable. See ?dbWriteTable for more details. -#' -#'@name mysql -#'@rdname mysql -#'@importMethodsFrom RMySQL dbConnect -#' -#'@seealso \code{\link{hive_query}} or \code{\link{global_query}} -#' -#'@export +#' @title Work with MySQL databases +#' @description Read from, write to, and check data from the MySQL databases and +#' tables in the Wikimedia cluster. Assumes the presence of a validly +#' formatted configuration file. +#' @param query SQL query +#' @param database name of the database to query +#' @param con MySQL connection returned by [mysql_connect()]; *optional* -- if +#' not provided, a temporary connection will be opened up +#' @param table name of a table to check for the existence of or create, +#' depending on the function +#' @param ... Further arguments to pass to [RMySQL::dbWriteTable()] +#' @name mysql +#' @rdname mysql +#' @seealso [hive_query()] or [global_query()] +#' @export mysql_connect <- function(database, default_file = NULL) { + # Begin Exclude Linting if (is.null(default_file)) { possible_cnfs <- c( - "analytics-research-client.cnf", # on stat1002 - "stats-research-client.cnf", # on stat1003 + "discovery-stats-client.cnf", # on stat1005 + "statistics-private-client.cnf", # on stat1005 + "analytics-research-client.cnf", # on stat1005 + "stats-research-client.cnf", # on stat1006 "research-client.cnf" # on notebook1001 ) for (cnf in file.path("/etc/mysql/conf.d", possible_cnfs)) { if (file.exists(cnf)) { default_file <- cnf break } } if (is.null(default_file)) { if (dir.exists("/etc/mysql/conf.d")) { cnfs <- dir("/etc/mysql/conf.d", pattern = "*.cnf") if (length(cnfs) == 0) { stop("no credentials found in mysql conf dir") } else { - warning("didn't find any of the specified credentials (", paste0(possible_cnfs, collapse = ", "), "), but going to try this one: ", cnfs[1]) - default_file <- cnfs[1] + stop( + "didn't find any of the specified credentials (", + paste0(possible_cnfs, collapse = ", "), ")" + ) } } else { stop("no configuration directory for mysql credentials") } } } if (RMySQL_version() > 93) { - con <- dbConnect(drv = RMySQL::MySQL(), - host = "analytics-store.eqiad.wmnet", - dbname = database, default.file = default_file) - } else { # Using version RMySQL 0.9.3 or older: - con <- dbConnect(drv = "MySQL", - host = "analytics-store.eqiad.wmnet", - dbname = database, default.file = default_file) + con <- RMySQL::dbConnect( + drv = RMySQL::MySQL(), + host = "analytics-store.eqiad.wmnet", + dbname = database, default.file = default_file + ) + } else { + # Using version RMySQL 0.9.3 or older: + con <- RMySQL::dbConnect( + drv = "MySQL", + host = "analytics-store.eqiad.wmnet", + dbname = database, default.file = default_file + ) } + # End Exclude Linting return(con) } -#'@rdname mysql -#'@importMethodsFrom RMySQL dbSendQuery dbDisconnect dbListResults dbClearResult fetch -#'@export +#' @rdname mysql +#' @export mysql_read <- function(query, database, con = NULL) { already_connected <- !is.null(con) if (!already_connected) { # Open a temporary connection to the db: con <- mysql_connect(database) } - to_fetch <- dbSendQuery(con, query) - data <- fetch(to_fetch, -1) + # Begin Exclude Linting + to_fetch <- RMySQL::dbSendQuery(con, query) + data <- RMySQL::fetch(to_fetch, -1) message(sprintf("Fetched %.0f rows and %.0f columns.", nrow(data), ncol(data))) - dbClearResult(dbListResults(con)[[1]]) + RMySQL::dbClearResult(RMySQL::dbListResults(con)[[1]]) + # End Exclude Linting if (!already_connected) { # Close temporary connection: mysql_close(con) } stop_on_empty(data) return(data) } -#'@rdname mysql -#'@importMethodsFrom RMySQL dbExistsTable dbDisconnect -#'@export +#' @rdname mysql +#' @export mysql_exists <- function(database, table_name, con = NULL) { already_connected <- !is.null(con) if (!already_connected) { # Open a temporary connection to the db: con <- mysql_connect(database) } # Grab the results and close off: - table_exists <- dbExistsTable(conn = con, name = table_name) + # Begin Exclude Linting + table_exists <- RMySQL::dbExistsTable(conn = con, name = table_name) + # End Exclude Linting if (!already_connected) { # Close temporary connection: mysql_close(con) } #Return return(table_exists) } -#'@rdname mysql -#'@importMethodsFrom RMySQL dbWriteTable dbDisconnect -#'@export +#' @rdname mysql +#' @export mysql_write <- function(x, database, table_name, con = NULL, ...){ already_connected <- !is.null(con) if (!already_connected) { # Open a temporary connection to the db: con <- mysql_connect(database) } # Write: - result <- dbWriteTable(conn = con, - name = table_name, - value = x, - row.names = FALSE, - ...) + # Begin Exclude Linting + result <- RMySQL::dbWriteTable( + conn = con, + name = table_name, + value = x, + row.names = FALSE, + ... + ) + # End Exclude Linting if (!already_connected) { # Close temporary connection: mysql_close(con) } # Return the success/failure: return(result) } -#'@rdname mysql -#'@importMethodsFrom RMySQL dbDisconnect -#'@export +#' @rdname mysql +#' @export mysql_close <- function(con) { - dbDisconnect(con) + # Begin Exclude Linting + RMySQL::dbDisconnect(con) + # End Exclude Linting return(invisible()) } -#'@rdname mysql -#'@export + +#' @rdname mysql +#' @export mysql_disconnect <- function(con) { mysql_close(con) } -#'@title Builds a MySQL query aimed at the EventLogging-centric formats -#'@description constructs a MySQL query with a conditional around date. -#'This is aimed at eventlogging, where the date/time is always "timestamp". -#' -#'@param fields the SELECT statement. -#' -#'@param table the table to use. -#' -#'@param date the date to restrict to. If NULL, yesterday will be used. -#' -#'@param any other conditionals to include in the WHERE statement. -#' -#'@export +#' @title Builds a MySQL query aimed at the EventLogging-centric formats +#' @description constructs a MySQL query with a conditional around date. +#' This is aimed at eventlogging, where the date/time is always "timestamp". +#' @param fields the `SELECT` statement. +#' @param table the table to use. +#' @param date the date to restrict to. If `NULL`, yesterday will be used. +#' @param any other conditionals to include in the `WHERE` statement. +#' @export build_query <- function(fields, table, date = NULL, conditionals = NULL){ # Ensure we have a date and deconstruct it into a MW-friendly format if (is.null(date)) { date <- Sys.Date() - 1 } date <- gsub(x = date, pattern = "-", replacement = "") # Build the query proper (this will work for EL schemas where the field is always 'timestamp') - query <- paste(fields, "FROM", table, "WHERE LEFT(timestamp,8) =", date, - ifelse(is.null(conditionals), "", "AND"), conditionals) + query <- paste0( + fields, " FROM ", table, " WHERE LEFT(timestamp, 8) = '", date, "'", + ifelse(is.null(conditionals), "", " AND "), + conditionals + ) results <- mysql_read(query, "log") stop_on_empty(results) return(results) } diff --git a/R/power.R b/R/power.R deleted file mode 100644 index f0f3201..0000000 --- a/R/power.R +++ /dev/null @@ -1,153 +0,0 @@ -oddsRatio <- function(p_treatment, p_control) { - return( (p_treatment/(1-p_treatment)) / (p_control/(1-p_control)) ) -} -pTreatment <- function(p_control, odds_ratio) { - return( (odds_ratio*p_control) / ((p_control*(odds_ratio-1)) + 1) ) -} -pControl <- function(p_treatment, odds_ratio) { - return( 1/( (odds_ratio * ((1/p_treatment) - 1)) + 1 ) ) -} - -#'@title calculate sample size given odds ratio -#' -#'@param odds_ratio The expected odds ratio. That is, the ratio of the odds of -#' the outcome in the test group relative to the control group. Optional, -#' but see \strong{Details}. -#'@param p_control Your guess for prevalence of outcome in the control group. -#' Optional but see \strong{Details}. -#'@param p_treatment Your guess for prevalence of outcome in the test group. -#' Optional but see \strong{Details}. -#'@param power The ability of the test to detect an effect where there is one. -#' Power = 1 - Prob(Type 2 error). Optional. See \strong{Value} for details. -#'@param conf_level Desired confidence level. Defaults to 95\%. -#'@param sample_ratio Ratio of test group to control group. 1 is even split. -#'@param visualize Whether to plot power or prevalence of outcome in the -#' control group vs sample size. Can be used to help make a decision. -#' -#'@details The function only needs to know two of the following three: -#' \code{odds_ratio}, \code{p_control}, and \code{p_treatment}. If given -#' all three, it will check to make sure the odds ratio is correct. It -#' will figure out the missing third value from the other two. -#' -#'@return If \code{power} was not provided, returns vector containing -#' possible power values and the appropriate sample size for each \%. -#' If all values were provided, returns a single sample size estimate. -#' -#'@section References: -#'Wang, H., Chow, S.-C., & Li, G. (2002). On sample size calculation based on -#' odds ratio in clinical trials. \emph{Journal of Biopharmaceutical -#' Statistics}, \strong{12}(4), 471–483. -#' \url{http://doi.org/10.1081/BIP-120016231} -#' -#'@examples -#'sample_size_odds(p_treatment = 0.4, p_control = 0.25, power = 0.8) -#'sample_size_odds(odds_ratio = 2, p_control = 0.4, power = c(0.8, 0.9, 0.95)) -#'sample_size_odds(odds_ratio = 2, p_control = 0.4) -#'sample_size_odds(odds_ratio = 2, p_control = 0.4, visualize = TRUE) -#' -#'@export -sample_size_odds <- function(odds_ratio = NULL, - p_control = NULL, - p_treatment = NULL, - power = NULL, conf_level = 0.95, - sample_ratio = 1, - visualize = FALSE) { - # Checks - power_missing <- is.null(power) - pC_missing <- is.null(p_control) - pT_missing <- is.null(p_treatment) - oR_missing <- is.null(odds_ratio) - if ( (oR_missing + pC_missing + pT_missing) > 1) { - stop("Only one of {odds_ratio, p_control, p_treatment} can be missing.") - } - - # Imputations (Part 1) - if (power_missing) { - power <- seq(0.5, 0.99, 0.01) - } - # Imputations (Part 2) - if (pC_missing) { - p_control <- pControl(p_treatment, odds_ratio) - } else if (pT_missing) { - p_treatment <- pTreatment(p_control, odds_ratio) - } else if (oR_missing) { - odds_ratio <- oddsRatio(p_treatment, p_control) - } - - # Calculations - x <- p_treatment * (1 - p_treatment) * sample_ratio - y <- p_control * (1 - p_control) - z_alpha <- qnorm((1-conf_level)/2) - z_beta <- qnorm(1-power) - n_b <- (1/x + 1/y) * (((z_alpha + z_beta)^2) / (log(odds_ratio)^2)) - n_a <- sample_ratio * n_b - n <- ceiling(n_a + n_b) - - # Visualization - if (visualize) { - if (power_missing || length(power) > 1) { - plot(power, n, type = "l", main = "Sample size as function of statistical power", - ylab = "N", xlab = "Power to detect effect", lwd = 2, xaxt = "n") - axis(side = 1, at = seq(0.5, 1, 0.1), - labels = sprintf("%.0f%%", 100*seq(0.5, 1, 0.1))) - abline(v = seq(0.5, 1, 0.1), lty = "dotted", col = "lightgray", lwd = par("lwd")) - } else { - warning("All parameters known. Nothing to visualize.") - } - } - - # Output - if (power_missing || length(power) > 1) { - names(n) <- sprintf("%.0f%%", power*100) - } - return(n) - -} - -#'@title calculate sample size given effect size -#'@description Uses Cohen's w for effect size to calculate sample size for -#' a chi-squared test of independence. -#' -#'@param w Effect size you want the test to be able to detect. (Optional) -#'@param groups Number of groups. Used in degrees of freedom calculation. -#' Defaults to 2 (e.g. control group vs treatment group). -#'@param sig_level Probability of Type 1 error. Usually called alpha. -#' Defaults to 0.05. -#'@param power Ability to detect the effect. (1 - probability of Type 2 error) -#' Defaults to 80\%. -#'@return If \code{w} was not provided, returns a data frame containing -#' possible values of w and the corresponding sample size estimates. -#' -#'@examples -#'sample_size_effect() -#'sample_size_effect(0.1) -#'sample_size_effect(w = 0.1, groups = 3, sig_level = 0.001, power = 0.9) -#' -#'@importFrom pwr pwr.chisq.test -#'@export -sample_size_effect <- function(w = NULL, groups = 2, - sig_level = 0.05, power = 0.8) { - # Checks - w_missing <- is.null(w) - if (!w_missing && w <= 0.01) stop("'w' must be > 0.01") - if (power <= 0.1 || power > 1.0) stop("'power' must be in (0.1, 1]") - - # Imputation - if (w_missing) w <- c(0.05, 0.1, 0.3, 0.5) - - # Calculation and output - if (length(w) > 1) { - n <- ceiling(sapply(w, function(ww) { - pwr::pwr.chisq.test(w = ww, N = NULL, df = groups - 1, - sig.level = sig_level, power = power)$N - })) - names(n) <- c("tiny", "small", "medium", "large") - } else { - n <- ceiling(pwr::pwr.chisq.test(w = w, N = NULL, df = groups - 1, - sig.level = sig_level, power = power)$N) - } - return(n) -} - - - diff --git a/R/proxies.R b/R/proxies.R index 0bb0cce..78eab10 100644 --- a/R/proxies.R +++ b/R/proxies.R @@ -1,19 +1,17 @@ -#'@title set HTTP and HTTPS proxies -#'@description set the HTTP and HTTPS proxies when running R on -#'one of the Wikimedia servers +#' @title Set HTTP and HTTPS proxies +#' @description Sets the HTTP and HTTPS proxies when running R on +#' Wikimedia machines. +#' @examples +#' \dontrun{ +#' # This will fail in the cluster +#' devtools::install_github("ironholds/urltools") #' -#'@examples -#'\dontrun{ -#'#This will fail in the cluster -#'devtools::install_github("ironholds/urltools") -#' -#'#This will work -#'set_proxies() -#'devtools::install_github("ironholds/urltools") +#' # This will work +#' set_proxies() +#' devtools::install_github("ironholds/urltools") #'} -#' #'@export -set_proxies <- function(){ +set_proxies <- function() { Sys.setenv("http_proxy" = "http://webproxy.eqiad.wmnet:8080") Sys.setenv("https_proxy" = "http://webproxy.eqiad.wmnet:8080") -} \ No newline at end of file +} diff --git a/R/time.R b/R/time.R index 94c5b3c..cf1230e 100644 --- a/R/time.R +++ b/R/time.R @@ -1,32 +1,31 @@ -#'@title convert to and from common timestamp formats -#'@description convert to and from MediaWiki and request log timestamp formats -#' -#'@param x a vector of timestamps -#' -#'@name timeconverters -#'@rdname timeconverters -#' -#'@examples -#'from_mediawiki("20150101010301") -#'@export -from_mediawiki <- function(x){ +#' @title convert to and from common timestamp formats +#' @description convert to and from MediaWiki and request log timestamp formats +#' @param x a vector of timestamps +#' @name timeconverters +#' @rdname timeconverters +#' @examples +#' from_mediawiki("20150101010301") +#' @author Oliver Keyes +#' @seealso [lubridate::ymd_hms()] +#' @export +from_mediawiki <- function(x) { return(strptime(substr(x, 0, 14), format = "%Y%m%d%H%M%S", tz = "UTC")) } -#'@rdname timeconverters -#'@export -from_log <- function(x){ +#' @rdname timeconverters +#' @export +from_log <- function(x) { return(strptime(substr(iconv(x, to = "UTF-8"), 0, 19), format = "%Y-%m-%dT%H:%M:%S", tz = "UTC")) } -#'@rdname timeconverters -#'@export -to_mediawiki <- function(x){ +#' @rdname timeconverters +#' @export +to_mediawiki <- function(x) { gsub(x = x, pattern = "(:| |-)", replacement = "") } -#'@rdname timeconverters -#'@export -to_log <- function(x){ +#' @rdname timeconverters +#' @export +to_log <- function(x) { gsub(x = x, pattern = " ", replacement = "T") -} \ No newline at end of file +} diff --git a/R/wmf.R b/R/wmf.R index e69de29..3929cc8 100644 --- a/R/wmf.R +++ b/R/wmf.R @@ -0,0 +1,8 @@ +#' @title wmf: R Code for Wikimedia Foundation Internal Usage +#' @description This package contains functions made for Analysts at Wikimedia +#' Foundation, but can be used by people outside of the Foundation. +#' @docType package +#' @name wmf +#' @useDynLib wmf +#' @importFrom Rcpp sourceCpp +NULL diff --git a/R/writers.R b/R/writers.R index e6f465e..49125cd 100644 --- a/R/writers.R +++ b/R/writers.R @@ -1,43 +1,38 @@ -#'@title Conditionally write out to a file -#'@description if the file already exists, append. If it -#'doesn't, create! -#' -#'@param x the object to write out -#' -#'@param file the path to the file to use. -#' -#'@export -write_conditional <- function(x, file){ - if(file.exists(file)){ +#' @title Conditionally write out to a file +#' @description If the file already exists, append. If it doesn't, create! +#' @param x the object to write out +#' @param file the path to the file to use +#' @seealso [rewrite_conditional()] +#' @export +write_conditional <- function(x, file) { + if (file.exists(file)) { write.table(x, file, append = TRUE, sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) } else { write.table(x, file, append = FALSE, sep = "\t", row.names = FALSE, quote = FALSE) } } -#'@title Conditionally write to a (rolling) file -#'@description writes out temporal data to a file while ensuring -#'the file only has \code{n_days} worth of data in it. -#' -#'@inheritParams write_conditional -#' -#'@param n_days the number of days worth of data to have in the file. -#'30 by default. -#' -#'@importFrom readr read_tsv -#'@export +#' @title Conditionally write to a (rolling) file +#' @description Writes out temporal data to a file while ensuring +#' the file only has `n_days` worth of data in it. +#' @inheritParams write_conditional +#' @param n_days the number of days worth of data to have in the file; +#' 30 by default +#' @importFrom readr read_tsv +#' @seealso [write_conditional()] +#' @export rewrite_conditional <- function(x, file, n_days = 30) { if ("grouped_df" %in% class(x)) { x <- dplyr::ungroup(x) } if (file.exists(file)) { y <- readr::read_tsv(file) y <- y[order(y$date, decreasing = FALSE), ] if ((Sys.Date() - min(y$date)) > (n_days + 1)) { z <- rbind(y[y$date >= (Sys.Date() - 1 - n_days), ], x) write.table(z, file, append = FALSE, sep = "\t", row.names = FALSE, quote = FALSE) return(invisible()) } } write_conditional(x, file) -} \ No newline at end of file +} diff --git a/README.md b/README.md index c8a3a17..711d71d 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,33 @@ -# R Tools for WMF Analytics +# R Tools for Wikimedia Foundation's Analysts [This package](https://phabricator.wikimedia.org/diffusion/1821/) contains functions made for Analysts at Wikimedia Foundation, but can be used by people outside of the Foundation. -1. `set_proxies()` to set http(s) proxies on the analytics cluster; -2. `global_query()` for querying all of our MySQL databases; -3. `from_mediawiki` and `from_log` (and corresponding `to_*` functions) to convert between time formats, and; -4. `query_hive` for querying our Hadoop cluster via Hive. -5. `sample_size_odds` and `sample_size_effect` for calculating sample size(s) given an odds ratio or effect size (Cohen's *w*). +- `set_proxies()` to set http(s) proxies on the analytics cluster +- `global_query()` for querying all of our MySQL databases +- `from_mediawiki()` and `from_log()` (and corresponding `to_*` functions) to convert between time formats +- `query_hive()` for querying our Hadoop cluster via Hive. +- Sample size calculations: + - `chisq_test_odds` estimates sample size for a chi-squared test given an odds ratio + - `chisq_test_effect` estimates sample size for a chi-squared test given Cohen's *w* More functions as people need them. -Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms. - ## Installation ```R -# install.packages("devtools", repos = "https://cran.rstudio.com/") +# install.packages("devtools", repos = c(CRAN = "https://cran.rstudio.com/")) devtools::install_git("https://gerrit.wikimedia.org/r/wikimedia/discovery/wmf") -# Alternatively, you can install from the GitHub mirror of WMF's Gerrit repos: +# Alternatively, you can install from GitHub mirror: devtools::install_github("wikimedia/wikimedia-discovery-wmf") ``` ## Maintainers - [Mikhail Popov](https://meta.wikimedia.org/wiki/User:MPopov_(WMF)) - [Chelsy Xie](https://meta.wikimedia.org/wiki/User:CXie_(WMF)) + +## Additional Information + +Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms. diff --git a/data/interleaved_data.rda b/data/interleaved_data.rda new file mode 100644 index 0000000..2adf926 Binary files /dev/null and b/data/interleaved_data.rda differ diff --git a/data/interleaved_data_a.rda b/data/interleaved_data_a.rda new file mode 100644 index 0000000..2c0d29d Binary files /dev/null and b/data/interleaved_data_a.rda differ diff --git a/data/interleaved_data_b.rda b/data/interleaved_data_b.rda new file mode 100644 index 0000000..5fd9710 Binary files /dev/null and b/data/interleaved_data_b.rda differ diff --git a/man/FiveThirtyNine.Rd b/man/FiveThirtyNine.Rd index fb66530..0282bfd 100644 --- a/man/FiveThirtyNine.Rd +++ b/man/FiveThirtyNine.Rd @@ -1,23 +1,25 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/dataviz.R \name{FiveThirtyNine} \alias{FiveThirtyNine} \alias{theme_fivethirtynine} \title{Theme inspired by fivethirtyeight.com plots} \usage{ theme_fivethirtynine(base_size = 12, base_family = "sans") } \arguments{ \item{base_size}{base font size} \item{base_family}{base font family} } \description{ -A modification of \code{ggthemes::theme_fivethirtyeight} +A modification of \code{\link[ggthemes:theme_fivethirtyeight]{ggthemes::theme_fivethirtyeight()}} } \details{ Basically it adds axis titles (with some modification on the y to - allow for long titles) back in and does a small amount of reduction of the - overall plot size to avoid an absolute ton of extraneous spacing. +allow for long titles) back in and does a small amount of reduction of the +overall plot size to avoid an absolute ton of extraneous spacing. +} +\author{ +Oliver Keyes } - diff --git a/man/build_query.Rd b/man/build_query.Rd index 8f73d7a..fdc0170 100644 --- a/man/build_query.Rd +++ b/man/build_query.Rd @@ -1,22 +1,21 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/mysql.R \name{build_query} \alias{build_query} \title{Builds a MySQL query aimed at the EventLogging-centric formats} \usage{ build_query(fields, table, date = NULL, conditionals = NULL) } \arguments{ -\item{fields}{the SELECT statement.} +\item{fields}{the \code{SELECT} statement.} \item{table}{the table to use.} -\item{date}{the date to restrict to. If NULL, yesterday will be used.} +\item{date}{the date to restrict to. If \code{NULL}, yesterday will be used.} -\item{any}{other conditionals to include in the WHERE statement.} +\item{any}{other conditionals to include in the \code{WHERE} statement.} } \description{ constructs a MySQL query with a conditional around date. This is aimed at eventlogging, where the date/time is always "timestamp". } - diff --git a/man/sample_size_effect.Rd b/man/chisq_test_effect.Rd similarity index 52% rename from man/sample_size_effect.Rd rename to man/chisq_test_effect.Rd index 88ddd2b..c178a65 100644 --- a/man/sample_size_effect.Rd +++ b/man/chisq_test_effect.Rd @@ -1,35 +1,39 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/power.R -\name{sample_size_effect} -\alias{sample_size_effect} -\title{calculate sample size given effect size} +% Please edit documentation in R/chisq_test.R +\name{chisq_test_effect} +\alias{chisq_test_effect} +\title{Calculate sample size for chi-squared test of indep. given Cohen's w} \usage{ -sample_size_effect(w = NULL, groups = 2, sig_level = 0.05, power = 0.8) +chisq_test_effect(w = NULL, groups = 2, sig_level = 0.05, power = 0.8) } \arguments{ \item{w}{Effect size you want the test to be able to detect. (Optional)} \item{groups}{Number of groups. Used in degrees of freedom calculation. Defaults to 2 (e.g. control group vs treatment group).} \item{sig_level}{Probability of Type 1 error. Usually called alpha. Defaults to 0.05.} \item{power}{Ability to detect the effect. (1 - probability of Type 2 error) Defaults to 80\%.} } \value{ If \code{w} was not provided, returns a data frame containing - possible values of w and the corresponding sample size estimates. +possible values of w and the corresponding sample size estimates. } \description{ Uses Cohen's w for effect size to calculate sample size for - a chi-squared test of independence. +a chi-squared test of independence. } \examples{ -sample_size_effect() -sample_size_effect(0.1) -sample_size_effect(w = 0.1, groups = 3, sig_level = 0.001, power = 0.9) - +chisq_test_effect() +chisq_test_effect(0.1) +chisq_test_effect(w = 0.1, groups = 3, sig_level = 0.001, power = 0.9) +} +\seealso{ +\code{\link[=chisq_test_odds]{chisq_test_odds()}} +} +\author{ +Mikhail Popov } - diff --git a/man/sample_size_odds.Rd b/man/chisq_test_odds.Rd similarity index 60% rename from man/sample_size_odds.Rd rename to man/chisq_test_odds.Rd index ad73b92..27f3703 100644 --- a/man/sample_size_odds.Rd +++ b/man/chisq_test_odds.Rd @@ -1,56 +1,61 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/power.R -\name{sample_size_odds} -\alias{sample_size_odds} -\title{calculate sample size given odds ratio} +% Please edit documentation in R/chisq_test.R +\name{chisq_test_odds} +\alias{chisq_test_odds} +\title{Calculate sample size for chi-squared test of indep. given odds ratio} \usage{ -sample_size_odds(odds_ratio = NULL, p_control = NULL, p_treatment = NULL, +chisq_test_odds(odds_ratio = NULL, p_control = NULL, p_treatment = NULL, power = NULL, conf_level = 0.95, sample_ratio = 1, visualize = FALSE) } \arguments{ \item{odds_ratio}{The expected odds ratio. That is, the ratio of the odds of the outcome in the test group relative to the control group. Optional, -but see \strong{Details}.} +but see \emph{Details}.} \item{p_control}{Your guess for prevalence of outcome in the control group. Optional but see \strong{Details}.} \item{p_treatment}{Your guess for prevalence of outcome in the test group. Optional but see \strong{Details}.} \item{power}{The ability of the test to detect an effect where there is one. Power = 1 - Prob(Type 2 error). Optional. See \strong{Value} for details.} \item{conf_level}{Desired confidence level. Defaults to 95\%.} \item{sample_ratio}{Ratio of test group to control group. 1 is even split.} \item{visualize}{Whether to plot power or prevalence of outcome in the control group vs sample size. Can be used to help make a decision.} } \value{ If \code{power} was not provided, returns vector containing - possible power values and the appropriate sample size for each \%. - If all values were provided, returns a single sample size estimate. +possible power values and the appropriate sample size for each \%. +If all values were provided, returns a single sample size estimate. } \details{ The function only needs to know two of the following three: \code{odds_ratio}, \code{p_control}, and \code{p_treatment}. If given all three, it will check to make sure the odds ratio is correct. It will figure out the missing third value from the other two. } \section{References}{ Wang, H., Chow, S.-C., & Li, G. (2002). On sample size calculation based on - odds ratio in clinical trials. \emph{Journal of Biopharmaceutical - Statistics}, \strong{12}(4), 471–483. - \url{http://doi.org/10.1081/BIP-120016231} +odds ratio in clinical trials. \emph{Journal of Biopharmaceutical +Statistics}, \strong{12}(4), 471–483. +\href{http://doi.org/10.1081/BIP-120016231}{doi:10.1081/BIP-120016231} } -\examples{ -sample_size_odds(p_treatment = 0.4, p_control = 0.25, power = 0.8) -sample_size_odds(odds_ratio = 2, p_control = 0.4, power = c(0.8, 0.9, 0.95)) -sample_size_odds(odds_ratio = 2, p_control = 0.4) -sample_size_odds(odds_ratio = 2, p_control = 0.4, visualize = TRUE) +\examples{ +chisq_test_odds(p_treatment = 0.4, p_control = 0.25, power = 0.8) +chisq_test_odds(odds_ratio = 2, p_control = 0.4, power = c(0.8, 0.9, 0.95)) +chisq_test_odds(odds_ratio = 2, p_control = 0.4) +chisq_test_odds(odds_ratio = 2, p_control = 0.4, visualize = TRUE) +} +\seealso{ +\code{\link[=chisq_test_effect]{chisq_test_effect()}} +} +\author{ +Mikhail Popov } - diff --git a/man/date_clause.Rd b/man/date_clause.Rd index b2736b6..0a975c7 100644 --- a/man/date_clause.Rd +++ b/man/date_clause.Rd @@ -1,20 +1,20 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/hive.R \name{date_clause} \alias{date_clause} \title{Generate a Date Clause for a Hive query} \usage{ date_clause(date) } \arguments{ -\item{date}{the date to use. If NULL, yesterday will be used.} +\item{date}{if \code{NULL}, yesterday will be used} } \value{ -a list containing two elements, "date_clause" and "date"; the returning of -the date allows you to include it with. +a list containing two elements: "date_clause" and "date"; the +returning of the date allows you to include it } \description{ -what it says on the tin; generates a "WHERE year = foo AND month = bar" using lubridate +What it says on the tin; generates a +\code{WHERE year = foo AND month = bar} that can then be combined with other elements to form a Hive query. } - diff --git a/man/exact_binom.Rd b/man/exact_binom.Rd new file mode 100644 index 0000000..70cdcff --- /dev/null +++ b/man/exact_binom.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{exact_binom} +\alias{exact_binom} +\title{Sample size for exact, one sample binomial test} +\usage{ +exact_binom(constant_prop, effect_size, alpha = 0.05, power = 0.8, + two_tail = TRUE) +} +\arguments{ +\item{constant_prop}{The proportion under the null hypothesis.} + +\item{effect_size}{Positive size of the difference between your null +hypothesis and the alternative hypothesis that you hope to detect. +\strong{Heads-up} that values less than 1\% might take a while to calculate.} + +\item{alpha}{Probability of rejecting the null hypothesis even though it is +true.} + +\item{power}{Probability of rejecting the null hypothesis (getting a +significant result) when the real difference is equal to the minimum +effect size.} + +\item{two_tail}{Whether to perform two-tail or one-tail power analysis. +\code{TRUE} (default) tests in both directions of difference.} +} +\description{ +Estimates sample size required to detect difference from a +constant proportion. +} +\examples{ +exact_binom(0.75, 0.03) +} +\references{ +\href{http://www.biostathandbook.com/power.html}{Power analysis} and +\href{http://www.biostathandbook.com/exactgof.html}{Exact test of goodness-of-fit} from +John H. McDonald's \href{http://www.biostathandbook.com/}{Handbook of Biological Statistics} +} diff --git a/man/get_logfile.Rd b/man/get_logfile.Rd index 00e28d9..02cb790 100644 --- a/man/get_logfile.Rd +++ b/man/get_logfile.Rd @@ -1,27 +1,29 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/logs.R \name{get_logfile} \alias{get_logfile} -\title{retrieve a vector of sampled log files} +\title{Retrieve a vector of sampled log files} \usage{ get_logfile(earliest = NULL, latest = NULL) } \arguments{ -\item{earliest}{a "Date" object. Set to NULL by default, which triggers +\item{earliest}{a \code{Date} object. Set to \code{NULL} by default, which triggers the retrieval of all log file names.} -\item{latest}{a "Date" object; set to NULL by default. In the event that -\code{earliest} is set but \code{latest} is not, the files retrieved -will span from \code{earliest} to the current date; in the event that -both arguments are set, the retrieved files will be those in that range.} +\item{latest}{a \code{Date} object; set to \code{NULL} by default. In the event that +\code{earliest} is set but \code{latest} is not, the files retrieved will span from +\code{earliest} to the current date; in the event that both arguments are set, +the retrieved files will be those in that range.} } \value{ -A vector of filenames that can be passed into \code{\link{read_sampled_log}}. +A vector of filenames that can be passed into \code{\link[=read_sampled_log]{read_sampled_log()}} } \description{ -Grab sampled log files to be piped into -\code{\link{read_sampled_log}}. By default this retrieves all -sampled log files; it can be used to retrieve a particular date range of -files through the "earliest" and "latest" arguments. +Grabs sampled log files to be piped into \code{\link[=read_sampled_log]{read_sampled_log()}}. +By default this retrieves all sampled log files; it can be used to +retrieve a particular date range of files through the "earliest" and +"latest" arguments. +} +\author{ +Oliver Keyes } - diff --git a/man/global_query.Rd b/man/global_query.Rd index f07525c..c9bd95d 100644 --- a/man/global_query.Rd +++ b/man/global_query.Rd @@ -1,26 +1,28 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/global.R \name{global_query} \alias{global_query} -\title{global SQL queries for analytics-store.eqiad.wmnet} +\title{Global SQL queries for analytics-store.eqiad.wmnet} \usage{ global_query(query, project_type = "all") } \arguments{ \item{query}{the SQL query you want to run} -\item{project_type}{what class of wiki (wikisource, wiktionary..) you want to run against. Set to "all" by default.} +\item{project_type}{what class of wiki (e.g. "wikisource", "wiktionary") +you want to run against. Set to "all" by default.} } \description{ -\code{global_query} is a simple wrapper around the mysql queries that allows a useR to send a query to all production -dbs on analytics-store.eqiad.wmnet, joining the results from each query into a single object. -} -\author{ -Oliver Keyes +\code{global_query} is a simple wrapper around the MySQL queries +that allows a useR to send a query to all production dbs on +analytics-store.eqiad.wmnet, joining the results from each query into a +single object. } \seealso{ -\code{\link{mysql_read}} for querying an individual db, \code{\link{mw_strptime}} -for converting MediaWiki timestamps into POSIXlt timestamps, or \code{\link{hive_query}} for -accessing the Hive datastore. +\link{mysql_read} for querying an individual db, +\link{mw_strptime} for converting MediaWiki timestamps into \code{POSIXlt} timestamps, +or \link{hive_query} for accessing the Hive datastore +} +\author{ +Oliver Keyes } - diff --git a/man/interleaved.Rd b/man/interleaved.Rd new file mode 100644 index 0000000..71d6a99 --- /dev/null +++ b/man/interleaved.Rd @@ -0,0 +1,96 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interleaved.R +\docType{data} +\name{interleaved} +\alias{interleaved} +\alias{interleaved_preference} +\alias{interleaved_sample_size} +\alias{interleaved_data} +\title{Interleaved search results} +\format{\code{interleaved_data*} are \code{data.frame}-s of generated search sessions with +the following columns: +\describe{ +\item{session_id}{10-character alphanumeric ID; for grouping events} +\item{timestamp}{when the event occurred; uses \link[base:DateTimeClasses]{POSIXct} format} +\item{event}{"serp" or "click"} +\item{position}{position ("ranking") of the clicked search result} +\item{ranking_function}{"A" or "B"} +}} +\usage{ +interleaved_preference(sessions, ranking_functions) + +interleaved_sample_size(alpha = 0.05, power = 0.8) + +interleaved_data +} +\arguments{ +\item{sessions}{vector of session IDs used to group \code{positions} and +\code{ranking_functions}} + +\item{ranking_functions}{vector that shows which ranking function the +clicked search result came from ("A" or "B")} + +\item{alpha}{Probability of rejecting the null hypothesis even though it is +true.} + +\item{power}{Probability of rejecting the null hypothesis (getting a +significant result) when the real difference is equal to the minimum +effect size.} +} +\description{ +Tools for analysis of experiments that use interleaved search +results wherein users receive results from multiple sets of retrieval +functions. +\itemize{ +\item \code{interleaved_data} is a fake dataset used for testing and examples; +refer to \strong{Format} section below +\item \code{interleaved_data_a} is a fake dataset used for testing and examples; +"A" is preferred over "B" +\item \code{interleaved_data_b} is a fake dataset used for testing and examples; +"B" is preferred over "A" +\item \code{interleaved_preference} returns a test statistic summarizing the +interleaving experiment; a positive value indicates that A is better +than B, a negative value indicates that B is better than A +\item \code{interleaved_sample_size()} estimates the sample size required to detect +a particular effect size with a specified power and significance level +} +} +\examples{ +# Preference statistic calculation: +\dontrun{ +## Data without a clear preference: +data("interleaved_data", package = "wmf") +x <- interleaved_data[interleaved_data$event == "click", ] +x <- x[order(x$session_id, x$timestamp), ] +interleaved_preference(x$session_id, x$ranking_function) + +## Data where A is preferred over B: +data("interleaved_data_a", package = "wmf") +x <- interleaved_data_a[interleaved_data_a$event == "click", ] +x <- x[order(x$session_id, x$timestamp), ] +interleaved_preference(x$session_id, x$ranking_function) + +## Data where B is preferred over A: +data("interleaved_data_b", package = "wmf") +x <- interleaved_data_b[interleaved_data_b$event == "click", ] +x <- x[order(x$session_id, x$timestamp), ] +interleaved_preference(x$session_id, x$ranking_function) +} +# Sample size estimation: +\dontrun{ +data(interleaved_data, package = "wmf") +interleaved_sample_size() +} +} +\references{ +\itemize{ +\item Chapelle, O., Joachims, T., Radlinski, F., & Yue, Y. (2012). Large-scale +validation and analysis of interleaved search evaluation. +\emph{ACM Transactions on Information Systems}, \strong{30}(1), 1–41. +\href{https://doi.org/10.1145/2094072.2094078}{doi:10.1145/2094072.2094078} +\item Radlinski, F. and Craswell, N. (2013). \href{https://www.microsoft.com/en-us/research/publication/optimized-interleaving-for-online-retrieval-evaluation/}{Optimized interleaving for online retrieval evaluation}. +\emph{ACM International Conference on Web Search and Data Mining (WSDM)}. +\href{https://doi.org/10.1145/2433396.2433429}{doi:10.1145/2433396.2433429} +} +} +\keyword{datasets} diff --git a/man/mysql.Rd b/man/mysql.Rd index de36b60..1dd7b74 100644 --- a/man/mysql.Rd +++ b/man/mysql.Rd @@ -1,46 +1,45 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/mysql.R \name{mysql} \alias{mysql} -\alias{mysql_close} \alias{mysql_connect} -\alias{mysql_disconnect} -\alias{mysql_exists} \alias{mysql_read} +\alias{mysql_exists} \alias{mysql_write} +\alias{mysql_close} +\alias{mysql_disconnect} \title{Work with MySQL databases} \usage{ mysql_connect(database, default_file = NULL) mysql_read(query, database, con = NULL) mysql_exists(database, table_name, con = NULL) mysql_write(x, database, table_name, con = NULL, ...) mysql_close(con) mysql_disconnect(con) } \arguments{ -\item{database}{The name of the database to query.} +\item{database}{name of the database to query} -\item{query}{A SQL query.} +\item{query}{SQL query} -\item{con}{A MySQL connection returned by \code{mysql_connect}. -Optional -- if not provided, a temporary connection will be opened up.} +\item{con}{MySQL connection returned by \code{\link[=mysql_connect]{mysql_connect()}}; \emph{optional} -- if +not provided, a temporary connection will be opened up} -\item{...}{Further arguments to pass to dbWriteTable. See ?dbWriteTable for more details.} +\item{...}{Further arguments to pass to \code{\link[RMySQL:dbWriteTable]{RMySQL::dbWriteTable()}}} -\item{table}{The name of a table to check for the existence of or create, -depending on the function.} +\item{table}{name of a table to check for the existence of or create, +depending on the function} } \description{ Read from, write to, and check data from the MySQL databases and - tables in the Wikimedia cluster. Assumes the presence of a validly - formatted configuration file. +tables in the Wikimedia cluster. Assumes the presence of a validly +formatted configuration file. } \seealso{ -\code{\link{hive_query}} or \code{\link{global_query}} +\code{\link[=hive_query]{hive_query()}} or \code{\link[=global_query]{global_query()}} } - diff --git a/man/query_hive.Rd b/man/query_hive.Rd index 8483052..88110a1 100644 --- a/man/query_hive.Rd +++ b/man/query_hive.Rd @@ -1,61 +1,52 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/hive.R \name{query_hive} \alias{query_hive} -\title{hive_query} +\title{Query Hadoop cluster with Hive} \usage{ query_hive(query, override_jars = FALSE) } \arguments{ \item{query}{a Hive query} \item{override_jars}{A logical flag indicating whether to override the path. Hive on WMF's analytics machine(s) loads some JARs by default, so if your query uses an updated version of an existing UDF and you want to load the -JAR that you built yourself, set this to TRUE. See -\href{https://wikitech.wikimedia.org/wiki/Analytics/Cluster/Hive/QueryUsingUDF#Testing_changes_to_existing_udf}{this section} +JAR that you built yourself, set this to \code{TRUE}. See +\href{https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive/QueryUsingUDF#Testing_changes_to_existing_udf}{Testing changes to existing UDF} for more details.} } \value{ -a data.frame containing the results of the query, or a boolean TRUE if the user has chosen -to write straight to file. +a \code{data.frame} containing the results of the query, or a \code{TRUE} if +the user has chosen to write straight to file. } \description{ -this is the "old" hive querying function - it's deprecated as all hell and waiting -until Andrew sticks the hive server on a dedicated and more powerful machine. -} -\details{ -Hive querying function +Queries Hive } \section{escaping}{ -\code{hive_query} works by running the query you provide through the CLI via a system() call. -As a result, single escapes for meaningful characters (such as quotes) within the query will not work: -R will interpret them only as escaping that character /within R/. Double escaping (\\\) is thus necessary, -in the same way that it is for regular expressions. +\code{hive_query} works by running the query you provide through the CLI via a +\code{\link[=system]{system()}} call. As a result, single escapes for meaningful characters +(such as quotes) within the query will not work: R will interpret them +only as escaping that character /within R/. Double escaping (\) is thus +necessary, in the same way that it is for regular expressions. } -\section{handling our hadoop/hive setup}{ - - -The \code{webrequests} table is documented -\href{https://wikitech.wikimedia.org/wiki/Analytics/Cluster/Hive}{on Wikitech}, which also provides -\href{https://wikitech.wikimedia.org/wiki/Analytics/Cluster/Hive/Queries}{a set of example -queries}. +\section{Handling our hadoop/hive setup}{ -When it comes to manipulating the rows with Java before they get to you, Nuria has written a -\href{https://wikitech.wikimedia.org/wiki/Analytics/Cluster/Hive/QueryUsingUDF}{brief tutorial on loading UDFs} -which should help if you want to engage in that; the example provided is a user agent parser, allowing you to -get the equivalent of \code{\link{ua_parse}}'s output further upstream. +The \code{webrequests} table is documented \href{https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive}{on Wikitech}, +which also provides \href{https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive/Queries}{a set of example queries}. When it comes to manipulating the rows with Java before they get to you, Nuria has written a +\href{https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive/QueryUsingUDF}{brief tutorial on loading UDFs} +which should help if you want to engage in that. } + \examples{ \dontrun{ query_hive("USE wmf; DESCRIBE webrequest;") } - } \seealso{ -\code{\link{log_strptime}} for converting the "dt" column in the webrequests table to POSIXlt, -and \code{\link{mysql_query}} and \code{\link{global_query}} for querying our MySQL databases. +\code{\link[=log_strptime]{log_strptime()}} or \code{\link[lubridate:ymd_hms]{lubridate::ymd_hms()}} for converting the "dt" +column in the webrequests table to proper datetime, and \code{\link[=mysql_query]{mysql_query()}} +and \code{\link[=global_query]{global_query()}} for querying our MySQL databases } - diff --git a/man/read_sampled_log.Rd b/man/read_sampled_log.Rd index 3baa690..9c3f65b 100644 --- a/man/read_sampled_log.Rd +++ b/man/read_sampled_log.Rd @@ -1,28 +1,45 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/logs.R \name{read_sampled_log} \alias{read_sampled_log} -\title{read a sampled log file} +\title{Read a sampled log file} \usage{ read_sampled_log(file, transparent = FALSE, nrows = NULL) } \arguments{ -\item{file}{a filename, retrieved with \code{\link{get_logfile}}} +\item{file}{a filename, retrieved with \code{\link[=get_logfile]{get_logfile()}}} \item{transparent}{a logical flag whether to gunzip the log file explicitly first (default) or read it in directly.} -\item{nrows}{Number of rows to read in. (Optional)} +\item{nrows}{number of rows to read in; \emph{optional}} } \value{ -a data.frame containing 16 columns - "squid", "sequence_no", "timestamp", - "servicetime", "ip_address", "status_code", "reply_size", "request_method", - "url", "squid_status", "mime_type", "referer", "x_forwarded", "user_agent", - "lang" and "x_analytics". +a \code{data.frame} containing 16 columns: +\itemize{ +\item squid +\item sequence_no +\item timestamp +\item servicetime +\item ip_address +\item status_code +\item reply_size +\item request_method +\item url +\item squid_status +\item mime_type +\item referer +\item x_forwarded +\item user_agent +\item lang +\item x_analytics +} } \description{ -read a sampled log file identified with \code{\link{get_logfile}}. +Reads a sampled log file identified with \code{\link[=get_logfile]{get_logfile()}}. The sampled logs are returned as a data.frame with 16 columns - see - the "Value" documentation. +the \strong{Value} documentation. +} +\author{ +Oliver Keyes } - diff --git a/man/rewrite_conditional.Rd b/man/rewrite_conditional.Rd index 953d388..d1bd04f 100644 --- a/man/rewrite_conditional.Rd +++ b/man/rewrite_conditional.Rd @@ -1,21 +1,23 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/writers.R \name{rewrite_conditional} \alias{rewrite_conditional} \title{Conditionally write to a (rolling) file} \usage{ rewrite_conditional(x, file, n_days = 30) } \arguments{ \item{x}{the object to write out} -\item{file}{the path to the file to use.} +\item{file}{the path to the file to use} -\item{n_days}{the number of days worth of data to have in the file. -30 by default.} +\item{n_days}{the number of days worth of data to have in the file; +30 by default} } \description{ -writes out temporal data to a file while ensuring +Writes out temporal data to a file while ensuring the file only has \code{n_days} worth of data in it. } - +\seealso{ +\code{\link[=write_conditional]{write_conditional()}} +} diff --git a/man/set_proxies.Rd b/man/set_proxies.Rd index 685932e..7ecaab6 100644 --- a/man/set_proxies.Rd +++ b/man/set_proxies.Rd @@ -1,24 +1,22 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/proxies.R \name{set_proxies} \alias{set_proxies} -\title{set HTTP and HTTPS proxies} +\title{Set HTTP and HTTPS proxies} \usage{ set_proxies() } \description{ -set the HTTP and HTTPS proxies when running R on -one of the Wikimedia servers +Sets the HTTP and HTTPS proxies when running R on +Wikimedia machines. } \examples{ \dontrun{ -#This will fail in the cluster +# This will fail in the cluster devtools::install_github("ironholds/urltools") -#This will work +# This will work set_proxies() devtools::install_github("ironholds/urltools") } - } - diff --git a/man/theme_facet.Rd b/man/theme_facet.Rd new file mode 100644 index 0000000..d462140 --- /dev/null +++ b/man/theme_facet.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataviz.R +\name{theme_facet} +\alias{theme_facet} +\title{Simple theme for facet-ed ggplots} +\usage{ +theme_facet(base_size = 12, base_family = "", border = TRUE, + clean_xaxis = FALSE, ...) +} +\arguments{ +\item{base_size}{font size} + +\item{base_family}{font family} + +\item{border}{whether to add a border around facets} + +\item{clean_xaxis}{whether to remove ticks & labels from x-axis} +} +\author{ +Mikhail Popov & Chelsy Xie +} diff --git a/man/theme_min.Rd b/man/theme_min.Rd new file mode 100644 index 0000000..c7fe0e7 --- /dev/null +++ b/man/theme_min.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataviz.R +\name{theme_min} +\alias{theme_min} +\title{Simple theme for ggplots} +\usage{ +theme_min(base_size = 12, base_family = "", ...) +} +\arguments{ +\item{base_size}{font size} + +\item{base_family}{font family} +} +\author{ +Mikhail Popov +} diff --git a/man/timeconverters.Rd b/man/timeconverters.Rd index d192809..b8f1304 100644 --- a/man/timeconverters.Rd +++ b/man/timeconverters.Rd @@ -1,28 +1,33 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/time.R \name{timeconverters} -\alias{from_log} -\alias{from_mediawiki} \alias{timeconverters} -\alias{to_log} +\alias{from_mediawiki} +\alias{from_log} \alias{to_mediawiki} +\alias{to_log} \title{convert to and from common timestamp formats} \usage{ from_mediawiki(x) from_log(x) to_mediawiki(x) to_log(x) } \arguments{ \item{x}{a vector of timestamps} } \description{ convert to and from MediaWiki and request log timestamp formats } \examples{ from_mediawiki("20150101010301") } - +\seealso{ +\code{\link[lubridate:ymd_hms]{lubridate::ymd_hms()}} +} +\author{ +Oliver Keyes +} diff --git a/man/wmf.Rd b/man/wmf.Rd new file mode 100644 index 0000000..a4e85c0 --- /dev/null +++ b/man/wmf.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/wmf.R +\docType{package} +\name{wmf} +\alias{wmf} +\alias{wmf-package} +\title{wmf: R Code for Wikimedia Foundation Internal Usage} +\description{ +This package contains functions made for Analysts at Wikimedia +Foundation, but can be used by people outside of the Foundation. +} diff --git a/man/write_conditional.Rd b/man/write_conditional.Rd index 6124f0f..6e54e71 100644 --- a/man/write_conditional.Rd +++ b/man/write_conditional.Rd @@ -1,18 +1,19 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/writers.R \name{write_conditional} \alias{write_conditional} \title{Conditionally write out to a file} \usage{ write_conditional(x, file) } \arguments{ \item{x}{the object to write out} -\item{file}{the path to the file to use.} +\item{file}{the path to the file to use} } \description{ -if the file already exists, append. If it -doesn't, create! +If the file already exists, append. If it doesn't, create! +} +\seealso{ +\code{\link[=rewrite_conditional]{rewrite_conditional()}} } - diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..22034c4 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,3 @@ +*.o +*.so +*.dll diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp new file mode 100644 index 0000000..8ee061a --- /dev/null +++ b/src/RcppExports.cpp @@ -0,0 +1,32 @@ +// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include + +using namespace Rcpp; + +// exact_binom +unsigned int exact_binom(double constant_prop, double effect_size, double alpha, double power, bool two_tail); +RcppExport SEXP _wmf_exact_binom(SEXP constant_propSEXP, SEXP effect_sizeSEXP, SEXP alphaSEXP, SEXP powerSEXP, SEXP two_tailSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< double >::type constant_prop(constant_propSEXP); + Rcpp::traits::input_parameter< double >::type effect_size(effect_sizeSEXP); + Rcpp::traits::input_parameter< double >::type alpha(alphaSEXP); + Rcpp::traits::input_parameter< double >::type power(powerSEXP); + Rcpp::traits::input_parameter< bool >::type two_tail(two_tailSEXP); + rcpp_result_gen = Rcpp::wrap(exact_binom(constant_prop, effect_size, alpha, power, two_tail)); + return rcpp_result_gen; +END_RCPP +} + +static const R_CallMethodDef CallEntries[] = { + {"_wmf_exact_binom", (DL_FUNC) &_wmf_exact_binom, 5}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_wmf(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/exact_binom.cpp b/src/exact_binom.cpp new file mode 100644 index 0000000..7a4fc0d --- /dev/null +++ b/src/exact_binom.cpp @@ -0,0 +1,50 @@ +#include +using namespace Rcpp; +#include // UINT_MAX +#include // std::abs +#include +using boost::math::binomial_distribution; + +//' @title Sample size for exact, one sample binomial test +//' @description Estimates sample size required to detect difference from a +//' constant proportion. +//' @param constant_prop The proportion under the null hypothesis. +//' @param effect_size Positive size of the difference between your null +//' hypothesis and the alternative hypothesis that you hope to detect. +//' **Heads-up** that values less than 1\% might take a while to calculate. +//' @param alpha Probability of rejecting the null hypothesis even though it is +//' true. +//' @param power Probability of rejecting the null hypothesis (getting a +//' significant result) when the real difference is equal to the minimum +//' effect size. +//' @param two_tail Whether to perform two-tail or one-tail power analysis. +//' `TRUE` (default) tests in both directions of difference. +//' @examples +//' exact_binom(0.75, 0.03) +//' @references [Power analysis](http://www.biostathandbook.com/power.html) and +//' [Exact test of goodness-of-fit](http://www.biostathandbook.com/exactgof.html) from +//' John H. McDonald's [_Handbook of Biological Statistics_](http://www.biostathandbook.com/) +//' @export +// [[Rcpp::export]] +unsigned int exact_binom(double constant_prop, double effect_size, double alpha = 0.05, double power = 0.8, bool two_tail = true) { + if (two_tail) { + alpha = alpha / 2; + } + unsigned int i = 10; + double beta = 1 - power; + bool end_condition = true; + do { + i += 1; + if (i == INT_MAX) { + break; + } + end_condition = (std::abs(cdf(binomial_distribution<>(i, constant_prop + effect_size), quantile(binomial_distribution<>(i, constant_prop), 1 - alpha)) - beta) / beta >= 0.01); + } while (end_condition); + return i; +} + +/*** R +exact_binom(0.75, 0.01, power = 0.9) +exact_binom(0.75, 0.03, power = 0.9) +exact_binom(0.75, 0.03, power = 0.9, two_tail = FALSE) +*/ diff --git a/tests/testthat/test-interleaved.R b/tests/testthat/test-interleaved.R new file mode 100644 index 0000000..cd44e05 --- /dev/null +++ b/tests/testthat/test-interleaved.R @@ -0,0 +1,33 @@ +context("Interleaved search results") + +test_data <- suppressMessages(lapply( + wmf:::fake_interleaved_data(n_sessions = 10, seed = 0), + function(dataset) { + return(dataset[dataset$event == "click", ]) + } +)) + +test_that("preference statistic", { + expect_equal( + interleaved_preference( + test_data$no_preference$session_id, + test_data$no_preference$ranking_function + ), + 0 + ) + expect_equal( + interleaved_preference( + test_data$a_preferred$session_id, + test_data$a_preferred$ranking_function + ), + 0.333, + tolerance = 0.001 + ) + expect_equal( + interleaved_preference( + test_data$b_preferred$session_id, + test_data$b_preferred$ranking_function + ), + -0.15 + ) +}) diff --git a/tests/testthat/test-power.R b/tests/testthat/test-power.R new file mode 100644 index 0000000..b70b1d3 --- /dev/null +++ b/tests/testthat/test-power.R @@ -0,0 +1,44 @@ +context("Sample size calculations") + +test_that("chisq_test_odds returns the appropriate estimates", { + expect_equal( + chisq_test_odds(odds_ratio = 2, p_treatment = 0.4, p_control = 0.25, power = 0.8, conf_level = 0.95), + 311 + ) + expect_equal(chisq_test_odds(p_treatment = 0.4, p_control = 0.25, power = 0.8), 311) + expect_equal( + unname(chisq_test_odds(p_treatment = 0.4, p_control = 0.25, power = c(0.8, 0.9, 0.95))), + c(311, 416, 514) + ) +}) + +test_that("chisq_test_odds returns errors when it should", { + expect_error(chisq_test_odds()) + expect_error(chisq_test_odds(2)) + expect_error(chisq_test_odds(odds_ratio = 2, power = 0.8)) +}) + +test_that("chisq_test_odds returns warnings when it should", { + expect_warning( + chisq_test_odds(p_treatment = 0.4, p_control = 0.25, power = 0.8, visualize = TRUE), + "All parameters known. Nothing to visualize." + ) +}) + +test_that("chisq_test returns the appropriate estimates", { + expect_equal(chisq_test_effect(0.3), 88) + expect_equal(chisq_test_effect(0.1, groups = 3, power = 0.95), 1545) + expect_equal(chisq_test_effect(), c("tiny" = 3140, "small" = 785, "medium" = 88, "large" = 32)) +}) + +test_that("chisq_test returns errors when it should", { + expect_error(chisq_test_effect(w = 0.01)) + expect_error(chisq_test_effect(w = 0.1, power = 0.001)) + expect_error(chisq_test_effect(w = 0.1, sig_level = 2)) +}) + +test_that("exact_binom calculates appropriate sample sizes", { + expect_equal(exact_binom(0.75, 0.03, alpha = 0.05, power = 0.9, two_tail = TRUE), 2105) + expect_equal(exact_binom(0.75, 0.03, alpha = 0.05, power = 0.9, two_tail = FALSE), 1716) + expect_equal(exact_binom(0.75, 0.01, alpha = 0.05, power = 0.9, two_tail = TRUE), 19394) +}) diff --git a/tests/testthat/test-queries.R b/tests/testthat/test-queries.R new file mode 100644 index 0000000..2b68cbf --- /dev/null +++ b/tests/testthat/test-queries.R @@ -0,0 +1,5 @@ +context("Queries") + +test_that("date clause", { + expect_equal(date_clause(as.Date("2017-08-01"))$date_clause, "WHERE year = 2017 AND month = 8 AND day = 1 ") +}) diff --git a/tests/testthat/test-syntax.R b/tests/testthat/test-syntax.R new file mode 100644 index 0000000..49f58fe --- /dev/null +++ b/tests/testthat/test-syntax.R @@ -0,0 +1,6 @@ +if (requireNamespace("lintr", quietly = TRUE)) { + context("Lints") + test_that("package style", { + lintr::expect_lint_free() + }) +} diff --git a/tests/testthat/testSampleSizeCalcs.R b/tests/testthat/testSampleSizeCalcs.R deleted file mode 100644 index 4b689bf..0000000 --- a/tests/testthat/testSampleSizeCalcs.R +++ /dev/null @@ -1,32 +0,0 @@ -context("Sample size calculations") - -test_that("sample_size_odds returns the appropriate estimates", { - expect_equal(sample_size_odds(odds_ratio = 2, p_treatment = 0.4, p_control = 0.25, power = 0.8, conf_level = 0.95), 311) - expect_equal(sample_size_odds(p_treatment = 0.4, p_control = 0.25, power = 0.8), 311) - expect_equal(unname(sample_size_odds(p_treatment = 0.4, p_control = 0.25, - power = c(0.8, 0.9, 0.95))), c(311, 416, 514)) -}) - -test_that("sample_size_odds returns errors when it should", { - expect_error(sample_size_odds()) - expect_error(sample_size_odds(2)) - expect_error(sample_size_odds(odds_ratio = 2, power = 0.8)) -}) - -test_that("sample_size_odds returns warnings when it should", { - expect_warning(sample_size_odds(p_treatment = 0.4, p_control = 0.25, power = 0.8, visualize = TRUE), - "All parameters known. Nothing to visualize.") -}) - -test_that("sample_size_effect returns the appropriate estimates", { - expect_equal(sample_size_effect(0.3), 88) - expect_equal(sample_size_effect(0.1, groups = 3, power = 0.95), 1545) - expect_equal(sample_size_effect(), - c("tiny" = 3140, "small" = 785, "medium" = 88, "large" = 32)) -}) - -test_that("sample_size_effect returns errors when it should", { - expect_error(sample_size_effect(w = 0.01)) - expect_error(sample_size_effect(w = 0.1, power = 0.001)) - expect_error(sample_size_effect(w = 0.1, sig_level = 2)) -}) diff --git a/wmf.Rproj b/wmf.Rproj index 7646086..f0d6187 100644 --- a/wmf.Rproj +++ b/wmf.Rproj @@ -1,20 +1,21 @@ Version: 1.0 RestoreWorkspace: Default SaveWorkspace: Default AlwaysSaveHistory: Default EnableCodeIndexing: Yes UseSpacesForTab: Yes NumSpacesForTab: 2 Encoding: UTF-8 RnwWeave: Sweave LaTeX: pdfLaTeX +AutoAppendNewline: Yes StripTrailingWhitespace: Yes BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source PackageRoxygenize: rd,collate,namespace,vignette