diff --git a/.gitignore b/.gitignore
index 33cd43d..807ea25 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,3 @@
-Meta
-doc
 .Rproj.user
 .Rhistory
 .RData
-inst/doc
diff --git a/.lintr b/.lintr
index 3a7145e..23d1437 100644
--- a/.lintr
+++ b/.lintr
@@ -1,5 +1 @@
-linters: with_defaults(line_length_linter(120), object_usage_linter = NULL, closed_curly_linter = NULL, open_curly_linter = NULL, spaces_left_parentheses_linter = NULL)
-exclusions: list("R/RcppExports.R", "inst/doc/interleaved.R")
-exclude: "# Exclude Linting"
-exclude_start: "# Begin Exclude Linting"
-exclude_end: "# End Exclude Linting"
+linters: with_defaults(line_length_linter(120), object_usage_linter = NULL, closed_curly_linter = NULL, open_curly_linter = NULL, spaces_left_parentheses_linter = NULL, cyclocomp_linter = NULL)
diff --git a/DESCRIPTION b/DESCRIPTION
index 51ced1f..ee0e4ed 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,51 +1,42 @@
 Package: wmf
 Type: Package
 Title: R Code for Wikimedia Foundation Internal Usage
-Version: 0.7.1
-Date: 2019-02-19
+Version: 0.8.0
+Date: 2019-12-03
 Authors@R: c(
     person("Mikhail", "Popov", email = "mikhail@wikimedia.org", role = c("aut", "cre")),
-    person("Oliver", "Keyes", role = "aut", comment = "No longer employed at the Foundation"),
-    person("Chelsy", "Xie", email = "cxie@wikimedia.org", role = "ctb")
+    person("Os", "Keyes", role = "aut", comment = "No longer employed at the Foundation"),
+    person("Chelsy", "Xie", role = "ctb", comment = "No longer employed at the Foundation"),
+    person(family = "Wikimedia Foundation", role = "cph")
   )
 Description: This package contains functions made for Analysts at Wikimedia
     Foundation, but can be used by people outside of the Foundation.
 License: MIT + file LICENSE
-URL: https://phabricator.wikimedia.org/diffusion/1821/
-BugReports: https://phabricator.wikimedia.org/maniphest/task/create/?
-    projects=Product-Analytics
+URL: https://gerrit.wikimedia.org/r/plugins/gitiles/wikimedia/discovery/wmf/
+BugReports: https://phabricator.wikimedia.org/maniphest/task/create/?projects=Product-Analytics&assigned=mpopov
 Depends: R (>= 3.1.0)
 Imports:
-    devtools,
     ggthemes (>= 3.4.0),
     ggplot2,
     jsonlite,
     lubridate,
     progress,
     purrr,
     pwr,
-    Rcpp (>= 0.10.3),
     readr,
     RMySQL,
     tibble,
     urltools,
     magrittr,
     zeallot
 Suggests:
-    lintr,
-    testthat,
-    knitr,
-    rmarkdown,
     dplyr,
+    knitr,
+    lintr (>= 2.0.0),
+    roxygen2 (>= 7.0.0),
+    testthat,
     tidyr
-LinkingTo:
-    BH,
-    Rcpp,
-    RcppArmadillo (>= 0.3.810)
-NeedsCompilation: yes
-SystemRequirements: C++11
 LazyData: TRUE
 Roxygen: list(markdown = TRUE)
 Encoding: UTF-8
-VignetteBuilder: knitr
-RoxygenNote: 6.1.1
+RoxygenNote: 7.0.2
diff --git a/NAMESPACE b/NAMESPACE
index 6bf5455..1f9dc48 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,52 +1,46 @@
 # Generated by roxygen2: do not edit by hand
 export(build_query)
 export(chisq_test_effect)
 export(chisq_test_odds)
 export(colors_accent)
 export(colors_base)
 export(colors_discrete)
 export(colors_utility)
 export(connection_details)
 export(date_clause)
 export(display_palettes)
-export(exact_binom)
 export(extract_ymd)
 export(from_log)
 export(from_mediawiki)
 export(geom_flat_violin)
 export(get_logfile)
 export(global_query)
-export(interleaved_bootstraps)
-export(interleaved_confint)
-export(interleaved_preference)
 export(invert_list)
 export(mysql_close)
 export(mysql_connect)
 export(mysql_disconnect)
 export(mysql_exists)
 export(mysql_read)
 export(mysql_write)
 export(null2na)
 export(parse_json)
 export(percent2)
 export(pretty_num)
 export(query_hive)
 export(read_sampled_log)
 export(refine_eventlogs)
 export(rewrite_conditional)
 export(set_proxies)
 export(theme_facet)
 export(theme_fivethirtynine)
 export(theme_min)
 export(to_log)
 export(to_mediawiki)
 export(update_shardmap)
 export(write_conditional)
 import(ggplot2)
 import(ggthemes)
-importFrom(Rcpp,sourceCpp)
 importFrom(pwr,pwr.chisq.test)
 importFrom(readr,read_tsv)
 importFrom(urltools,url_decode)
-useDynLib(wmf)
diff --git a/NEWS.md b/NEWS.md
index dcb7e4a..2da87be 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,85 +1,91 @@
+wmf 0.8.0
+=========
+* Factored out interleaving & exact binomial C++ code into the [ortiz](https://gerrit.wikimedia.org/r/plugins/gitiles/wikimedia/discovery/ortiz/) package
+* Updated DESCRIPTION
+* Fixed dependencies
+
 wmf 0.7.1
 =========
 * Updated for new sharded MariaDB replicas ([T212386](https://phabricator.wikimedia.org/T212386))
 * Updated for x1 replica (cf. [T172410#4965383](https://phabricator.wikimedia.org/T172410#4965383))
 
 wmf 0.6.0
 =========
 * Added color palettes based on [Wikimedia Design Style Guide](https://design.wikimedia.org/style-guide/)
 
 wmf 0.5.2
 =========
 * Added `use_beeline` argument in the `query_hive` function to use `beeline` instead of Hive CLI.
 
 wmf 0.5.0
 =========
 * Added `invert_list()` for inverting keys and values in named lists
 * Added formatting helpers: `percent2()` and `pretty_num()`
 * Added [David Robinson's `geom_flat_violin`](https://gist.github.com/dgrtwo/eb7750e74997891d7c20)
 
 wmf 0.4.0
 =========
 * Added `parse_json()`
 * Added `refine_eventlogs()`
 
 wmf 0.3.1
 =========
 * Switched host name from db1047.eqiad.wmnet to db1108.eqiad.wmnet per [T156844](https://phabricator.wikimedia.org/T156844)
 
 wmf 0.3.0
 =========
 * C++-based `exact_binomial()` to quickly estimate sample size for exact binomial tests
 * Functions for working with interleaved search results experiments
     * See `?interleaved` for details
     * See `vignette("interleaved", package = "wmf")` for an example
     * Requires a compiler that supports C++11
 * ggplot themes `theme_min()` and `theme_facet()`
 * Documentation updates
 * Syntax-checking unit test
 * MIT licensing
 
 wmf 0.2.7
 =========
 * Changes which host MySQL functions connect to, depending on the database:
     - "db1047.eqiad.wmnet" for event logging data from "log" db
     - "analytics-store.eqiad.wmnet" (same as before) for wiki content
 * See [T176639](https://phabricator.wikimedia.org/T176639) for more details.
 
 wmf 0.2.6
 =========
 * Adds support for more MySQL config filenames since those vary between the different machines
 * Smarter about choosing a config file
 
 wmf 0.2.5
 =========
 * Fixes Hive query execution to remove messages/warnings.
 
 wmf 0.2.4
 =========
 * Ungroups grouped data frames when rewriting. See [T146422](https://phabricator.wikimedia.org/T146422) for more details.
 
 wmf 0.2.3
 =========
 * Fixes ggplot2 theme margin bug [discovered & fixed](https://github.com/wikimedia/wikimedia-discovery-wmf/pull/1) by Oliver Keyes.
 
 wmf 0.2.2
 =========
 * Updates `query_hive()` to support [JAR path overriding](https://wikitech.wikimedia.org/wiki/Analytics/Cluster/Hive/QueryUsingUDF#Testing_changes_to_existing_udf)
 * Updates the MySQL config file path so the package can now also be used on stat1003
 * Updates maintainer contact info in README
 
 wmf 0.2.1
 =========
 * Adds a Contributor Code of Conduct
 
 wmf 0.2.0
 =========
 * Adds compatibility with RMySQL 0.9.4+
 
 wmf 0.1.1
 =========
 * Fix a bug in global_query
 
 wmf 0.1.0
 =========
 Initial release
diff --git a/R/RcppExports.R b/R/RcppExports.R
deleted file mode 100644
index 8d8f5a1..0000000
--- a/R/RcppExports.R
+++ /dev/null
@@ -1,100 +0,0 @@
-# Generated by using Rcpp::compileAttributes() -> do not edit by hand
-# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
-
-#' @title Sample size for exact, one sample binomial test
-#' @description Estimates sample size required to detect difference from a
-#' constant proportion.
-#' @param constant_prop The proportion under the null hypothesis.
-#' @param effect_size Positive size of the difference between your null
-#' hypothesis and the alternative hypothesis that you hope to detect.
-#' **Heads-up** that values less than 1\% might take a while to calculate.
-#' @param alpha Probability of rejecting the null hypothesis even though it is
-#' true.
-#' @param power Probability of rejecting the null hypothesis (getting a
-#' significant result) when the real difference is equal to the minimum
-#' effect size.
-#' @param two_tail Whether to perform two-tail or one-tail power analysis.
-#' `TRUE` (default) tests in both directions of difference.
-#' @examples
-#' exact_binom(0.75, 0.03)
-#' @references [Power analysis](http://www.biostathandbook.com/power.html) and
-#' [Exact test of goodness-of-fit](http://www.biostathandbook.com/exactgof.html) from
-#' John H. McDonald's [_Handbook of Biological Statistics_](http://www.biostathandbook.com/)
-#' @export
-exact_binom <- function(constant_prop, effect_size, alpha = 0.05, power = 0.8, two_tail = TRUE) {
-  .Call('_wmf_exact_binom', PACKAGE = 'wmf', constant_prop, effect_size, alpha, power, two_tail)
-}
-
-#' @param bootstraps number of times to sample unique sessions
-#' (with replacement); 1000 by default
-#' @examples
-#' data("interleaved_data", package = "wmf")
-#' x <- interleaved_data[interleaved_data$event == "click", ]
-#' x <- x[order(x$session_id, x$timestamp), ]
-#' data("interleaved_data_a", package = "wmf")
-#' y <- interleaved_data_a[interleaved_data_a$event == "click", ]
-#' y <- y[order(y$session_id, y$timestamp), ]
-#' data("interleaved_data_b", package = "wmf")
-#' z <- interleaved_data_b[interleaved_data_b$event == "click", ]
-#' z <- z[order(z$session_id, z$timestamp), ]
-#'
-#' # Bootstrapped preference statistics:
-#'
-#' ## Data without a clear preference:
-#' b <- interleaved_bootstraps(x$session_id, x$ranking_function)
-#' hist(b)
-#'
-#' ## Data where A is preferred over B:
-#' b <- interleaved_bootstraps(y$session_id, y$ranking_function)
-#' hist(b)
-#'
-#' ## Data where B is preferred over A:
-#' b <- interleaved_bootstraps(z$session_id, z$ranking_function)
-#' hist(b)
-#' @rdname interleaved
-#' @export
-interleaved_bootstraps <- function(sessions, clicks, bootstraps = 1000L) {
-  .Call('_wmf_interleaved_bootstraps', PACKAGE = 'wmf', sessions, clicks, bootstraps)
-}
-
-#' @param confidence level; 0.95 by default
-#' @examples
-#'
-#' # Preference statistic confidence intervals:
-#'
-#' ## Data without a clear preference:
-#' interleaved_confint(x$session_id, x$ranking_function)
-#'
-#' ## Data where A is preferred over B:
-#' interleaved_confint(y$session_id, y$ranking_function)
-#'
-#' ## Data where B is preferred over A:
-#' interleaved_confint(z$session_id, z$ranking_function)
-#' @rdname interleaved
-#' @export
-interleaved_confint <- function(sessions, clicks, bootstraps = 1000L, confidence = 0.95) {
-  .Call('_wmf_interleaved_confint', PACKAGE = 'wmf', sessions, clicks, bootstraps, confidence)
-}
-
-#' @param sessions vector of session IDs used to group `positions` and
-#' `ranking_functions`
-#' @param clicks vector that shows which ranking function the
-#' clicked search result came from ("A" or "B")
-#' @examples
-#'
-#' # Preference statistic calculation:
-#'
-#' ## Data without a clear preference:
-#' interleaved_preference(x$session_id, x$ranking_function)
-#'
-#' ## Data where A is preferred over B:
-#' interleaved_preference(y$session_id, y$ranking_function)
-#'
-#' ## Data where B is preferred over A:
-#' interleaved_preference(z$session_id, z$ranking_function)
-#' @rdname interleaved
-#' @export
-interleaved_preference <- function(sessions, clicks) {
-  .Call('_wmf_interleaved_preference', PACKAGE = 'wmf', sessions, clicks)
-}
-
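Per the 0.8.0 NEWS entry above, these exports moved wholesale into ortiz, so existing analyses need only a namespace swap. A minimal sketch, assuming ortiz kept the exported names and the bundled `interleaved_data*` datasets (neither is verified by this diff):

```R
# Assumption: ortiz re-exports interleaved_*() and the example datasets
library(ortiz)

data("interleaved_data", package = "ortiz")
x <- interleaved_data[interleaved_data$event == "click", ]
x <- x[order(x$session_id, x$timestamp), ]

interleaved_preference(x$session_id, x$ranking_function)  # preference statistic
interleaved_confint(x$session_id, x$ranking_function, bootstraps = 1000L)
```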
diff --git a/R/chisq_test.R b/R/chisq_test.R
index d0f4e50..59f7b7e 100644
--- a/R/chisq_test.R
+++ b/R/chisq_test.R
@@ -1,165 +1,168 @@
+# nolint start
 oddsRatio <- function(p_treatment, p_control) {
   return((p_treatment / (1 - p_treatment)) / (p_control / (1 - p_control)))
 }
 
 pTreatment <- function(p_control, odds_ratio) {
   return((odds_ratio * p_control) / ((p_control * (odds_ratio - 1)) + 1))
 }
 
 pControl <- function(p_treatment, odds_ratio) {
   return(1 / ((odds_ratio * ((1 / p_treatment) - 1)) + 1))
 }
+# nolint end
 
 #' @title Chi-square Test Sample Size Given Odds Ratio
 #' @description Calculates sample size for chi-squared test of independence
 #' given the odds ratio.
 #' @param odds_ratio The expected odds ratio. That is, the ratio of the odds of
 #' the outcome in the test group relative to the control group. Optional,
 #' but see *Details*.
 #' @param p_control Your guess for prevalence of outcome in the control group.
 #' Optional but see **Details**.
 #' @param p_treatment Your guess for prevalence of outcome in the test group.
 #' Optional but see **Details**.
 #' @param power The ability of the test to detect an effect where there is one.
 #' Power = 1 - Prob(Type 2 error). Optional. See **Value** for details.
-#' @param conf_level Desired confidence level. Defaults to 95\%.
+#' @param conf_level Desired confidence level. Defaults to 95%.
 #' @param sample_ratio Ratio of test group to control group. 1 is even split.
 #' @param visualize Whether to plot power or prevalence of outcome in the
 #' control group vs sample size. Can be used to help make a decision.
-#' @details The function only needs to know two of the following three:
-#' `odds_ratio`, `p_control`, and `p_treatment`. If given
-#' all three, it will check to make sure the odds ratio is correct. It
-#' will figure out the missing third value from the other two.
-#' @return If `power` was not provided, returns vector containing
-#' possible power values and the appropriate sample size for each \%.
-#' If all values were provided, returns a single sample size estimate.
+#' @section Details:
+#' The function only needs to know two of the following three: `odds_ratio`,
+#' `p_control`, and `p_treatment`. If given all three, it will check to make
+#' sure the odds ratio is correct. It will figure out the missing third value
+#' from the other two.
 #' @section References:
 #' Wang, H., Chow, S.-C., & Li, G. (2002). On sample size calculation based on
 #' odds ratio in clinical trials. *Journal of Biopharmaceutical
 #' Statistics*, **12**(4), 471-483.
 #' [doi:10.1081/BIP-120016231](http://doi.org/10.1081/BIP-120016231)
+#' @return If `power` was not provided, returns vector containing
+#' possible power values and the appropriate sample size for each %.
+#' If all values were provided, returns a single sample size estimate.
 #' @examples
 #' chisq_test_odds(p_treatment = 0.4, p_control = 0.25, power = 0.8)
 #' chisq_test_odds(odds_ratio = 2, p_control = 0.4, power = c(0.8, 0.9, 0.95))
 #' chisq_test_odds(odds_ratio = 2, p_control = 0.4)
 #' chisq_test_odds(odds_ratio = 2, p_control = 0.4, visualize = TRUE)
 #' @author Mikhail Popov
 #' @seealso [chisq_test_effect()]
 #' @export
 chisq_test_odds <- function(
   odds_ratio = NULL, p_control = NULL, p_treatment = NULL, power = NULL,
   conf_level = 0.95, sample_ratio = 1, visualize = FALSE
 ) {
   # Begin Exclude Linting
   # Checks
   power_missing <- is.null(power)
-  pC_missing <- is.null(p_control)
-  pT_missing <- is.null(p_treatment)
-  oR_missing <- is.null(odds_ratio)
-  if ( (oR_missing + pC_missing + pT_missing) > 1) {
+  prob_control_missing <- is.null(p_control)
+  prob_treatment_missing <- is.null(p_treatment)
+  odds_ratio_missing <- is.null(odds_ratio)
+  if ((odds_ratio_missing + prob_control_missing + prob_treatment_missing) > 1) {
     stop("Only one of {odds_ratio, p_control, p_treatment} can be missing.")
   }
 
   # Imputations (Part 1)
   if (power_missing) {
     power <- seq(0.5, 0.99, 0.01)
   }
 
   # Imputations (Part 2)
-  if (pC_missing) {
+  if (prob_control_missing) {
     p_control <- pControl(p_treatment, odds_ratio)
-  } else if (pT_missing) {
+  } else if (prob_treatment_missing) {
     p_treatment <- pTreatment(p_control, odds_ratio)
-  } else if (oR_missing) {
+  } else if (odds_ratio_missing) {
     odds_ratio <- oddsRatio(p_treatment, p_control)
   }
   # End Exclude Linting
 
   # Calculations
   x <- p_treatment * (1 - p_treatment) * sample_ratio
   y <- p_control * (1 - p_control)
   z_alpha <- stats::qnorm((1 - conf_level) / 2)
   z_beta <- stats::qnorm(1 - power)
   n_b <- (1 / x + 1 / y) * (((z_alpha + z_beta) ^ 2) / (log(odds_ratio) ^ 2))
   n_a <- sample_ratio * n_b
   n <- ceiling(n_a + n_b)
 
   # Visualization
   if (visualize) {
     if (power_missing || length(power) > 1) {
       graphics::plot(
         power, n, type = "l",
         main = "Sample size as function of statistical power",
         ylab = "N", xlab = "Power to detect effect",
         lwd = 2, xaxt = "n"
       )
       graphics::axis(
         side = 1, at = seq(0.5, 1, 0.1),
         labels = sprintf("%.0f%%", 100 * seq(0.5, 1, 0.1))
       )
       graphics::abline(v = seq(0.5, 1, 0.1), lty = "dotted", col = "lightgray", lwd = graphics::par("lwd"))
     } else {
       warning("All parameters known. Nothing to visualize.")
     }
   }
 
   # Output
   if (power_missing || length(power) > 1) {
     names(n) <- sprintf("%.0f%%", power * 100)
   }
   return(n)
 }
 
 #' @title Chi-square Test Sample Size Given Effect
 #' @description Uses Cohen's w for effect size to calculate sample size for
 #' a chi-squared test of independence.
 #' @param w Effect size you want the test to be able to detect. (Optional)
 #' @param groups Number of groups. Used in degrees of freedom calculation.
 #' Defaults to 2 (e.g. control group vs treatment group).
 #' @param sig_level Probability of Type 1 error. Usually called alpha.
 #' Defaults to 0.05.
 #' @param power Ability to detect the effect. (1 - probability of Type 2 error)
-#' Defaults to 80\%.
+#' Defaults to 80%.
 #' @return If `w` was not provided, returns a data frame containing
 #' possible values of w and the corresponding sample size estimates.
 #' @examples
 #' chisq_test_effect()
 #' chisq_test_effect(0.1)
 #' chisq_test_effect(w = 0.1, groups = 3, sig_level = 0.001, power = 0.9)
 #' @importFrom pwr pwr.chisq.test
 #' @author Mikhail Popov
 #' @seealso [chisq_test_odds()]
 #' @export
 chisq_test_effect <- function(
   w = NULL, groups = 2, sig_level = 0.05, power = 0.8
 ) {
   # Checks
   w_missing <- is.null(w)
   if (!w_missing && w <= 0.01) stop("'w' must be > 0.01")
   if (power <= 0.1 || power > 1.0) stop("'power' must be in (0.1, 1]")
 
   # Imputation
   if (w_missing) w <- c(0.05, 0.1, 0.3, 0.5)
 
   # Calculation and output
   if (length(w) > 1) {
     n <- ceiling(vapply(w, function(ww) {
       return(pwr::pwr.chisq.test(
         w = ww, N = NULL, df = groups - 1,
         sig.level = sig_level, power = power
       )$N)
     }, 0))
     names(n) <- c("tiny", "small", "medium", "large")
   } else {
     n <- ceiling(pwr::pwr.chisq.test(
       w = w, N = NULL, df = groups - 1,
       sig.level = sig_level, power = power
     )$N)
   }
   return(n)
 }
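Both helpers return either a single estimate or a named vector depending on whether `power` (or `w`) is supplied; a quick sketch of the two modes, using only arguments documented above:

```R
library(wmf)

# power omitted: named vector of sample sizes for 50%..99% power
n <- chisq_test_odds(odds_ratio = 2, p_control = 0.4)
n[c("80%", "90%")]

# w omitted: named vector for tiny/small/medium/large effect sizes
chisq_test_effect(groups = 2, sig_level = 0.05, power = 0.8)
```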
diff --git a/R/dataviz.R b/R/dataviz.R
index c005a8f..8607df2 100644
--- a/R/dataviz.R
+++ b/R/dataviz.R
@@ -1,176 +1,178 @@
 #' @title Theme inspired by fivethirtyeight.com plots
 #' @description A modification of [ggthemes::theme_fivethirtyeight()]
 #' @param base_size base font size
 #' @param base_family base font family
 #' @details Basically it adds axis titles (with some modification on the y to
 #' allow for long titles) back in and does a small amount of reduction of the
 #' overall plot size to avoid an absolute ton of extraneous spacing.
 #' @name FiveThirtyNine
 #' @rdname FiveThirtyNine
 #' @import ggplot2
 #' @import ggthemes
-#' @author Oliver Keyes
+#' @author Os Keyes
 #' @export
 theme_fivethirtynine <- function(base_size = 12, base_family = "sans") {
   theme_foundation(base_size = base_size, base_family = base_family) +
     theme(
       line = element_line(),
       rect = element_rect(
         fill = ggthemes::ggthemes_data$fivethirtyeight["ltgray"],
         linetype = 0, colour = NA),
       text = element_text(
         colour = ggthemes::ggthemes_data$fivethirtyeight["dkgray"],
         margin = ggplot2::margin(), debug = FALSE
       ),
       axis.title.y = element_text(
         size = rel(2), angle = 90, vjust = 1.5,
         margin = ggplot2::margin(0, 12), debug = FALSE
       ),
       axis.text = element_text(size = rel(1.5)),
       axis.title.x = element_text(size = rel(2), margin = ggplot2::margin(12), debug = FALSE),
       axis.ticks = element_blank(),
       axis.line = element_blank(),
       legend.background = element_rect(),
       legend.position = "bottom",
       legend.direction = "horizontal",
       legend.box = "vertical",
       panel.grid = element_line(colour = NULL),
       panel.grid.major = element_line(colour = ggthemes::ggthemes_data$fivethirtyeight["medgray"]),
       panel.grid.minor = element_blank(),
       plot.title = element_text(hjust = 0, size = rel(1.5), face = "bold", margin = ggplot2::margin(), debug = FALSE),
       strip.background = element_rect(),
       legend.text = element_text(size = 18),
       legend.title = element_text(size = rel(1.5), margin = ggplot2::margin(4), debug = FALSE),
       legend.key.size = unit(1, "in"),
       panel.background = element_rect(fill = "transparent", color = NA),
       plot.background = element_rect(fill = "transparent", color = NA)
     )
 }
 
 #' @title Simple theme for ggplots
 #' @description A minimal theme that puts the legend at the bottom.
 #' @param base_size font size
 #' @param base_family font family
 #' @param ... additional parameters to pass to `theme()`
 #' @author Mikhail Popov
 #' @export
 theme_min <- function(base_size = 12, base_family = "", ...) {
   ggplot2::theme_minimal(base_size, base_family) +
     ggplot2::theme(
       legend.position = "bottom",
       strip.placement = "outside",
       ...
     )
 }
 
 #' @title Simple theme for facet-ed ggplots
 #' @description A minimal theme that puts the legend at the bottom and puts the
 #' facet labels into gray boxes. The border around those can be disabled.
 #' @param base_size font size
 #' @param base_family font family
 #' @param border whether to add a border around facets
 #' @param clean_xaxis whether to remove ticks & labels from x-axis
 #' @param ... additional parameters to pass to `theme()`
 #' @author Mikhail Popov & Chelsy Xie
 #' @export
 theme_facet <- function(base_size = 12, base_family = "", border = TRUE, clean_xaxis = FALSE, ...) {
   theme <- theme_min(base_size, base_family, ...) +
     ggplot2::theme(strip.background = element_rect(fill = "gray90"))
   if (border) {
     theme <- theme +
       ggplot2::theme(panel.border = element_rect(color = "gray30", fill = NA))
   }
   if (clean_xaxis) {
     theme <- theme +
       ggplot2::theme(
         panel.grid.major.x = element_blank(),
         panel.grid.minor.x = element_blank(),
         axis.ticks.x = element_blank(),
         axis.text.x = element_blank()
       )
   }
   return(theme)
 }
 
 #' @title Flat violin plot
 #' @description Violin plots are a compact display of continuous distributions
 #' but are usually mirrored to mimic boxplots. The "flat" version removes
 #' that mirroring and makes the violin plots less...suggestive.
 #' @inheritParams ggplot2::geom_point
 #' @param trim If `TRUE` (default), trim the tails of the violins
 #' to the range of the data. If `FALSE`, don't trim the tails.
 #' @param geom,stat Use to override the default connection between
 #' `geom_violin` and `stat_ydensity`.
 #' @examples \dontrun{
 #' ggplot(diamonds, aes(cut, carat)) +
 #'   geom_flat_violin() +
 #'   coord_flip()
 #' }
 #' @author [David Robinson](https://github.com/dgrtwo)
 #' @source Gist: [dgrtwo/geom_flat_violin.R](https://gist.github.com/dgrtwo/eb7750e74997891d7c20)
 #' @rdname ggplot2-flatviolin
 #' @export
 geom_flat_violin <- function(mapping = NULL, data = NULL, stat = "ydensity",
                              position = "dodge", trim = TRUE, scale = "area",
-                             show.legend = NA, inherit.aes = TRUE, ...) {
+                             show.legend = NA, inherit.aes = TRUE, ...) { # nolint
   return(ggplot2::layer(
     data = data,
     mapping = mapping,
     stat = stat,
     geom = GeomFlatViolin,
     position = position,
     show.legend = show.legend,
     inherit.aes = inherit.aes,
     params = list(
       trim = trim,
       scale = scale,
       ...
     )
   ))
 }
 
+# nolint start
 "%||%" <- function(a, b) {
   if (!is.null(a)) {
     return(a)
   } else {
     return(b)
   }
 }
 
 #' @rdname ggplot2-flatviolin
 #' @format NULL
 #' @usage NULL
 GeomFlatViolin <- ggplot2::ggproto(
   "GeomFlatViolin",
   ggplot2::Geom,
   setup_data = function(data, params) {
     data$width <- data$width %||% params$width %||% (resolution(data$x, FALSE) * 0.9)
     # ymin, ymax, xmin, and xmax define the bounding rectangle for each group
     return(dplyr::mutate(
       dplyr::group_by(data, group),
       ymin = min(y), ymax = max(y), xmin = x, xmax = x + width / 2
     ))
   },
   draw_group = function(data, panel_scales, coord) {
     # Find the points for the line to go all the way around
     data <- transform(data, xminv = x, xmaxv = x + violinwidth * (xmax - x))
     # Make sure it's sorted properly to draw the outline
     newdata <- rbind(
       plyr::arrange(transform(data, x = xminv), y),
       plyr::arrange(transform(data, x = xmaxv), -y)
     )
     # Close the polygon: set first and last point the same
     # Needed for coord_polar and such
     newdata <- rbind(newdata, newdata[1, ])
     return(ggplot2:::ggname("geom_flat_violin", ggplot2::GeomPolygon$draw_panel(newdata, panel_scales, coord)))
   },
   draw_key = draw_key_polygon,
   default_aes = aes(weight = 1, colour = "grey20", fill = "white", size = 0.5, alpha = NA, linetype = "solid"),
   required_aes = c("x", "y")
 )
+# nolint end
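The flat-violin geom and the facet theme compose like any other ggplot2 layers; a small sketch using ggplot2's built-in diamonds data:

```R
library(ggplot2)
library(wmf)

# Half ("flat") violins of carat by cut, faceted with the package theme:
ggplot(diamonds, aes(cut, carat)) +
  geom_flat_violin(trim = FALSE) +
  coord_flip() +
  facet_wrap(~ color) +
  theme_facet(border = FALSE)
```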
diff --git a/R/global.R b/R/global.R
index 16e540c..fbf7037 100644
--- a/R/global.R
+++ b/R/global.R
@@ -1,42 +1,42 @@
 #' @title Global SQL queries for analytics-store.eqiad.wmnet
 #' @description `global_query` is a simple wrapper around the MySQL queries
 #' that allows a useR to send a query to all production dbs on
 #' analytics-store.eqiad.wmnet, joining the results from each query into a
 #' single object.
 #' @param query the SQL query you want to run
 #' @param project_type what class of wiki (e.g. "wikisource", "wiktionary")
 #' you want to run against. Set to "all" by default.
-#' @author Oliver Keyes
+#' @author Os Keyes
 #' @seealso [mysql_read()] for querying an individual db, [from_mediawiki()]
 #' for converting MediaWiki timestamps into `POSIXlt` timestamps, or
 #' [query_hive()] for accessing the Hive datastore
 #' @export
 global_query <- function(query, project_type = "all") {
   # Construct the query
   if (!project_type == "all") {
     info_query <- paste("SELECT wiki FROM wiki_info WHERE code = '", project_type, "'", sep = "")
   } else {
     info_query <- "SELECT wiki FROM wiki_info"
   }
   # Run query
   wikis <- mysql_read(query = info_query, database = "staging")$wiki
   # Instantiate progress bar and note environment
   pb <- progress::progress_bar$new(total = length(wikis))
   # Retrieve data
   data <- lapply(wikis, function(x, query) {
     # Retrieve the data
     data <- mysql_read(query = query, database = x)
     if (nrow(data) > 0) {
       data$project <- x # Add the wiki
     } else {
       data <- NULL
     }
     # Increment the progress bar
     pb$tick()
     # Return
     return(data)
   }, query = query)
   cat("\n")
   # Bind it into a single object and return
   return(do.call(what = "rbind", args = data))
 }
diff --git a/R/hive.R b/R/hive.R
index 6b3477c..b8ffc86 100644
--- a/R/hive.R
+++ b/R/hive.R
@@ -1,90 +1,92 @@
 #' @title Query Hadoop cluster with Hive
 #' @description Queries Hive
 #' @param query a Hive query
 #' @param override_jars A logical flag indicating whether to override the path.
 #' Hive on WMF's analytics machine(s) loads some JARs by default, so if your
 #' query uses an updated version of an existing UDF and you want to load the
 #' JAR that you built yourself, set this to `TRUE`. See
-#' [Testing changes to existing UDF](https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive/QueryUsingUDF#Testing_changes_to_existing_udf)
+#' [Testing changes to existing UDF](https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive/QueryUsingUDF#Testing_changes_to_existing_udf) # nolint
 #' for more details.
 #' @param heap_size `HADOOP_HEAPSIZE`; default is 1024 (alt: 2048 or 4096)
 #' @param use_nice Whether to use `nice` for less greedy CPU usage in a multi-user environment. The default is `TRUE`.
 #' @param use_ionice Whether to use `ionice` for less greedy I/O in a multi-user environment. The default is `TRUE`.
 #' @param use_beeline Whether to use `beeline` to connect with Hive instead of `hive`. The default is `FALSE`.
 #' @section Escaping:
 #' `hive_query` works by running the query you provide through the CLI via a
 #' [system()] call. As a result, single escapes for meaningful characters
 #' (such as quotes) within the query will not work: R will interpret them
 #' only as escaping that character /within R/. Double escaping (\\\) is thus
 #' necessary, in the same way that it is for regular expressions.
 #' @return A `data.frame` containing the results of the query, or a `TRUE` if
 #' the user has chosen to write straight to file.
+# nolint start
 #' @section Handling our hadoop/hive setup:
 #' The `webrequests` table is documented [on Wikitech](https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive),
 #' which also provides [a set of example queries](https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive/Queries). When it comes to manipulating the rows with Java before they get to you, Nuria has written a
 #' [brief tutorial on loading UDFs](https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive/QueryUsingUDF)
 #' which should help if you want to engage in that.
+# nolint end
 #' @seealso [lubridate::ymd_hms()] for converting the "dt" column in the
 #' webrequests table to proper datetime, and [mysql_read()] and
 #' [global_query()] for querying our MySQL databases
 #' @examples
 #' \dontrun{
 #' query_hive("USE wmf; DESCRIBE webrequest;")
 #' }
 #' @export
 query_hive <- function(query, override_jars = FALSE, heap_size = 1024,
                        use_nice = TRUE, use_ionice = TRUE, use_beeline = FALSE) {
   # Write query out to tempfile and create tempfile for results.
   query_dump <- tempfile()
   cat(query, file = query_dump)
   results_dump <- tempfile()
   filters <- paste0(
     c("", paste("grep -v", c("JAVA_TOOL_OPTIONS", "parquet.hadoop", "WARN:", ":WARN"))),
     collapse = " | "
   )
   # Query and read in the results
   try({
     system(paste0(
       "export HADOOP_HEAPSIZE=", heap_size, " && ",
       ifelse(use_nice, "nice ", ""),
       ifelse(use_ionice, "ionice ", ""),
       ifelse(use_beeline, " beeline --silent=true ", "hive -S "),
       ifelse(override_jars, "--hiveconf hive.aux.jars.path= ", ""),
       "-f ", query_dump, " 2> /dev/null", filters, " > ", results_dump
     ))
     results <- utils::read.delim(results_dump, sep = "\t", quote = "", as.is = TRUE, header = TRUE)
   })
   # Clean up and return
   file.remove(query_dump, results_dump)
   stop_on_empty(results)
   return(results)
 }
 
 #' @title Generate a Date Clause for a Hive query
 #' @description What it says on the tin; generates a
 #' `WHERE year = foo AND month = bar`
 #' that can then be combined with other elements to form a Hive query.
 #' @param date if `NULL`, yesterday will be used
 #' @return a list containing two elements: "date_clause" and "date"; the
 #' returning of the date allows you to include it
 #' @seealso [extract_ymd()]
 #' @export
 date_clause <- function(date) {
   warning("Deprecated; recommended to use `c(year, month, day) %<-% wmf::extract_ymd(date)` instead")
   if (is.null(date)) {
     date <- Sys.Date() - 1
   }
   fragment <- sprintf(
     "WHERE year = %s AND month = %s AND day = %s ",
     lubridate::year(date), lubridate::month(date), lubridate::mday(date)
   )
   output <- list(date_clause = fragment, date = date)
   return(output)
 }
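A sketch of the `query_hive()` flag touched above plus the recommended replacement for the deprecated `date_clause()`. Both assume an R session on a WMF analytics host; `extract_ymd()` is exported by the package (per NAMESPACE) but its definition isn't shown in this diff, so its usage here follows the deprecation message:

```R
library(wmf)

# use_beeline was added in 0.5.2 as an alternative to the Hive CLI:
result <- query_hive("USE wmf; DESCRIBE webrequest;", use_beeline = TRUE)

# Instead of date_clause(), destructure year/month/day with zeallot:
library(zeallot)
c(year, month, day) %<-% extract_ymd(Sys.Date() - 1)
sprintf("WHERE year = %s AND month = %s AND day = %s", year, month, day)
```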
diff --git a/R/interleaved.R b/R/interleaved.R
deleted file mode 100644
index 4342d38..0000000
--- a/R/interleaved.R
+++ /dev/null
@@ -1,116 +0,0 @@
-#' @title Interleaved search results
-#' @description Tools for analysis of experiments that use interleaved search
-#' results wherein users receive results from multiple sets of retrieval
-#' functions.
-#' - `interleaved_data` is a fake dataset used for testing and examples;
-#'   refer to **Format** section below
-#' - `interleaved_data_a` is a fake dataset used for testing and examples;
-#'   "A" is preferred over "B"
-#' - `interleaved_data_b` is a fake dataset used for testing and examples;
-#'   "B" is preferred over "A"
-#' - `interleaved_preference` returns a test statistic summarizing the
-#'   interleaving experiment; a positive value indicates that A is better
-#'   than B, a negative value indicates that B is better than A
-#' - `interleaved_bootstraps` returns a bootstrapped sample of preference
-#'   statistics computed by resampling sessions with replacements
-#' - `interleaved_confint` returns a `list` with elements "point.est",
-#'   "lower", and "upper" (uses `interleaved_bootstraps` internally)
-#' - `interleaved_sample_size` estimates the sample size required to detect
-#'   a particular effect size with a specified power and significance level
-#' @references
-#' - Chapelle, O., Joachims, T., Radlinski, F., & Yue, Y. (2012). Large-scale
-#'   validation and analysis of interleaved search evaluation.
-#'   *ACM Transactions on Information Systems*, **30**(1), 1-41.
-#'   [doi:10.1145/2094072.2094078](https://doi.org/10.1145/2094072.2094078)
-#' - Radlinski, F. and Craswell, N. (2013). [Optimized interleaving for online retrieval evaluation](https://www.microsoft.com/en-us/research/publication/optimized-interleaving-for-online-retrieval-evaluation/).
-#'   *ACM International Conference on Web Search and Data Mining (WSDM)*.
-#'   [doi:10.1145/2433396.2433429](https://doi.org/10.1145/2433396.2433429)
-#' @name interleaved
-NULL
-
-fake_interleaved_data <- function(dev = FALSE, n_sessions = 1000, seed = 0) {
-  set.seed(seed)
-  fake_timestamps <- function(n) {
-    return(as.POSIXct(
-      stats::runif(n, 0, 60 * 10),
-      origin = "2018-08-01 00:00:00",
-      tz = "UTC"
-    ))
-  }
-  fake_session <- function(preference = NULL) {
-    n_events <- sample.int(10, 1)
-    if (n_events == 1) {
-      return(data.frame(
-        session_id = paste0(sample(c(letters, 0:9), 10), collapse = ""),
-        timestamp = fake_timestamps(1),
-        event = "serp",
-        position = as.numeric(NA),
-        ranking_function = as.character(NA),
-        stringsAsFactors = FALSE
-      ))
-    } else {
-      if (is.null(preference)) {
-        probability <- c(0.5, 0.5)
-      } else if (preference == "A") {
-        probability <- c(0.75, 0.25)
-      } else {
-        probability <- c(0.25, 0.75)
-      }
-      df <- data.frame(
-        session_id = rep_len(paste0(sample(c(letters, 0:9), 10), collapse = ""), n_events),
-        timestamp = sort(fake_timestamps(n_events), decreasing = FALSE),
-        event = c("serp", rep_len("click", n_events - 1)),
-        position = c(NA, sample.int(20, n_events - 1, replace = FALSE)),
-        ranking_function = c(NA, sample(c("A", "B"), n_events - 1, replace = TRUE, prob = probability)),
-        stringsAsFactors = FALSE
-      )
-      if (n_events %in% c(3, 5, 7, 9) && stats::rbinom(1, 1, 0.005) == 1) {
-        df$ranking_function[df$event == "click"] <- rep_len(c("A", "B"), n_events - 1)
-      }
-      return(df)
-    }
-  }
-  message("Generating unbiased data...")
-  interleaved_data <- do.call(rbind, replicate(n_sessions, fake_session(), simplify = FALSE))
-  if (dev) {
-    devtools::use_data(interleaved_data, overwrite = TRUE)
-  }
-  message("Generating A-biased data...")
-  interleaved_data_a <- do.call(rbind, replicate(n_sessions, fake_session("A"), simplify = FALSE))
-  if (dev) {
-    devtools::use_data(interleaved_data_a, overwrite = TRUE)
-  }
-  message("Generating B-biased data...")
-  interleaved_data_b <- do.call(rbind, replicate(n_sessions, fake_session("B"), simplify = FALSE))
-  if (dev) {
-    devtools::use_data(interleaved_data_b, overwrite = TRUE)
-  }
-  if (!dev) {
-    return(list(
-      no_preference = interleaved_data,
-      a_preferred = interleaved_data_a,
-      b_preferred = interleaved_data_b
-    ))
-  }
-}
-
-#' @format `interleaved_data*` are `data.frame`-s of generated search sessions with
-#' the following columns:
-#' \describe{
-#'   \item{session_id}{10-character alphanumeric ID; for grouping events}
-#'   \item{timestamp}{when the event occurred; uses [POSIXct][base::DateTimeClasses] format}
-#'   \item{event}{"serp" or "click"}
-#'   \item{position}{position ("ranking") of the clicked search result}
-#'   \item{ranking_function}{"A" or "B"}
-#' }
-#' Users in `interleaved_data` have no preference, users in
-#' `interleaved_data_a` have preference for ranking function "A", and users
-#' in `interleaved_data_b` have preference for ranking function "B".
-#' @rdname interleaved
-"interleaved_data"
-
-#' @rdname interleaved
-"interleaved_data_a"
-
-#' @rdname interleaved
-"interleaved_data_b"
diff --git a/R/logs.R b/R/logs.R
index 0d5ff1c..b36e0fe 100644
--- a/R/logs.R
+++ b/R/logs.R
@@ -1,151 +1,151 @@
 parse_date <- function(date) {
   return(gsub(x = date, pattern = "-", replacement = ""))
 }
 
 #' @title Retrieve a vector of sampled log files
 #' @description Grabs sampled log files to be piped into [read_sampled_log()].
 #' By default this retrieves all sampled log files; it can be used to
 #' retrieve a particular date range of files through the "earliest" and
 #' "latest" arguments.
 #' @param earliest a `Date` object. Set to `NULL` by default, which triggers
 #' the retrieval of all log file names.
 #' @param latest a `Date` object; set to `NULL` by default. In the event that
 #' `earliest` is set but `latest` is not, the files retrieved will span from
 #' `earliest` to the current date; in the event that both arguments are set,
 #' the retrieved files will be those in that range.
 #' @return A vector of filenames that can be passed into [read_sampled_log()]
-#' @author Oliver Keyes
+#' @author Os Keyes
 #' @export
-get_logfile <- function(earliest = NULL, latest = NULL){
+get_logfile <- function(earliest = NULL, latest = NULL) {
   # Begin Exclude Linting
   files <- list.files("/a/squid/archive/sampled", full.names = TRUE, pattern = "gz$")
   # End Exclude Linting
   if (!is.null(earliest)) {
     file_dates <- as.numeric(substring(files, 47, 55))
     if (!is.null(latest)) {
       files <- files[file_dates >= as.numeric(parse_date(earliest)) & file_dates <= as.numeric(parse_date(latest))]
     } else {
       files <- files[file_dates >= as.numeric(parse_date(earliest))]
     }
   }
   return(files)
 }
 
 #' @title Read a sampled log file
 #' @description Reads a sampled log file identified with [get_logfile()].
 #' The sampled logs are returned as a data.frame with 16 columns - see
 #' the **Value** documentation.
 #' @param file a filename, retrieved with [get_logfile()]
 #' @param transparent a logical flag whether to gunzip the log file explicitly
 #' first (default) or read it in directly.
 #' @param nrows number of rows to read in; *optional*
 #' @return a `data.frame` containing 16 columns:
 #' - squid
 #' - sequence_no
 #' - timestamp
 #' - servicetime
 #' - ip_address
 #' - status_code
 #' - reply_size
 #' - request_method
 #' - url
 #' - squid_status
 #' - mime_type
 #' - referer
 #' - x_forwarded
 #' - user_agent
 #' - lang
 #' - x_analytics
 #' @importFrom urltools url_decode
-#' @author Oliver Keyes
+#' @author Os Keyes
 #' @export
-read_sampled_log <- function(file, transparent = FALSE, nrows = NULL){
+read_sampled_log <- function(file, transparent = FALSE, nrows = NULL) {
   is_gzipped <- grepl("gz$", file)
   if (is_gzipped) {
     # gzipped log file
     if (transparent) {
       # read the file in directly w/o gunzipping first
       output_file <- file
     } else {
       output_file <- tempfile()
       system(paste("gunzip -c", file, ">", output_file))
     }
   } else {
     # an already gunzipped log file
     output_file <- file
   }
   if (is.null(nrows)) {
     nrows <- -1
   }
   data <- utils::read.delim(
     output_file, as.is = TRUE, quote = "", nrows = nrows,
     col.names = c(
       "squid", "sequence_no", "timestamp", "servicetime", "ip_address",
       "status_code", "reply_size", "request_method", "url", "squid_status",
       "mime_type", "referer", "x_forwarded", "user_agent", "lang", "x_analytics"
     )
   )
   if (is_gzipped && !transparent) {
     file.remove(output_file)
   }
   data$url <- urltools::url_decode(data$url)
   data$referer <- urltools::url_decode(data$referer)
   return(data)
 }
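The two log helpers chain together; a sketch that assumes the sampled logs under /a/squid/archive/sampled are readable from the current host:

```R
library(wmf)

# Filenames for one week of sampled logs, then the first 1000 rows of one:
files <- get_logfile(earliest = as.Date("2015-01-01"), latest = as.Date("2015-01-07"))
logs <- read_sampled_log(files[1], nrows = 1000)
table(logs$status_code)
```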
 #' @title Refine EventLogging data
 #' @description Converts date-time and JSON columns, removes "event_" prefix
 #' from column names, and returns `tibble`s.
 #' @param el_data EventLogging data
 #' @param dt_cols character vector of timestamp and date-time column names to
 #' parse; can also be a named list of parsing functions to apply on a
 #' per-column basis, as [lubridate::ymd_hms()] is used by default
 #' @param json_cols character vector of JSON-containing column names that need
 #' to be parsed; can also be a named list of parsing functions to apply on a
 #' per-column basis, as [parse_json()] is used by default
 #' @return A `tibble` (see [tibble::`tibble-package`] for more info)
 #' @author Mikhail Popov
 #' @export
 refine_eventlogs <- function(el_data, dt_cols = NULL, json_cols = NULL) {
   el_data <- tibble::as_tibble(el_data)
   # Check column specifications and construct parsers if needed:
   if (!is.null(dt_cols)) {
     if (is.list(dt_cols)) {
       per_col_dt <- all(vapply(dt_cols, is.function, TRUE))
       if (!per_col_dt && any(vapply(dt_cols, is.function, TRUE))) {
         stop("You have an incomplete 'dt_cols' (not all columns are assigned a date-time parser)")
       }
     } else {
       per_col_dt <- FALSE
     }
     # Parse date-time columns:
     if (!per_col_dt) {
       dt_cols <- setNames(replicate(length(dt_cols), lubridate::ymd_hms), dt_cols)
     }
     for (dt_col in names(dt_cols)) {
       el_data[[dt_col]] <- dt_cols[[dt_col]](el_data[[dt_col]])
     }
   }
   if (!is.null(json_cols)) {
     if (is.list(json_cols)) {
       per_col_json <- all(vapply(json_cols, is.function, TRUE))
       if (!per_col_json && any(vapply(json_cols, is.function, TRUE))) {
         stop("You have an incomplete 'json_cols' (not all columns are assigned a JSON parser)")
       }
     } else {
       per_col_json <- FALSE
     }
     # Parse JSON-containing columns:
     if (!per_col_json) {
       json_cols <- setNames(replicate(length(json_cols), parse_json), json_cols)
     }
     for (json_col in names(json_cols)) {
       el_data[[json_col]] <- json_cols[[json_col]](el_data[[json_col]])
     }
   }
   names(el_data) <- sub("^event_", "", names(el_data))
   return(el_data)
 }
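The per-column parser lists work as documented above; a sketch with a hypothetical EventLogging extract (the `el_data` object and the `event_userAgent` column name are made up for illustration):

```R
library(wmf)

refined <- refine_eventlogs(
  el_data,
  dt_cols = c("timestamp"),                       # parsed with lubridate::ymd_hms()
  json_cols = list(event_userAgent = parse_json)  # per-column parser
)
names(refined)  # "event_" prefixes are gone
```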
diff --git a/R/mysql.R b/R/mysql.R
index 28bf2df..db05e90 100644
--- a/R/mysql.R
+++ b/R/mysql.R
@@ -1,308 +1,312 @@
 # Ensure that we recognise and error on 0 rows
 stop_on_empty <- function(data) {
   if (nrow(data) == 0) {
     stop("No rows were returned from the database")
   }
   return(invisible())
 }
 
 # Check if the host machine is a remote WMF machine
 # e.g. stat100* or notebook100* as opposed to local
 is_wmnet <- function() {
   suppressWarnings(domain <- system("hostname -d", intern = TRUE))
   if (length(domain) == 0) {
     return(FALSE)
   } else {
     return(grepl("\\.wmnet$", domain))
   }
 }
 
 #' @title Update shard map
 #' @description Fetches the latest DB configuration from WMF's MediaWiki
 #' Configuration for mapping
 #' @param dev logical flag; if true, will write to inst/extdata; updates the
 #' installed map otherwise
 #' @export
 update_shardmap <- function(dev = FALSE) {
   # Begin Exclude Linting
   message("reading wmf's mediawiki db config")
-  url <- "https://phabricator.wikimedia.org/source/mediawiki-config/browse/master/wmf-config/db-eqiad.php?as=source&blame=off&view=raw"
+  url <- "https://phabricator.wikimedia.org/source/mediawiki-config/browse/master/wmf-config/db-eqiad.php?as=source&blame=off&view=raw" # nolint
   db_config <- readLines(url)
+  # nolint start
   # find where the shard config starts & ends:
   sectionsByDB_start <- which(grepl("'sectionsByDB' => [", db_config, fixed = TRUE))
   sectionsByDB_end <- which(grepl("],", db_config, fixed = TRUE))
   sectionsByDB_end <- min(sectionsByDB_end[sectionsByDB_end > sectionsByDB_start])
   # extract & parse the relevant data:
   sectionsByDB <- gsub("[\t ',]", "", db_config[(sectionsByDB_start + 1):(sectionsByDB_end - 1)])
   sectionsByDB <- sectionsByDB[sectionsByDB != "" & !grepl("^#", sectionsByDB)]
   sectionsByDB <- strsplit(sectionsByDB, "=>")
   sections_by_db <- purrr::map_dfr(
     sectionsByDB[purrr::map_lgl(sectionsByDB, ~ .x[2] != "wikitech")],
     ~ tibble::tibble(dbname = .x[1], shard = sub("^s([0-9]+)$", "\\1", .x[2]))
   )
+  # nolint end
   # s3 is the default shard for other wikis (as of 2019-02-13)
   if (dev) {
     file_path <- "inst/extdata/sections_by_db.csv"
   } else {
     file_path <- system.file("extdata", "sections_by_db.csv", package = "wmf")
   }
   message("saving sectionsByDB to ", file_path)
   write.csv(sections_by_db, file_path)
   # End Exclude Linting
 }
 
 #' @title Connection details
 #' @description Figure out connection details (host name and port) based on
 #' database name.
 #' @param dbname e.g. "enwiki"; can be a vector of multiple database names
 #' @param use_x1 logical flag; use if querying an extension-related table that
 #' is hosted on x1 (e.g. `echo_*` tables); default `FALSE`
 #' @return a named `list` of `list(host, port)`s
+# nolint start
 #' @references [wikitech:Data_access#MariaDB_replicas](https://wikitech.wikimedia.org/wiki/Analytics/Data_access#MariaDB_replicas)
+# nolint end
 #' @export
 connection_details <- function(dbname, use_x1 = FALSE) {
   # 331 + the digit of the section in case of sX.
   # Example: s5 will be accessible to s5-analytics-replica.eqiad.wmnet:3315
   # 3320 for x1. Example: x1-analytics-replica.eqiad.wmnet:3320
   # 3350 for staging
   shardmap <- system.file("extdata", "sections_by_db.csv", package = "wmf")
   if (file.exists(shardmap)) {
     sections_by_db <- read.csv(shardmap)
   } else {
     stop("no shard map found; use update_shardmap() to download latest shard mapping")
   }
   shards <- purrr::map(purrr::set_names(dbname, dbname), function(db) {
     if (use_x1) {
       return(list(host = "x1-analytics-replica.eqiad.wmnet", port = 3320))
     } else {
       if (db %in% sections_by_db$dbname) {
         shard <- sections_by_db$shard[sections_by_db$dbname == db]
       } else {
         shard <- 3
       }
       return(list(
         host = sprintf("s%i-analytics-replica.eqiad.wmnet", shard),
         port = as.numeric(sprintf("331%i", shard))
       ))
     }
   })
   return(shards)
 }
 
 #' @title Work with MySQL databases
 #' @description Read from, write to, and check data from the MySQL databases and
 #' tables in the Wikimedia cluster. Assumes the presence of a validly
 #' formatted configuration file.
 #' @param query SQL query
 #' @param database name of the database to query; *optional* if passing a `con`
 #' @param use_x1 logical flag; use if querying an extension-related table that
 #' is hosted on x1 (e.g. `echo_*` tables); default `FALSE`
 #' @param hostname name of the machine to connect to, which depends on whether
 #' `query` is used to fetch from the **log** `database` (in which case
 #' connect to "db1108.eqiad.wmnet") or a MediaWiki ("content") DB, in which
 #' case [connection_details()] is used to return the appropriate shard host
 #' name and port based on the stored mapping (use [update_shardmap()] prior
 #' to make sure the latest mapping is used)
 #' @param con MySQL connection returned by [mysql_connect()]; *optional* -- if
 #' not provided, a temporary connection will be opened up
 #' @param table_name name of a table to check for the existence of or create,
 #' depending on the function
 #' @param default_file name of a config file containing username and password
 #' to use when connecting
 #' @examples \dontrun{
 #' # Connection details (which shard to connect to) are fetched automatically:
 #' mysql_read("SELECT * FROM image LIMIT 100", "commonswiki")
 #' mysql_read("SELECT * FROM wbc_entity_usage LIMIT 100", "wikidatawiki")
 #'
 #' # Echo extension tables are on the x1 host:
 #' mysql_read("SELECT *
 #'   FROM echo_event
 #'   LEFT JOIN echo_notification
 #'   ON echo_event.event_id = echo_notification.notification_event
 #'   LIMIT 10;",
 #'   "enwiki", use_x1 = TRUE)
 #'
 #' # If querying multiple databases in the same shard
 #' # a shared connection may be used:
 #' con <- mysql_connect("frwiki")
 #' results <- purrr::map(
 #'   c("frwiki", "jawiki"),
 #'   mysql_read,
 #'   query = "SELECT...",
 #'   con = con
 #' )
 #' mysql_disconnect(con)
 #' }
 #' @name mysql
 #' @rdname mysql
 #' @seealso [query_hive()] or [global_query()]
 #' @export
 mysql_connect <- function(
   database, use_x1 = FALSE, default_file = NULL,
   hostname = NULL, port = NULL
 ) {
   if (is.null(hostname)) {
     if (database == "log") {
       if (use_x1) stop("using x1 does not make sense when connecting to 'log' db")
       hostname <- "db1108.eqiad.wmnet"
       if (is.null(port)) port <- 3306
     } else {
       con_deets <- connection_details(database, use_x1 = use_x1)[[database]]
       hostname <- con_deets$host
       if (is.null(port)) port <- con_deets$port
     }
   }
   # Begin Exclude Linting
   if (is.null(default_file)) {
     possible_cnfs <- c(
       "discovery-stats-client.cnf",    # on stat1005
       "statistics-private-client.cnf", # on stat1005
       "analytics-research-client.cnf", # on stat1005
       "stats-research-client.cnf",     # on stat1006 and also on stat1005
       "research-client.cnf"            # on notebook1001
     )
     for (cnf in file.path("/etc/mysql/conf.d", possible_cnfs)) {
       if (file.exists(cnf)) {
         default_file <- cnf
         break
       }
     }
     if (is.null(default_file)) {
       if (dir.exists("/etc/mysql/conf.d")) {
         cnfs <- dir("/etc/mysql/conf.d", pattern = "*.cnf")
         if (length(cnfs) == 0) {
           stop("no credentials found in mysql conf dir")
         } else {
           stop(
             "didn't find any of the specified credentials (",
             paste0(possible_cnfs, collapse = ", "), ")"
           )
         }
       } else {
         stop("no configuration directory for mysql credentials")
       }
     }
   }
   con <- RMySQL::dbConnect(
     drv = RMySQL::MySQL(),
     host = hostname,
     port = port,
     dbname = database,
     default.file = default_file
   )
   # End Exclude Linting
   return(con)
 }
 
 #' @rdname mysql
 #' @export
 mysql_read <- function(query, database = NULL, use_x1 = NULL, con = NULL) {
   already_connected <- !is.null(con)
   if (!already_connected && !is.null(database)) {
     # Open a temporary connection to the db:
     if (is.null(use_x1)) use_x1 <- FALSE
     con <- mysql_connect(database, use_x1 = use_x1)
   }
   # Begin Exclude Linting
   to_fetch <- RMySQL::dbSendQuery(con, query)
   data <- RMySQL::fetch(to_fetch, -1)
   message(sprintf("Fetched %.0f rows and %.0f columns.", nrow(data), ncol(data)))
   RMySQL::dbClearResult(RMySQL::dbListResults(con)[[1]])
   # End Exclude Linting
   if (!already_connected) {
     # Close temporary connection:
     mysql_close(con)
   }
   stop_on_empty(data)
   return(data)
 }
 
 #' @rdname mysql
 #' @export
 mysql_exists <- function(database, table_name, use_x1 = NULL, con = NULL) {
   already_connected <- !is.null(con)
   if (!already_connected) {
     # Open a temporary connection to the db:
     if (is.null(use_x1)) use_x1 <- FALSE
     con <- mysql_connect(database, use_x1 = use_x1)
   }
   # Grab the results and close off:
   # Begin Exclude Linting
   table_exists <- RMySQL::dbExistsTable(conn = con, name = table_name)
   # End Exclude Linting
   if (!already_connected) {
     # Close temporary connection:
     mysql_close(con)
   }
   # Return
   return(table_exists)
 }
 
 #' @param x a `data.frame` to write
 #' @param ... additional arguments to pass to `dbWriteTable`
 #' @rdname mysql
 #' @export
-mysql_write <- function(x, database, table_name, use_x1 = NULL, con = NULL, ...){
+mysql_write <- function(x, database, table_name, use_x1 = NULL, con = NULL, ...) {
   already_connected <- !is.null(con)
   if (!already_connected) {
     # Open a temporary connection to the db:
     if (is.null(use_x1)) use_x1 <- FALSE
     con <- mysql_connect(database, use_x1 = use_x1)
   }
   # Write:
   # Begin Exclude Linting
   result <- RMySQL::dbWriteTable(
     conn = con,
     name = table_name,
     value = x,
     row.names = FALSE,
     ...
   )
   # End Exclude Linting
   if (!already_connected) {
     # Close temporary connection:
     mysql_close(con)
   }
   # Return the success/failure:
   return(result)
 }
 
 #' @rdname mysql
 #' @export
 mysql_close <- function(con) {
   # Begin Exclude Linting
   RMySQL::dbDisconnect(con)
   # End Exclude Linting
   return(invisible())
 }
 
 #' @rdname mysql
 #' @export
 mysql_disconnect <- function(con) {
   mysql_close(con)
 }
 
 #' @title Builds a MySQL query aimed at the EventLogging-centric formats
 #' @description Constructs a MySQL query with a conditional around date.
 #' This is aimed at eventlogging, where the date/time is always "timestamp".
 #' @param fields the `SELECT` statement
 #' @param table the table to use
 #' @param date the date to restrict to. If `NULL`, yesterday will be used
 #' @param conditionals other conditions to include in the `WHERE` statement
 #' @export
 build_query <- function(fields, table, date = NULL, conditionals = NULL) {
   # Ensure we have a date and deconstruct it into a MW-friendly format
   if (is.null(date)) {
     date <- Sys.Date() - 1
   }
   date <- gsub(x = date, pattern = "-", replacement = "")
   # Build the query proper (this will work for EL schemas where the field is always 'timestamp')
   query <- paste0(
     fields, " FROM ", table,
     " WHERE LEFT(timestamp, 8) = '", date, "'",
     ifelse(is.null(conditionals), "", " AND "), conditionals
   )
   results <- mysql_read(query, "log")
   stop_on_empty(results)
   return(results)
 }
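`build_query()` just assembles the SELECT around the EventLogging timestamp convention and hands it to `mysql_read()` against the "log" db; a sketch, assuming cluster access (the schema table name is hypothetical):

```R
library(wmf)

# fields supplies the SELECT clause verbatim; the table name is made up
build_query(
  fields = "SELECT event_action, COUNT(*) AS events",
  table = "SomeSchema_12345678",
  date = Sys.Date() - 1,
  conditionals = "wiki = 'enwiki' GROUP BY event_action"
)
```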
diff --git a/R/palettes.R b/R/palettes.R
index beaf009..6398da9 100644
--- a/R/palettes.R
+++ b/R/palettes.R
@@ -1,166 +1,166 @@
 #' @title Wikimedia Design Color Palettes
 #' @description The [color palette](https://design.wikimedia.org/style-guide/visual-style_colors.html)
 #' represents our character and brings a hint of freshness to our products.
 #' Use `display_palettes()` to view the palettes and the names associated with
 #' the various colors in them.
 #' @param n number of colors (varies by palette)
 #' @section Base colors:
 #' Base colors define the content surface and the main color for content.
 #' Different shades of paper and ink are useful to emphasize or de-emphasize
 #' different content areas.
 #'
 #' Base colors range from pure white (Base100) to true black (Base0).
 #' Intermediate shades of gray include a tint of blue for greater harmony with
 #' our accent colors.
 #'
 #' When applying text on a surface, you need to check the
 #' [color contrast](http://webaim.org/resources/contrastchecker/) between the
 #' text and the background:
 #' - Base100...50 are safe text colors for a black surface.
 #' - Base30...0 are safe text colors for a white surface.
 #' @section Accent colors:
 #' Accent colors are used to emphasize actions and to highlight key information.
 #' Blue is a natural choice in our context, where it has been the default color
 #' used for links and conveys the idea of action.
 #'
 #' There are three shades provided for when you need a lighter (Accent90),
 #' regular (Accent50) or a darker (Accent10) version of blue.
 #'
 #' Accent50 is suitable to use for text and as background. When used for link
 #' text, this color provides sufficient contrast with black text. When used as
 #' background, it provides sufficient contrast with white text.
 #' @section Utility colors:
 #' Utility colors are another type of accent color. Common meanings are
 #' associated with them. We use shades of red, green, and yellow as utility
 #' colors.
 #' @source [Visual Style: Colors](https://design.wikimedia.org/style-guide/visual-style_colors.html)
 #' @rdname palettes
 #' @name Palettes
 NULL
 
 #' @rdname palettes
 #' @export
 colors_base <- function(n = 9) {
   if (n > 9) stop("only 9 base colors available")
   colors <- c(
     "Base100" = "#ffffff",
     "Base90" = "#f8f9fa",
     "Base80" = "#eaecf0",
     "Base70" = "#c8ccd1",
     "Base50" = "#a2a9b1",
     "Base30" = "#72777d",
     "Base20" = "#54595d",
     "Base10" = "#222222",
     "Base0" = "#000000"
   )
   return(colors[unique(floor(seq.int(1, 9, length.out = n)))])
 }
 
 #' @rdname palettes
 #' @export
 colors_accent <- function(n = 3) {
   if (n > 3) stop("only 3 accent colors available")
   colors <- c(
     "Accent50" = "#3366cc",
     "Accent10" = "#2a4b8d",
     "Accent90" = "#eaf3ff"
   )
-  return(colors[1:n])
+  return(colors[seq_len(n)])
 }
 
 #' @rdname palettes
 #' @export
 colors_utility <- function(n = 9) {
   if (n > 9) stop("only 9 utility colors available")
   colors <- c(
     "Red90" = "#fee7e6",
     "Red50" = "#dd3333",
     "Red30" = "#b32424",
     "Green90" = "#d5fdf4",
     "Green50" = "#00af89",
     "Green30" = "#14866d",
     "Yellow90" = "#fef6e7",
     "Yellow50" = "#ffcc33",
     "Yellow30" = "#ac6600"
   )
   if (n < 4) {
-    return(colors[c("Red50", "Green50", "Yellow50")][1:n])
+    return(colors[c("Red50", "Green50", "Yellow50")][seq_len(n)])
   } else if (n > 3 && n <= 6) {
     return(colors[c(
       "Red90", "Red30",
       "Green90", "Green30",
       "Yellow50", "Yellow30"
-    )][1:n])
+    )][seq_len(n)])
   } else {
-    return(colors[1:n])
+    return(colors[seq_len(n)])
   }
 }
 
 #' @rdname palettes
 #' @export
 colors_discrete <- function(n = 8) {
   if (n > 8) stop("only 8 discrete colors available")
   colors <- c(
     "Red50" = "#dd3333",
     "Red30" = "#b32424",
     "Green50" = "#00af89",
     "Green30" = "#14866d",
     "Accent50" = "#3366cc",
     "Accent10" = "#2a4b8d",
     "Yellow50" = "#ffcc33",
     "Yellow30" = "#ac6600"
   )
   if (n < 5) {
-    return(colors[c("Red50", "Green50", "Accent50", "Yellow50")][1:n])
+    return(colors[c("Red50", "Green50", "Accent50", "Yellow50")][seq_len(n)])
   } else {
-    return(colors[1:n])
+    return(colors[seq_len(n)])
   }
 }
 
 #' @rdname palettes
 #' @import ggplot2
 #' @export
 display_palettes <- function() {
   colors <- list(
     `colors_base()` = colors_base(),
     `colors_accent()` = colors_accent(),
     `colors_utility()` = colors_utility(),
     `colors_discrete()` = colors_discrete()
   )
   colors <- purrr::map_dfr(
     colors,
     ~ data.frame(
       name = names(.x),
       color = unname(.x),
-      n = 1:length(.x),
+      n = seq_len(length(.x)),
       stringsAsFactors = FALSE),
     .id = "palette"
   )
   color_map <- colors$color
   names(color_map) <- colors$name
   ggplot(colors, aes(x = n, y = 0)) +
     geom_point(size = 4, color = "black") +
     geom_point(aes(color = name), size = 3) +
     geom_label(aes(label = name), hjust = "left", nudge_y = 0.1, label.size = 0) +
     scale_y_continuous(limits = c(-0.5, 1)) +
     scale_x_reverse(breaks = 1:9, minor_breaks = NULL, limits = c(10, 0)) +
     scale_color_manual(values = color_map, guide = FALSE) +
     coord_flip() +
     facet_wrap(~ palette) +
     theme_min(base_size = 14) +
     theme(
       panel.grid.major.x = element_blank(),
       panel.grid.minor.x = element_blank(),
       axis.text.x = element_blank(),
       axis.ticks.x = element_blank(),
       strip.background = element_rect(fill = "#3366cc"),
       strip.text = element_text(color = "white"),
       plot.caption = element_text(hjust = 0)
     ) +
     labs(
       title = "Color palettes",
       subtitle = "see ?Palettes for details",
       caption = "Based on Wikimedia Design Style Guide (https://design.wikimedia.org/style-guide/)",
       y = NULL, x = "Index in palette"
     )
 }
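The palette helpers return named hex vectors, so manual ggplot2 scales want `unname()` when mapping by position rather than by name; a small sketch:

```R
library(ggplot2)
library(wmf)

ggplot(mpg, aes(class, fill = drv)) +
  geom_bar() +
  scale_fill_manual(values = unname(colors_discrete(3))) +
  theme_min()
```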
diff --git a/R/proxies.R b/R/proxies.R
index bcaeaf8..b426fbb 100644
--- a/R/proxies.R
+++ b/R/proxies.R
@@ -1,16 +1,8 @@
 #' @title Set HTTP and HTTPS proxies
 #' @description Sets the HTTP and HTTPS proxies when running R on
 #' Wikimedia machines.
-#' @examples \dontrun{
-#' # This will fail in the cluster
-#' devtools::install_github("ironholds/urltools")
-#'
-#' # This will work
-#' set_proxies()
-#' devtools::install_github("ironholds/urltools")
-#' }
 #' @export
 set_proxies <- function() {
   Sys.setenv("http_proxy" = "http://webproxy.eqiad.wmnet:8080")
   Sys.setenv("https_proxy" = "http://webproxy.eqiad.wmnet:8080")
 }
diff --git a/R/time.R b/R/time.R
index 5259d68..de42153 100644
--- a/R/time.R
+++ b/R/time.R
@@ -1,31 +1,31 @@
 #' @title Convert to and from common timestamp formats
 #' @description Convert to and from MediaWiki and request log timestamp formats
 #' @param x a vector of timestamps
 #' @name timeconverters
 #' @rdname timeconverters
 #' @examples
 #' from_mediawiki("20150101010301")
-#' @author Oliver Keyes
+#' @author Os Keyes
 #' @seealso [lubridate::ymd_hms()]
 #' @export
 from_mediawiki <- function(x) {
   return(strptime(substr(x, 0, 14), format = "%Y%m%d%H%M%S", tz = "UTC"))
 }
 
 #' @rdname timeconverters
 #' @export
 from_log <- function(x) {
   return(strptime(substr(iconv(x, to = "UTF-8"), 0, 19), format = "%Y-%m-%dT%H:%M:%S", tz = "UTC"))
 }
 
 #' @rdname timeconverters
 #' @export
 to_mediawiki <- function(x) {
   gsub(x = x, pattern = "(:| |-)", replacement = "")
 }
 
 #' @rdname timeconverters
 #' @export
 to_log <- function(x) {
   gsub(x = x, pattern = " ", replacement = "T")
 }
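The converters round-trip cleanly between the formats; for example:

```R
library(wmf)

ts <- from_mediawiki("20150101010301")  # POSIXlt in UTC
to_mediawiki(ts)                        # "20150101010301" again
to_log("2015-01-01 01:03:01")           # "2015-01-01T01:03:01"
from_log("2015-01-01T01:03:01")         # back to POSIXlt
```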
- `set_proxies` to set http(s) proxies on the analytics cluster - `global_query` for querying all of our MySQL databases - Utilities for working with logs, including EventLogging data: - `from_mediawiki` and `from_log` (and corresponding `to_*` functions) to convert between time formats - `refine_eventlogs` - parses date-time columns and JSON columns (via `parse_json`) - removes the "event_" prefix from column names - `query_hive` for querying our Hadoop cluster via Hive +- `mysql_read` for querying our MariaDB databases + - uses automatic shard detection, see `?connection_details` for more info - Sample size calculations: - `chisq_test_odds` estimates sample size for a chi-squared test given an odds ratio - `chisq_test_effect` estimates sample size for a chi-squared test given Cohen's *w* - Functions for estimating preference of ranking functions using clicks on interleaved search results: - `interleaved_preference` estimates preference; see vignette for details - `interleaved_bootstraps` resamples sessions with replacement to yield bootstrapped sample of preferences - `interleaved_confint` uses `interleaved_bootstraps` and `stats::quantile` to yield a bootstrapped confidence interval Also includes [Wikimedia Design visual style colors](https://design.wikimedia.org/style-guide/visual-style_colors.html): ![Color palettes included in the package based on Wikimedia Design Style Guide](palettes.png) ## Installation -This package requires compilation with a compiler that supports [C++11](https://en.wikipedia.org/wiki/C%2B%2B11). `g++-5` and `clang++` 3.3 have (near-)complete C++11 support. `g++-6` and `g++-7` are pretty common on Linux and if you have the most recent version of Command Line Tools for Xcode (via `xcode-select --install`) for macOS, you should have `clang++` 5.0.0 (or later), which includes full C++11 support. - ```R # install.packages("remotes", repos = c(CRAN = "https://cran.rstudio.com/")) -remotes::install_git("https://gerrit.wikimedia.org/r/wikimedia/discovery/wmf", build_vignettes = TRUE) +remotes::install_git("https://gerrit.wikimedia.org/r/wikimedia/discovery/wmf") # Alternatively, you can install from GitHub mirror: -remotes::install_github("wikimedia/wikimedia-discovery-wmf", build_vignettes = TRUE) +remotes::install_github("wikimedia/wikimedia-discovery-wmf") ``` To update: `remotes::update_packages("wmf")` ## Maintainers - [Mikhail Popov](https://meta.wikimedia.org/wiki/User:MPopov_(WMF)) -- [Chelsy Xie](https://meta.wikimedia.org/wiki/User:CXie_(WMF)) ## Additional Information Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms. 
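As a quick orientation, here is a small, hypothetical usage sketch of a few of the helpers listed above (the SQL and database name are placeholders, and `mysql_read` needs access to the analytics cluster, so that call is left commented out):

```R
library(wmf)

# Round-trip a MediaWiki timestamp:
ts <- from_mediawiki("20190101123000")       # POSIXlt in UTC
to_mediawiki(format(ts, "%Y-%m-%d %H:%M:%S"))  # back to "20190101123000"

# Query a MariaDB replica; the host & port are resolved automatically
# from the database name (see ?connection_details):
# mysql_read("SELECT COUNT(*) AS pages FROM page", "enwiki")
```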
diff --git a/data/interleaved_data.rda b/data/interleaved_data.rda deleted file mode 100644 index 1a9e203..0000000 Binary files a/data/interleaved_data.rda and /dev/null differ diff --git a/data/interleaved_data_a.rda b/data/interleaved_data_a.rda deleted file mode 100644 index eaa786c..0000000 Binary files a/data/interleaved_data_a.rda and /dev/null differ diff --git a/data/interleaved_data_b.rda b/data/interleaved_data_b.rda deleted file mode 100644 index 99ce25e..0000000 Binary files a/data/interleaved_data_b.rda and /dev/null differ diff --git a/inst/extdata/sections_by_db.csv b/inst/extdata/sections_by_db.csv index 0e87900..ffa2855 100644 --- a/inst/extdata/sections_by_db.csv +++ b/inst/extdata/sections_by_db.csv @@ -1,44 +1,46 @@ "","dbname","shard" -"1","enwiki",1 -"2","bgwiki",2 -"3","bgwiktionary",2 -"4","cswiki",2 -"5","enwikiquote",2 -"6","enwiktionary",2 -"7","eowiki",2 -"8","fiwiki",2 -"9","idwiki",2 -"10","itwiki",2 -"11","nlwiki",2 -"12","nowiki",2 -"13","plwiki",2 -"14","ptwiki",2 -"15","svwiki",2 -"16","thwiki",2 -"17","trwiki",2 -"18","zhwiki",2 -"19","commonswiki",4 -"20","testcommonswiki",4 -"21","cebwiki",5 -"22","dewiki",5 -"23","enwikivoyage",5 -"24","mgwiktionary",5 -"25","shwiki",5 -"26","srwiki",5 -"27","frwiki",6 -"28","jawiki",6 -"29","ruwiki",6 -"30","eswiki",7 -"31","huwiki",7 -"32","hewiki",7 -"33","ukwiki",7 -"34","frwiktionary",7 -"35","metawiki",7 -"36","arwiki",7 -"37","centralauth",7 -"38","cawiki",7 -"39","viwiki",7 -"40","fawiki",7 -"41","rowiki",7 -"42","kowiki",7 -"43","wikidatawiki",8 +"1","enwiki","1" +"2","bgwiki","2" +"3","bgwiktionary","2" +"4","cswiki","2" +"5","enwikiquote","2" +"6","enwiktionary","2" +"7","eowiki","2" +"8","fiwiki","2" +"9","idwiki","2" +"10","itwiki","2" +"11","nlwiki","2" +"12","nowiki","2" +"13","plwiki","2" +"14","ptwiki","2" +"15","svwiki","2" +"16","thwiki","2" +"17","trwiki","2" +"18","zhwiki","2" +"19","commonswiki","4" +"20","testcommonswiki","4" +"21","cebwiki","5" +"22","dewiki","5" +"23","enwikivoyage","5" +"24","mgwiktionary","5" +"25","shwiki","5" +"26","srwiki","5" +"27","frwiki","6" +"28","jawiki","6" +"29","ruwiki","6" +"30","eswiki","7" +"31","huwiki","7" +"32","hewiki","7" +"33","ukwiki","7" +"34","frwiktionary","7" +"35","metawiki","7" +"36","arwiki","7" +"37","centralauth","7" +"38","cawiki","7" +"39","viwiki","7" +"40","fawiki","7" +"41","rowiki","7" +"42","kowiki","7" +"43","wikidatawiki","8" +"44","labswiki","10" +"45","labtestwiki","11" diff --git a/man/FiveThirtyNine.Rd b/man/FiveThirtyNine.Rd index 0282bfd..69980cc 100644 --- a/man/FiveThirtyNine.Rd +++ b/man/FiveThirtyNine.Rd @@ -1,25 +1,25 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/dataviz.R \name{FiveThirtyNine} \alias{FiveThirtyNine} \alias{theme_fivethirtynine} \title{Theme inspired by fivethirtyeight.com plots} \usage{ theme_fivethirtynine(base_size = 12, base_family = "sans") } \arguments{ \item{base_size}{base font size} \item{base_family}{base font family} } \description{ A modification of \code{\link[ggthemes:theme_fivethirtyeight]{ggthemes::theme_fivethirtyeight()}} } \details{ Basically it adds axis titles (with some modification on the y to allow for long titles) back in and does a small amount of reduction of the overall plot size to avoid an absolute ton of extraneous spacing. 
} \author{ -Oliver Keyes +Os Keyes } diff --git a/man/chisq_test_effect.Rd b/man/chisq_test_effect.Rd index 1240c3f..1101bf7 100644 --- a/man/chisq_test_effect.Rd +++ b/man/chisq_test_effect.Rd @@ -1,40 +1,39 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/chisq_test.R \name{chisq_test_effect} \alias{chisq_test_effect} \title{Chi-square Test Sample Size Given Effect} \usage{ -chisq_test_effect(w = NULL, groups = 2, sig_level = 0.05, - power = 0.8) +chisq_test_effect(w = NULL, groups = 2, sig_level = 0.05, power = 0.8) } \arguments{ \item{w}{Effect size you want the test to be able to detect. (Optional)} \item{groups}{Number of groups. Used in degrees of freedom calculation. Defaults to 2 (e.g. control group vs treatment group).} \item{sig_level}{Probability of Type 1 error. Usually called alpha. Defaults to 0.05.} \item{power}{Ability to detect the effect. (1 - probability of Type 2 error) Defaults to 80\%.} } \value{ If \code{w} was not provided, returns a data frame containing possible values of w and the corresponding sample size estimates. } \description{ Uses Cohen's w for effect size to calculate sample size for a chi-squared test of independence. } \examples{ chisq_test_effect() chisq_test_effect(0.1) chisq_test_effect(w = 0.1, groups = 3, sig_level = 0.001, power = 0.9) } \seealso{ \code{\link[=chisq_test_odds]{chisq_test_odds()}} } \author{ Mikhail Popov } diff --git a/man/chisq_test_odds.Rd b/man/chisq_test_odds.Rd index 0daa6de..547b31b 100644 --- a/man/chisq_test_odds.Rd +++ b/man/chisq_test_odds.Rd @@ -1,66 +1,74 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/chisq_test.R \name{chisq_test_odds} \alias{chisq_test_odds} \title{Chi-square Test Sample Size Given Odds Ratio} \usage{ -chisq_test_odds(odds_ratio = NULL, p_control = NULL, - p_treatment = NULL, power = NULL, conf_level = 0.95, - sample_ratio = 1, visualize = FALSE) +chisq_test_odds( + odds_ratio = NULL, + p_control = NULL, + p_treatment = NULL, + power = NULL, + conf_level = 0.95, + sample_ratio = 1, + visualize = FALSE +) } \arguments{ \item{odds_ratio}{The expected odds ratio. That is, the ratio of the odds of the outcome in the test group relative to the control group. Optional, but see \emph{Details}.} \item{p_control}{Your guess for prevalence of outcome in the control group. Optional but see \strong{Details}.} \item{p_treatment}{Your guess for prevalence of outcome in the test group. Optional but see \strong{Details}.} \item{power}{The ability of the test to detect an effect where there is one. Power = 1 - Prob(Type 2 error). Optional. See \strong{Value} for details.} \item{conf_level}{Desired confidence level. Defaults to 95\%.} \item{sample_ratio}{Ratio of test group to control group. 1 is even split.} \item{visualize}{Whether to plot power or prevalence of outcome in the control group vs sample size. Can be used to help make a decision.} } \value{ If \code{power} was not provided, returns vector containing possible power values and the appropriate sample size for each \%. If all values were provided, returns a single sample size estimate. } \description{ Calculates sample size for chi-squared test of independence given the odds ratio. } -\details{ -The function only needs to know two of the following three: -\code{odds_ratio}, \code{p_control}, and \code{p_treatment}. If given -all three, it will check to make sure the odds ratio is correct. It -will figure out the missing third value from the other two. 
+\section{Details}{ + +The function only needs to know two of the following three: \code{odds_ratio}, +\code{p_control}, and \code{p_treatment}. If given all three, it will check to make +sure the odds ratio is correct. It will figure out the missing third value +from the other two. } + \section{References}{ Wang, H., Chow, S.-C., & Li, G. (2002). On sample size calculation based on odds ratio in clinical trials. \emph{Journal of Biopharmaceutical Statistics}, \strong{12}(4), 471-483. \href{http://doi.org/10.1081/BIP-120016231}{doi:10.1081/BIP-120016231} } \examples{ chisq_test_odds(p_treatment = 0.4, p_control = 0.25, power = 0.8) chisq_test_odds(odds_ratio = 2, p_control = 0.4, power = c(0.8, 0.9, 0.95)) chisq_test_odds(odds_ratio = 2, p_control = 0.4) chisq_test_odds(odds_ratio = 2, p_control = 0.4, visualize = TRUE) } \seealso{ \code{\link[=chisq_test_effect]{chisq_test_effect()}} } \author{ Mikhail Popov } diff --git a/man/connection_details.Rd b/man/connection_details.Rd index 6b261b2..02256a3 100644 --- a/man/connection_details.Rd +++ b/man/connection_details.Rd @@ -1,24 +1,24 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/mysql.R \name{connection_details} \alias{connection_details} \title{Connection details} \usage{ connection_details(dbname, use_x1 = FALSE) } \arguments{ \item{dbname}{e.g. "enwiki"; can be a vector of multiple database names} \item{use_x1}{logical flag; use if querying an extension-related table that -is hosted on x1 (e.g. \code{echo_*} tables); default \code{FALSE}} +is hosted on x1 (e.g. \verb{echo_*} tables); default \code{FALSE}} } \value{ a named \code{list} of \code{list(host, port)}s } \description{ Figure out connection details (host name and port) based on database name. } \references{ \href{https://wikitech.wikimedia.org/wiki/Analytics/Data_access#MariaDB_replicas}{wikitech:Data_access#MariaDB_replicas} } diff --git a/man/date_clause.Rd b/man/date_clause.Rd index 76b250c..07f8029 100644 --- a/man/date_clause.Rd +++ b/man/date_clause.Rd @@ -1,23 +1,23 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/hive.R \name{date_clause} \alias{date_clause} \title{Generate a Date Clause for a Hive query} \usage{ date_clause(date) } \arguments{ \item{date}{if \code{NULL}, yesterday will be used} } \value{ a list containing two elements: "date_clause" and "date"; the returning of the date allows you to include it } \description{ What it says on the tin; generates a -\code{WHERE year = foo AND month = bar} +\verb{WHERE year = foo AND month = bar} that can then be combined with other elements to form a Hive query. } \seealso{ \code{\link[=extract_ymd]{extract_ymd()}} } diff --git a/man/exact_binom.Rd b/man/exact_binom.Rd deleted file mode 100644 index 70cdcff..0000000 --- a/man/exact_binom.Rd +++ /dev/null @@ -1,38 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RcppExports.R -\name{exact_binom} -\alias{exact_binom} -\title{Sample size for exact, one sample binomial test} -\usage{ -exact_binom(constant_prop, effect_size, alpha = 0.05, power = 0.8, - two_tail = TRUE) -} -\arguments{ -\item{constant_prop}{The proportion under the null hypothesis.} - -\item{effect_size}{Positive size of the difference between your null -hypothesis and the alternative hypothesis that you hope to detect. 
-\strong{Heads-up} that values less than 1\% might take a while to calculate.} - -\item{alpha}{Probability of rejecting the null hypothesis even though it is -true.} - -\item{power}{Probability of rejecting the null hypothesis (getting a -significant result) when the real difference is equal to the minimum -effect size.} - -\item{two_tail}{Whether to perform two-tail or one-tail power analysis. -\code{TRUE} (default) tests in both directions of difference.} -} -\description{ -Estimates sample size required to detect difference from a -constant proportion. -} -\examples{ -exact_binom(0.75, 0.03) -} -\references{ -\href{http://www.biostathandbook.com/power.html}{Power analysis} and -\href{http://www.biostathandbook.com/exactgof.html}{Exact test of goodness-of-fit} from -John H. McDonald's \href{http://www.biostathandbook.com/}{Handbook of Biological Statistics} -} diff --git a/man/formatters.Rd b/man/formatters.Rd index fd7efe9..51d7463 100644 --- a/man/formatters.Rd +++ b/man/formatters.Rd @@ -1,27 +1,27 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/formatters.R \name{Formatters} \alias{Formatters} \alias{percent2} \alias{pretty_num} \title{Formatters} \usage{ percent2(x, digits = 1, add_plus = FALSE) pretty_num(x, ...) } \arguments{ \item{x}{A vector to format} \item{...}{Additional parameters to pass to \code{\link[base:prettyNum]{base::prettyNum()}}} } \description{ Formatting utilities } \details{ \itemize{ -\item \code{percent2}: multiply by one hundred, display percent sign, clear "NA%"'s, +\item \code{percent2}: multiply by one hundred, display percent sign, clear "NA\%"'s, and optionally prepend a "+" to positive percentages \item \code{pretty_num}: shortcut to formatting 1e6 as 1,000,000 } } diff --git a/man/get_logfile.Rd b/man/get_logfile.Rd index 02cb790..adad6e1 100644 --- a/man/get_logfile.Rd +++ b/man/get_logfile.Rd @@ -1,29 +1,29 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/logs.R \name{get_logfile} \alias{get_logfile} \title{Retrieve a vector of sampled log files} \usage{ get_logfile(earliest = NULL, latest = NULL) } \arguments{ \item{earliest}{a \code{Date} object. Set to \code{NULL} by default, which triggers the retrieval of all log file names.} \item{latest}{a \code{Date} object; set to \code{NULL} by default. In the event that \code{earliest} is set but \code{latest} is not, the files retrieved will span from \code{earliest} to the current date; in the event that both arguments are set, the retrieved files will be those in that range.} } \value{ A vector of filenames that can be passed into \code{\link[=read_sampled_log]{read_sampled_log()}} } \description{ Grabs sampled log files to be piped into \code{\link[=read_sampled_log]{read_sampled_log()}}. By default this retrieves all sampled log files; it can be used to retrieve a particular date range of files through the "earliest" and "latest" arguments. 
} \author{ -Oliver Keyes +Os Keyes } diff --git a/man/ggplot2-flatviolin.Rd b/man/ggplot2-flatviolin.Rd index b4d1f0e..f023b54 100644 --- a/man/ggplot2-flatviolin.Rd +++ b/man/ggplot2-flatviolin.Rd @@ -1,76 +1,85 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/dataviz.R \docType{data} \name{geom_flat_violin} \alias{geom_flat_violin} \alias{GeomFlatViolin} \title{Flat violin plot} \source{ Gist: \href{https://gist.github.com/dgrtwo/eb7750e74997891d7c20}{dgrtwo/geom_flat_violin.R} } \usage{ -geom_flat_violin(mapping = NULL, data = NULL, stat = "ydensity", - position = "dodge", trim = TRUE, scale = "area", - show.legend = NA, inherit.aes = TRUE, ...) +geom_flat_violin( + mapping = NULL, + data = NULL, + stat = "ydensity", + position = "dodge", + trim = TRUE, + scale = "area", + show.legend = NA, + inherit.aes = TRUE, + ... +) } \arguments{ -\item{mapping}{Set of aesthetic mappings created by \code{\link[=aes]{aes()}} or -\code{\link[=aes_]{aes_()}}. If specified and \code{inherit.aes = TRUE} (the +\item{mapping}{Set of aesthetic mappings created by \code{\link[ggplot2:aes]{aes()}} or +\code{\link[ggplot2:aes_]{aes_()}}. If specified and \code{inherit.aes = TRUE} (the default), it is combined with the default mapping at the top level of the plot. You must supply \code{mapping} if there is no plot mapping.} \item{data}{The data to be displayed in this layer. There are three options: If \code{NULL}, the default, the data is inherited from the plot -data as specified in the call to \code{\link[=ggplot]{ggplot()}}. +data as specified in the call to \code{\link[ggplot2:ggplot]{ggplot()}}. A \code{data.frame}, or other object, will override the plot data. All objects will be fortified to produce a data frame. See -\code{\link[=fortify]{fortify()}} for which variables will be created. +\code{\link[ggplot2:fortify]{fortify()}} for which variables will be created. A \code{function} will be called with a single argument, the plot data. The return value must be a \code{data.frame}, and -will be used as the layer data.} +will be used as the layer data. A \code{function} can be created +from a \code{formula} (e.g. \code{~ head(.x, 10)}).} \item{position}{Position adjustment, either as a string, or the result of a call to a position adjustment function.} \item{trim}{If \code{TRUE} (default), trim the tails of the violins to the range of the data. If \code{FALSE}, don't trim the tails.} \item{show.legend}{logical. Should this layer be included in the legends? \code{NA}, the default, includes if any aesthetics are mapped. \code{FALSE} never includes, and \code{TRUE} always includes. It can also be a named logical vector to finely select the aesthetics to display.} \item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics, rather than combining with them. This is most useful for helper functions that define both data and aesthetics and shouldn't inherit behaviour from -the default plot specification, e.g. \code{\link[=borders]{borders()}}.} +the default plot specification, e.g. \code{\link[ggplot2:borders]{borders()}}.} -\item{...}{Other arguments passed on to \code{\link[=layer]{layer()}}. These are +\item{...}{Other arguments passed on to \code{\link[ggplot2:layer]{layer()}}. These are often aesthetics, used to set an aesthetic to a fixed value, like \code{colour = "red"} or \code{size = 3}. 
They may also be parameters to the paired geom/stat.} \item{geom, stat}{Use to override the default connection between \code{geom_violin} and \code{stat_ydensity}.} } \description{ Violin plots are a compact display of continuous distributions but are usually mirrored to mimic boxplots. The "flat" version removes that mirroring and makes the violin plots less...suggestive. } \examples{ \dontrun{ ggplot(diamonds, aes(cut, carat)) + geom_flat_violin() + coord_flip() } } \author{ \href{https://github.com/dgrtwo}{David Robinson} } \keyword{datasets}
diff --git a/man/global_query.Rd b/man/global_query.Rd index 4ccaea3..02565ee 100644 --- a/man/global_query.Rd +++ b/man/global_query.Rd @@ -1,28 +1,28 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/global.R \name{global_query} \alias{global_query} \title{Global SQL queries for analytics-store.eqiad.wmnet} \usage{ global_query(query, project_type = "all") } \arguments{ \item{query}{the SQL query you want to run} \item{project_type}{what class of wiki (e.g. "wikisource", "wiktionary") you want to run against. Set to "all" by default.} } \description{ \code{global_query} is a simple wrapper around the MySQL queries that allows a useR to send a query to all production dbs on analytics-store.eqiad.wmnet, joining the results from each query into a single object. } \seealso{ \code{\link[=mysql_read]{mysql_read()}} for querying an individual db, \code{\link[=from_mediawiki]{from_mediawiki()}} for converting MediaWiki timestamps into \code{POSIXlt} timestamps, or \code{\link[=query_hive]{query_hive()}} for accessing the Hive datastore } \author{ -Oliver Keyes +Os Keyes }
diff --git a/man/interleaved.Rd b/man/interleaved.Rd deleted file mode 100644 index 3082830..0000000 --- a/man/interleaved.Rd +++ /dev/null @@ -1,131 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RcppExports.R, R/interleaved.R -\docType{data} -\name{interleaved_bootstraps} -\alias{interleaved_bootstraps} -\alias{interleaved_confint} -\alias{interleaved_preference} -\alias{interleaved} -\alias{interleaved_data} -\alias{interleaved_data_a} -\alias{interleaved_data_b} -\title{Interleaved search results} -\format{\code{interleaved_data*} are \code{data.frame}-s of generated search sessions with -the following columns: -\describe{ -\item{session_id}{10-character alphanumeric ID; for grouping events} -\item{timestamp}{when the event occurred; uses \link[base:DateTimeClasses]{POSIXct} format} -\item{event}{"serp" or "click"} -\item{position}{position ("ranking") of the clicked search result} -\item{ranking_function}{"A" or "B"} -} -Users in \code{interleaved_data} have no preference, users in -\code{interleaved_data_a} have preference for ranking function "A", and users -in \code{interleaved_data_b} have preference for ranking function "B".} -\usage{ -interleaved_bootstraps(sessions, clicks, bootstraps = 1000L) - -interleaved_confint(sessions, clicks, bootstraps = 1000L, - confidence = 0.95) - -interleaved_preference(sessions, clicks) - -interleaved_data - -interleaved_data_a - -interleaved_data_b -} -\arguments{ -\item{sessions}{vector of session IDs used to group \code{positions} and -\code{ranking_functions}} - -\item{clicks}{vector that shows which ranking function the -clicked search result came from ("A" or "B")} - -\item{bootstraps}{number of times to sample unique sessions -(with replacement); 1000 by default} - -\item{confidence}{level; 0.95 by default} -} -\description{ -Tools for analysis of experiments that
use interleaved search -results wherein users receive results from multiple sets of retrieval -functions. -\itemize{ -\item \code{interleaved_data} is a fake dataset used for testing and examples; -refer to \strong{Format} section below -\item \code{interleaved_data_a} is a fake dataset used for testing and examples; -"A" is preferred over "B" -\item \code{interleaved_data_b} is a fake dataset used for testing and examples; -"B" is preferred over "A" -\item \code{interleaved_preference} returns a test statistic summarizing the -interleaving experiment; a positive value indicates that A is better -than B, a negative value indicates that B is better than A -\item \code{interleaved_bootstraps} returns a bootstrapped sample of preference -statistics computed by resampling sessions with replacements -\item \code{interleaved_confint} returns a \code{list} with elements "point.est", -"lower", and "upper" (uses \code{interleaved_bootstraps} internally) -\item \code{interleaved_sample_size} estimates the sample size required to detect -a particular effect size with a specified power and significance level -} -} -\examples{ -data("interleaved_data", package = "wmf") -x <- interleaved_data[interleaved_data$event == "click", ] -x <- x[order(x$session_id, x$timestamp), ] -data("interleaved_data_a", package = "wmf") -y <- interleaved_data_a[interleaved_data_a$event == "click", ] -y <- y[order(y$session_id, y$timestamp), ] -data("interleaved_data_b", package = "wmf") -z <- interleaved_data_b[interleaved_data_b$event == "click", ] -z <- z[order(z$session_id, z$timestamp), ] - -# Bootstrapped preference statistics: - -## Data without a clear preference: -b <- interleaved_bootstraps(x$session_id, x$ranking_function) -hist(b) - -## Data where A is preferred over B: -b <- interleaved_bootstraps(y$session_id, y$ranking_function) -hist(b) - -## Data where B is preferred over A: -b <- interleaved_bootstraps(z$session_id, z$ranking_function) -hist(b) - -# Preference statistic confidence intervals: - -## Data without a clear preference: -interleaved_confint(x$session_id, x$ranking_function) - -## Data where A is preferred over B: -interleaved_confint(y$session_id, y$ranking_function) - -## Data where B is preferred over A: -interleaved_confint(z$session_id, z$ranking_function) - -# Preference statistic calculation: - -## Data without a clear preference: -interleaved_preference(x$session_id, x$ranking_function) - -## Data where A is preferred over B: -interleaved_preference(y$session_id, y$ranking_function) - -## Data where B is preferred over A: -interleaved_preference(z$session_id, z$ranking_function) -} -\references{ -\itemize{ -\item Chapelle, O., Joachims, T., Radlinski, F., & Yue, Y. (2012). Large-scale -validation and analysis of interleaved search evaluation. -\emph{ACM Transactions on Information Systems}, \strong{30}(1), 1-41. -\href{https://doi.org/10.1145/2094072.2094078}{doi:10.1145/2094072.2094078} -\item Radlinski, F. and Craswell, N. (2013). \href{https://www.microsoft.com/en-us/research/publication/optimized-interleaving-for-online-retrieval-evaluation/}{Optimized interleaving for online retrieval evaluation}. -\emph{ACM International Conference on Web Search and Data Mining (WSDM)}. 
-\href{https://doi.org/10.1145/2433396.2433429}{doi:10.1145/2433396.2433429} -} -} -\keyword{datasets} diff --git a/man/mysql.Rd b/man/mysql.Rd index c9ba3dd..7b18a23 100644 --- a/man/mysql.Rd +++ b/man/mysql.Rd @@ -1,87 +1,92 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/mysql.R \name{mysql} \alias{mysql} \alias{mysql_connect} \alias{mysql_read} \alias{mysql_exists} \alias{mysql_write} \alias{mysql_close} \alias{mysql_disconnect} \title{Work with MySQL databases} \usage{ -mysql_connect(database, use_x1 = FALSE, default_file = NULL, - hostname = NULL, port = NULL) +mysql_connect( + database, + use_x1 = FALSE, + default_file = NULL, + hostname = NULL, + port = NULL +) mysql_read(query, database = NULL, use_x1 = NULL, con = NULL) mysql_exists(database, table_name, use_x1 = NULL, con = NULL) mysql_write(x, database, table_name, use_x1 = NULL, con = NULL, ...) mysql_close(con) mysql_disconnect(con) } \arguments{ \item{database}{name of the database to query; \emph{optional} if passing a \code{con}} \item{use_x1}{logical flag; use if querying an extension-related table that -is hosted on x1 (e.g. \code{echo_*} tables); default \code{FALSE}} +is hosted on x1 (e.g. \verb{echo_*} tables); default \code{FALSE}} \item{default_file}{name of a config file containing username and password to use when connecting} \item{hostname}{name of the machine to connect to, which depends on whether \code{query} is used to fetch from the \strong{log} \code{database} (in which case connect to "db1108.eqiad.wmnet") or a MediaWiki ("content") DB, in which case \code{\link[=connection_details]{connection_details()}} is used to return the appropriate shard host name and port based on the stored mapping (use \code{\link[=update_shardmap]{update_shardmap()}} prior to make sure the latest mapping is used)} \item{query}{SQL query} \item{con}{MySQL connection returned by \code{\link[=mysql_connect]{mysql_connect()}}; \emph{optional} -- if not provided, a temporary connection will be opened up} \item{table_name}{name of a table to check for the existence of or create, depending on the function} \item{x}{a \code{data.frame} to write} \item{...}{additional arguments to pass to \code{dbWriteTable}} } \description{ Read from, write to, and check data from the MySQL databases and tables in the Wikimedia cluster. Assumes the presence of a validly formatted configuration file. 
} \examples{ \dontrun{ # Connection details (which shard to connect to) are fetched automatically: mysql_read("SELECT * FROM image LIMIT 100", "commonswiki") mysql_read("SELECT * FROM wbc_entity_usage LIMIT 100", "wikidatawiki") # Echo extension tables are on the x1 host: mysql_read("SELECT * FROM echo_event LEFT JOIN echo_notification ON echo_event.event_id = echo_notification.notification_event LIMIT 10;", "enwiki", use_x1 = TRUE) # If querying multiple databases in the same shard # a shared connection may be used: con <- mysql_connect("frwiki") results <- purrr::map( c("frwiki", "jawiki"), mysql_read, query = "SELECT...", con = con ) mysql_disconnect(con) } } \seealso{ \code{\link[=query_hive]{query_hive()}} or \code{\link[=global_query]{global_query()}} } diff --git a/man/query_hive.Rd b/man/query_hive.Rd index 076bc05..a602ae8 100644 --- a/man/query_hive.Rd +++ b/man/query_hive.Rd @@ -1,61 +1,67 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/hive.R \name{query_hive} \alias{query_hive} \title{Query Hadoop cluster with Hive} \usage{ -query_hive(query, override_jars = FALSE, heap_size = 1024, - use_nice = TRUE, use_ionice = TRUE, use_beeline = FALSE) +query_hive( + query, + override_jars = FALSE, + heap_size = 1024, + use_nice = TRUE, + use_ionice = TRUE, + use_beeline = FALSE +) } \arguments{ \item{query}{a Hive query} \item{override_jars}{A logical flag indicating whether to override the path. Hive on WMF's analytics machine(s) loads some JARs by default, so if your query uses an updated version of an existing UDF and you want to load the JAR that you built yourself, set this to \code{TRUE}. See -\href{https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive/QueryUsingUDF#Testing_changes_to_existing_udf}{Testing changes to existing UDF} +\href{https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive/QueryUsingUDF#Testing_changes_to_existing_udf}{Testing changes to existing UDF} # nolint for more details.} \item{heap_size}{\code{HADOOP_HEAPSIZE}; default is 1024 (alt: 2048 or 4096)} \item{use_nice}{Whether to use \code{nice} for less greedy CPU usage in a multi-user environment. The default is \code{TRUE}.} \item{use_ionice}{Whether to use \code{ionice} for less greedy I/O in a multi-user environment. The default is \code{TRUE}.} \item{use_beeline}{Whether to use \code{beeline} to connect with Hive instead of \code{hive}. The default is \code{FALSE}.} } \value{ A \code{data.frame} containing the results of the query, or a \code{TRUE} if the user has chosen to write straight to file. } \description{ Queries Hive } \section{Escaping}{ \code{hive_query} works by running the query you provide through the CLI via a \code{\link[=system]{system()}} call. As a result, single escapes for meaningful characters (such as quotes) within the query will not work: R will interpret them -only as escaping that character /within R/. Double escaping (\) is thus +only as escaping that character /within R/. Double escaping (\\\) is thus necessary, in the same way that it is for regular expressions. } \section{Handling our hadoop/hive setup}{ The \code{webrequests} table is documented \href{https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive}{on Wikitech}, which also provides \href{https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive/Queries}{a set of example queries}. 
When it comes to manipulating the rows with Java before they get to you, Nuria has written a \href{https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive/QueryUsingUDF}{brief tutorial on loading UDFs} which should help if you want to engage in that. } \examples{ \dontrun{ query_hive("USE wmf; DESCRIBE webrequest;") } } \seealso{ \code{\link[lubridate:ymd_hms]{lubridate::ymd_hms()}} for converting the "dt" column in the webrequests table to proper datetime, and \code{\link[=mysql_read]{mysql_read()}} and \code{\link[=global_query]{global_query()}} for querying our MySQL databases }
diff --git a/man/read_sampled_log.Rd b/man/read_sampled_log.Rd index 9c3f65b..224b3a8 100644 --- a/man/read_sampled_log.Rd +++ b/man/read_sampled_log.Rd @@ -1,45 +1,45 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/logs.R \name{read_sampled_log} \alias{read_sampled_log} \title{Read a sampled log file} \usage{ read_sampled_log(file, transparent = FALSE, nrows = NULL) } \arguments{ \item{file}{a filename, retrieved with \code{\link[=get_logfile]{get_logfile()}}} \item{transparent}{a logical flag indicating whether to gunzip the log file explicitly first (default) or read it in directly.} \item{nrows}{number of rows to read in; \emph{optional}} } \value{ a \code{data.frame} containing 16 columns: \itemize{ \item squid \item sequence_no \item timestamp \item servicetime \item ip_address \item status_code \item reply_size \item request_method \item url \item squid_status \item mime_type \item referer \item x_forwarded \item user_agent \item lang \item x_analytics } } \description{ Reads a sampled log file identified with \code{\link[=get_logfile]{get_logfile()}}. The sampled logs are returned as a data.frame with 16 columns - see the \strong{Value} documentation. } \author{ -Oliver Keyes +Os Keyes }
diff --git a/man/refine_eventlogs.Rd b/man/refine_eventlogs.Rd index 6414d0b..2956a02 100644 --- a/man/refine_eventlogs.Rd +++ b/man/refine_eventlogs.Rd @@ -1,29 +1,29 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/logs.R \name{refine_eventlogs} \alias{refine_eventlogs} \title{Refine EventLogging data} \usage{ refine_eventlogs(el_data, dt_cols = NULL, json_cols = NULL) } \arguments{ \item{el_data}{EventLogging data} \item{dt_cols}{character vector of timestamp and date-time column names to parse; can also be a named list of parsing functions to apply on a per-column basis, as \code{\link[lubridate:ymd_hms]{lubridate::ymd_hms()}} is used by default} \item{json_cols}{character vector of JSON-containing column names that need to be parsed; can also be a named list of parsing functions to apply on a per-column basis, as \code{\link[=parse_json]{parse_json()}} is used by default} } \value{ -A \code{tibble} (see \link[tibble:`tibble-package`]{tibble::tibble-package} for more info) +A \code{tibble} (see \link[tibble:`tibble-package`]{tibble::\code{tibble-package}} for more info) } \description{ Converts date-time and JSON columns, removes "event_" prefix from column names, and returns \code{tibble}s. } \author{ Mikhail Popov }
diff --git a/man/set_proxies.Rd b/man/set_proxies.Rd index 7ecaab6..f195a0b 100644 --- a/man/set_proxies.Rd +++ b/man/set_proxies.Rd @@ -1,22 +1,12 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/proxies.R \name{set_proxies} \alias{set_proxies} \title{Set HTTP and HTTPS proxies} \usage{ set_proxies() } \description{ Sets the HTTP and HTTPS proxies when running R on Wikimedia machines.
} -\examples{ -\dontrun{ -# This will fail in the cluster -devtools::install_github("ironholds/urltools") - -# This will work -set_proxies() -devtools::install_github("ironholds/urltools") -} -} diff --git a/man/theme_facet.Rd b/man/theme_facet.Rd index ac0db8d..4aa03fd 100644 --- a/man/theme_facet.Rd +++ b/man/theme_facet.Rd @@ -1,27 +1,32 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/dataviz.R \name{theme_facet} \alias{theme_facet} \title{Simple theme for facet-ed ggplots} \usage{ -theme_facet(base_size = 12, base_family = "", border = TRUE, - clean_xaxis = FALSE, ...) +theme_facet( + base_size = 12, + base_family = "", + border = TRUE, + clean_xaxis = FALSE, + ... +) } \arguments{ \item{base_size}{font size} \item{base_family}{font family} \item{border}{whether to add a border around facets} \item{clean_xaxis}{whether to remove ticks & labels from x-axis} \item{...}{additional parameters to pass to \code{theme()}} } \description{ A minimal theme that puts the legend at the bottom and puts the facet labels into gray boxes. The border around those can be disabled. } \author{ Mikhail Popov & Chelsy Xie } diff --git a/man/timeconverters.Rd b/man/timeconverters.Rd index cb1ac20..43557c0 100644 --- a/man/timeconverters.Rd +++ b/man/timeconverters.Rd @@ -1,33 +1,33 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/time.R \name{timeconverters} \alias{timeconverters} \alias{from_mediawiki} \alias{from_log} \alias{to_mediawiki} \alias{to_log} \title{Convert to and from common timestamp formats} \usage{ from_mediawiki(x) from_log(x) to_mediawiki(x) to_log(x) } \arguments{ \item{x}{a vector of timestamps} } \description{ Convert to and from MediaWiki and request log timestamp formats } \examples{ from_mediawiki("20150101010301") } \seealso{ \code{\link[lubridate:ymd_hms]{lubridate::ymd_hms()}} } \author{ -Oliver Keyes +Os Keyes } diff --git a/man/wmf.Rd b/man/wmf.Rd index a4e85c0..1778caf 100644 --- a/man/wmf.Rd +++ b/man/wmf.Rd @@ -1,11 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/wmf.R \docType{package} \name{wmf} \alias{wmf} -\alias{wmf-package} \title{wmf: R Code for Wikimedia Foundation Internal Usage} \description{ This package contains functions made for Analysts at Wikimedia Foundation, but can be used by people outside of the Foundation. 
} diff --git a/src/.gitignore b/src/.gitignore deleted file mode 100644 index 22034c4..0000000 --- a/src/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -*.o -*.so -*.dll
diff --git a/src/Makevars b/src/Makevars deleted file mode 100644 index e66fbbd..0000000 --- a/src/Makevars +++ /dev/null @@ -1 +0,0 @@ -CXX_STD=CXX11
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp deleted file mode 100644 index cccf156..0000000 --- a/src/RcppExports.cpp +++ /dev/null @@ -1,75 +0,0 @@ -// Generated by using Rcpp::compileAttributes() -> do not edit by hand -// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 - -#include <RcppArmadillo.h> -#include <Rcpp.h> - -using namespace Rcpp; - -// exact_binom -unsigned int exact_binom(double constant_prop, double effect_size, double alpha, double power, bool two_tail); -RcppExport SEXP _wmf_exact_binom(SEXP constant_propSEXP, SEXP effect_sizeSEXP, SEXP alphaSEXP, SEXP powerSEXP, SEXP two_tailSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< double >::type constant_prop(constant_propSEXP); - Rcpp::traits::input_parameter< double >::type effect_size(effect_sizeSEXP); - Rcpp::traits::input_parameter< double >::type alpha(alphaSEXP); - Rcpp::traits::input_parameter< double >::type power(powerSEXP); - Rcpp::traits::input_parameter< bool >::type two_tail(two_tailSEXP); - rcpp_result_gen = Rcpp::wrap(exact_binom(constant_prop, effect_size, alpha, power, two_tail)); - return rcpp_result_gen; -END_RCPP -} -// interleaved_bootstraps -std::vector<double> interleaved_bootstraps(std::vector<std::string> sessions, std::vector<std::string> clicks, int bootstraps); -RcppExport SEXP _wmf_interleaved_bootstraps(SEXP sessionsSEXP, SEXP clicksSEXP, SEXP bootstrapsSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< std::vector<std::string> >::type sessions(sessionsSEXP); - Rcpp::traits::input_parameter< std::vector<std::string> >::type clicks(clicksSEXP); - Rcpp::traits::input_parameter< int >::type bootstraps(bootstrapsSEXP); - rcpp_result_gen = Rcpp::wrap(interleaved_bootstraps(sessions, clicks, bootstraps)); - return rcpp_result_gen; -END_RCPP -} -// interleaved_confint -List interleaved_confint(std::vector<std::string> sessions, std::vector<std::string> clicks, int bootstraps, double confidence); -RcppExport SEXP _wmf_interleaved_confint(SEXP sessionsSEXP, SEXP clicksSEXP, SEXP bootstrapsSEXP, SEXP confidenceSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< std::vector<std::string> >::type sessions(sessionsSEXP); - Rcpp::traits::input_parameter< std::vector<std::string> >::type clicks(clicksSEXP); - Rcpp::traits::input_parameter< int >::type bootstraps(bootstrapsSEXP); - Rcpp::traits::input_parameter< double >::type confidence(confidenceSEXP); - rcpp_result_gen = Rcpp::wrap(interleaved_confint(sessions, clicks, bootstraps, confidence)); - return rcpp_result_gen; -END_RCPP -} -// interleaved_preference -double interleaved_preference(std::vector<std::string> sessions, std::vector<std::string> clicks); -RcppExport SEXP _wmf_interleaved_preference(SEXP sessionsSEXP, SEXP clicksSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< std::vector<std::string> >::type sessions(sessionsSEXP); - Rcpp::traits::input_parameter< std::vector<std::string> >::type clicks(clicksSEXP); - rcpp_result_gen = Rcpp::wrap(interleaved_preference(sessions, clicks)); - return rcpp_result_gen; -END_RCPP -} - -static const R_CallMethodDef CallEntries[] = { - {"_wmf_exact_binom", (DL_FUNC) &_wmf_exact_binom, 5}, - {"_wmf_interleaved_bootstraps", (DL_FUNC) &_wmf_interleaved_bootstraps, 3}, - {"_wmf_interleaved_confint", (DL_FUNC) &_wmf_interleaved_confint, 4}, - {"_wmf_interleaved_preference", (DL_FUNC) &_wmf_interleaved_preference, 2}, - {NULL, NULL, 0} -}; - -RcppExport void R_init_wmf(DllInfo *dll) { - R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); - R_useDynamicSymbols(dll, FALSE); -}
diff --git a/src/exact_binom.cpp b/src/exact_binom.cpp deleted file mode 100644 index 7a4fc0d..0000000 --- a/src/exact_binom.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#include <Rcpp.h> -using namespace Rcpp; -#include <climits> // UINT_MAX -#include <cmath> // std::abs -#include <boost/math/distributions/binomial.hpp> -using boost::math::binomial_distribution; - -//' @title Sample size for exact, one sample binomial test -//' @description Estimates sample size required to detect difference from a -//' constant proportion. -//' @param constant_prop The proportion under the null hypothesis. -//' @param effect_size Positive size of the difference between your null -//' hypothesis and the alternative hypothesis that you hope to detect. -//' **Heads-up** that values less than 1\% might take a while to calculate. -//' @param alpha Probability of rejecting the null hypothesis even though it is -//' true. -//' @param power Probability of rejecting the null hypothesis (getting a -//' significant result) when the real difference is equal to the minimum -//' effect size. -//' @param two_tail Whether to perform two-tail or one-tail power analysis. -//' `TRUE` (default) tests in both directions of difference. -//' @examples -//' exact_binom(0.75, 0.03) -//' @references [Power analysis](http://www.biostathandbook.com/power.html) and -//' [Exact test of goodness-of-fit](http://www.biostathandbook.com/exactgof.html) from -//' John H. McDonald's [_Handbook of Biological Statistics_](http://www.biostathandbook.com/) -//' @export -// [[Rcpp::export]] -unsigned int exact_binom(double constant_prop, double effect_size, double alpha = 0.05, double power = 0.8, bool two_tail = true) { - if (two_tail) { - alpha = alpha / 2; - } - unsigned int i = 10; - double beta = 1 - power; - bool end_condition = true; - do { - i += 1; - if (i == INT_MAX) { - break; - } - end_condition = (std::abs(cdf(binomial_distribution<>(i, constant_prop + effect_size), quantile(binomial_distribution<>(i, constant_prop), 1 - alpha)) - beta) / beta >= 0.01); - } while (end_condition); - return i; -} - -/*** R -exact_binom(0.75, 0.01, power = 0.9) -exact_binom(0.75, 0.03, power = 0.9) -exact_binom(0.75, 0.03, power = 0.9, two_tail = FALSE) -*/
diff --git a/src/interleaved_confint.cpp b/src/interleaved_confint.cpp deleted file mode 100644 index f38763e..0000000 --- a/src/interleaved_confint.cpp +++ /dev/null @@ -1,94 +0,0 @@ -#include <RcppArmadillo.h> -#include "interleaved_map.h" -#include <algorithm> // std::unique_copy, std::sort -#include <vector> // std::vector -#include <iterator> // std::back_insert_iterator -// [[Rcpp::depends(RcppArmadillo)]] -using namespace Rcpp; - -//' @param bootstraps number of times to sample unique sessions -//' (with replacement); 1000 by default -//' @examples -//' data("interleaved_data", package = "wmf") -//' x <- interleaved_data[interleaved_data$event == "click", ] -//' x <- x[order(x$session_id, x$timestamp), ] -//' data("interleaved_data_a", package = "wmf") -//' y <- interleaved_data_a[interleaved_data_a$event == "click", ] -//' y <- y[order(y$session_id, y$timestamp), ] -//' data("interleaved_data_b", package = "wmf") -//' z <- interleaved_data_b[interleaved_data_b$event == "click", ] -//' z <- z[order(z$session_id, z$timestamp), ] -//' -//' # Bootstrapped preference statistics: -//' -//' ## Data without a clear preference: -//' b <- interleaved_bootstraps(x$session_id, x$ranking_function) -//' hist(b) -//' -//' ## Data where A is preferred over B: -//' b <- interleaved_bootstraps(y$session_id, y$ranking_function) -//' hist(b) -//' -//' ## Data where B is preferred over A: -//' b <- interleaved_bootstraps(z$session_id, z$ranking_function) -//' hist(b) -//' @rdname interleaved -//' @export -// [[Rcpp::export]] -std::vector<double> interleaved_bootstraps(std::vector<std::string> sessions, std::vector<std::string> clicks, int bootstraps = 1000) { - std::map<std::string, int> wins = interleaved_map(sessions, clicks); - std::vector<double> preferences(bootstraps); - // Get a vector of unique session IDs: - std::sort(sessions.begin(), sessions.end()); - std::vector<std::string> uniques; - std::unique_copy(sessions.begin(), sessions.end(), std::back_inserter(uniques)); - // Bootstrap preferences: - int winsA, winsB, ties; - std::vector<std::string> resampled; - for (int i = 0; i < bootstraps; i++) { - // Sample sessions with replacement: - resampled = RcppArmadillo::sample(uniques, uniques.size(), true); - // Compute preference: - winsA = 0; winsB = 0; ties = 0; // reset tallies - for (std::string &session : resampled) - { - switch(wins[session]) { - case 0 : ties++; - break; - case 1 : winsA++; - break; - case -1: winsB++; - break; - } - } - preferences[i] = (((winsA + (ties / 2.0)) / (winsA + winsB + ties)) - 0.5); - } - return preferences; -} - -//' @param confidence level; 0.95 by default -//' @examples -//' -//' # Preference statistic confidence intervals: -//' -//' ## Data without a clear preference: -//' interleaved_confint(x$session_id, x$ranking_function) -//' -//' ## Data where A is preferred over B: -//' interleaved_confint(y$session_id, y$ranking_function) -//' -//' ## Data where B is preferred over A: -//' interleaved_confint(z$session_id, z$ranking_function) -//' @rdname interleaved -//' @export -// [[Rcpp::export]] -List interleaved_confint(std::vector<std::string> sessions, std::vector<std::string> clicks, int bootstraps = 1000, double confidence = 0.95) { - std::vector<double> preferences = interleaved_bootstraps(sessions, clicks, bootstraps); - Environment stats("package:stats"); - Function quantile = stats["quantile"]; - double alpha = 1 - confidence; - double median = as<double>(quantile(preferences, 0.5)); - double lower = as<double>(quantile(preferences, alpha / 2)); - double upper = as<double>(quantile(preferences, (1 - (alpha / 2)))); - return List::create(_["point.est"] = median, _["lower"] = lower, _["upper"] = upper); -}
diff --git a/src/interleaved_map.cpp b/src/interleaved_map.cpp deleted file mode 100644 index 1c67e58..0000000 --- a/src/interleaved_map.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include <Rcpp.h> -using namespace Rcpp; - -std::map<std::string, int> interleaved_map(std::vector<std::string> sessions, std::vector<std::string> clicks) { - std::map<std::string, int> wins; // 0 if tie, -1 if B was preferred, 1 if A was preferred - int perSessionWinsA = 0; - int perSessionWinsB = 0; - if (clicks[0] == "A") { - perSessionWinsA++; - } else { - perSessionWinsB++; - } - for (int i = 1; i < sessions.size(); i++) { - if (sessions[i] != sessions[i - 1]) { - // We're now looking at a new session, so let's process - // the previous session's tally of wins: - wins.insert(std::make_pair(sessions[i - 1], (perSessionWinsA == perSessionWinsB) ? 0 : ((perSessionWinsA > perSessionWinsB) ? 1 : -1))); - // Reset tallies: - perSessionWinsA = 0; - perSessionWinsB = 0; - } - if (clicks[i] == "A") { - perSessionWinsA++; - } else { - perSessionWinsB++; - } - } - return wins; -}
diff --git a/src/interleaved_map.h b/src/interleaved_map.h deleted file mode 100644 index 5689edc..0000000 --- a/src/interleaved_map.h +++ /dev/null @@ -1 +0,0 @@ -std::map<std::string, int> interleaved_map(std::vector<std::string> sessions, std::vector<std::string> clicks);
diff --git a/src/interleaved_preference.cpp b/src/interleaved_preference.cpp deleted file mode 100644 index e20c788..0000000 --- a/src/interleaved_preference.cpp +++ /dev/null @@ -1,43 +0,0 @@ -#include <Rcpp.h> -#include "interleaved_map.h" -// [[Rcpp::plugins(cpp11)]] -using namespace Rcpp; - -//' @param sessions vector of session IDs used to group `positions` and -//' `ranking_functions` -//' @param clicks vector that shows which ranking function the -//' clicked search result came from ("A" or "B") -//' @examples -//' -//' # Preference statistic calculation: -//' -//' ## Data without a clear preference: -//' interleaved_preference(x$session_id, x$ranking_function) -//' -//' ## Data where A is preferred over B: -//' interleaved_preference(y$session_id, y$ranking_function) -//' -//' ## Data where B is preferred over A: -//' interleaved_preference(z$session_id, z$ranking_function) -//' @rdname interleaved -//' @export -// [[Rcpp::export]] -double interleaved_preference(std::vector<std::string> sessions, std::vector<std::string> clicks) { - std::map<std::string, int> wins = interleaved_map(sessions, clicks); - int winsA = 0; - int winsB = 0; - int ties = 0; - for (auto const& session : wins) - { - switch(session.second) { - case 0 : ties++; - break; - case 1 : winsA++; - break; - case -1: winsB++; - break; - } - } - double preference = (((winsA + (ties / 2.0)) / (winsA + winsB + ties)) - 0.5); - return preference; -}
diff --git a/tests/testthat/test-interleaved.R b/tests/testthat/test-interleaved.R deleted file mode 100644 index b42757e..0000000 --- a/tests/testthat/test-interleaved.R +++ /dev/null @@ -1,46 +0,0 @@ -context("Interleaved search results") - -test_data <- suppressMessages(lapply( - wmf:::fake_interleaved_data(n_sessions = 10, seed = 0), - function(dataset) { - return(dataset[dataset$event == "click", ]) - } -)) - -test_that("preference statistic", { - expect_equal( - interleaved_preference( - test_data$no_preference$session_id, - test_data$no_preference$ranking_function - ), - -0.0555, - tolerance = 0.001 - ) - expect_equal( - interleaved_preference( - test_data$a_preferred$session_id, - test_data$a_preferred$ranking_function - ), - 0.3125, - tolerance = 0.001 - ) - expect_equal( - interleaved_preference( - test_data$b_preferred$session_id, - test_data$b_preferred$ranking_function - ), - -0.3889, - tolerance = 0.001 - ) -}) - -set.seed(42) -bootstrapped_preferences <- interleaved_bootstraps( - test_data$no_preference$session_id, - test_data$no_preference$ranking_function, - bootstraps = 10L -) - -test_that("preference confidence intervals", { - expect_equal(length(bootstrapped_preferences), 10) -})
diff --git a/tests/testthat/test-power.R b/tests/testthat/test-power.R index b70b1d3..ce9053c 100644 --- a/tests/testthat/test-power.R +++ b/tests/testthat/test-power.R @@ -1,44 +1,38 @@ context("Sample size calculations") test_that("chisq_test_odds returns the appropriate estimates", { expect_equal( chisq_test_odds(odds_ratio = 2, p_treatment = 0.4, p_control = 0.25, power = 0.8, conf_level = 0.95), 311 ) expect_equal(chisq_test_odds(p_treatment = 0.4, p_control = 0.25, power = 0.8), 311) expect_equal(
unname(chisq_test_odds(p_treatment = 0.4, p_control = 0.25, power = c(0.8, 0.9, 0.95))), c(311, 416, 514) ) }) test_that("chisq_test_odds returns errors when it should", { expect_error(chisq_test_odds()) expect_error(chisq_test_odds(2)) expect_error(chisq_test_odds(odds_ratio = 2, power = 0.8)) }) test_that("chisq_test_odds returns warnings when it should", { expect_warning( chisq_test_odds(p_treatment = 0.4, p_control = 0.25, power = 0.8, visualize = TRUE), "All parameters known. Nothing to visualize." ) }) test_that("chisq_test returns the appropriate estimates", { expect_equal(chisq_test_effect(0.3), 88) expect_equal(chisq_test_effect(0.1, groups = 3, power = 0.95), 1545) expect_equal(chisq_test_effect(), c("tiny" = 3140, "small" = 785, "medium" = 88, "large" = 32)) }) test_that("chisq_test returns errors when it should", { expect_error(chisq_test_effect(w = 0.01)) expect_error(chisq_test_effect(w = 0.1, power = 0.001)) expect_error(chisq_test_effect(w = 0.1, sig_level = 2)) }) - -test_that("exact_binom calculates appropriate sample sizes", { - expect_equal(exact_binom(0.75, 0.03, alpha = 0.05, power = 0.9, two_tail = TRUE), 2105) - expect_equal(exact_binom(0.75, 0.03, alpha = 0.05, power = 0.9, two_tail = FALSE), 1716) - expect_equal(exact_binom(0.75, 0.01, alpha = 0.05, power = 0.9, two_tail = TRUE), 19394) -})
diff --git a/vignettes/interleaved.Rmd b/vignettes/interleaved.Rmd deleted file mode 100644 index 2b8fd73..0000000 --- a/vignettes/interleaved.Rmd +++ /dev/null @@ -1,109 +0,0 @@ ---- -title: "Estimating Preference For Ranking Functions With Clicks On Interleaved Search Results" -author: "Mikhail Popov" -date: "`r Sys.Date()`" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Estimating Preference For Ranking Functions With Clicks On Interleaved Search Results} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- -```{r, echo=FALSE} -set.seed(0) -``` -## Introduction - -The way Search and Discovery's Analysts assess changes to search has traditionally relied on A/B testing, wherein the control group receives results using the latest configuration and the test group (or groups) receives results using the experimental configuration. Another way to evaluate the user-perceived relevance of search results from the experimental configuration relies on a technique called *interleaving*. In it, each user is their own baseline -- we perform two searches behind the scenes and then interleave them together into a single set of results using the team draft algorithm described by Chapelle et al. (2012): - -1. **Input**: result sets $A$ and $B$. -2. **Initialize**: an empty interleaved result set $I$ and drafts $T_A, T_B$ for keeping track of which results belong to which team. -3. For each round of picking: - a. Randomly decide whether we first pick from $A$ or from $B$. - b. Without loss of generality, if $A$ is randomly chosen to go first, grab top result $a \in A$, append it to $I$ and $T_A$: $I \gets a, T_A \gets a$. - c. Take the top result $b \in B$ such that $b \neq a$ and append it to $I$ after $a$ and to $T_B$: $I \gets b, T_B \gets b$. - d. Update $A = A \setminus \{a, b\}$ and $B = B \setminus \{a, b\}$ so the two results that were just appended to $I$ are not considered again. - e. Stop when $|I| = \text{maximum per page}$, so only the first page contains interleaved results. -4. **Output**: interleaved results $I$ and team drafts $T_A, T_B$. - -By keeping track of which results belong to which ranking function when the user clicks on them, we can estimate a preference for one ranker over the other.
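To make the team-draft rounds above concrete, here is a minimal sketch in plain R. It is illustrative only: `team_draft`, its arguments, and the assumption that result sets are character vectors of result IDs ordered best-first are all inventions of this example, not package code.

```R
team_draft <- function(A, B, max_per_page = 10) {
  I <- character(0); T_A <- character(0); T_B <- character(0)
  while (length(I) < max_per_page && length(setdiff(c(A, B), I)) > 0) {
    # (a) randomly decide which team drafts first this round
    draft_order <- if (runif(1) < 0.5) c("A", "B") else c("B", "A")
    for (team in draft_order) {
      # (b)/(c) take the team's top result not already interleaved
      pool <- setdiff(if (team == "A") A else B, I)
      if (length(pool) == 0 || length(I) >= max_per_page) next
      I <- c(I, pool[1])
      if (team == "A") T_A <- c(T_A, pool[1]) else T_B <- c(T_B, pool[1])
    }
    # (d) is implicit: setdiff() above skips results already in I
  }
  # (4) the interleaved page plus the two team drafts
  list(I = I, T_A = T_A, T_B = T_B)
}

# e.g. team_draft(paste0("r", 1:8), paste0("r", c(2, 1, 9, 5)), max_per_page = 6)
```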
The preference statistic $\Delta_{AB}$ is described by Chapelle et al. as - -$$ -\Delta_{AB} = \frac{\text{wins}_A + \frac{1}{2} \text{ties}}{\text{wins}_A + \text{wins}_B + \text{ties}} - 0.5, -$$ - -where wins are calculated by counting clicks on the results from teams "A" and "B". A positive value of $\Delta_{AB}$ indicates that $A \succ B$, a negative value indicates that $B \succ A$. We performed two types of calculations: per-session and per-search. In **per-session**, "A" has won if there are more clicks on team "A" results than team "B" results and $\text{wins}_A$ is incremented by one for each such session. In **per-search**, "A" has won if there are more clicks on team "A" results in each search, thus any one session can contribute multiple points to the overall $\text{wins}_A$. - -In order to obtain confidence intervals for the preference statistic, we utilize [bootstrapping](https://en.wikipedia.org/wiki/Bootstrapping_(statistics)) with $m$ iterations. - -1. For bootstrap iteration $i = 1, \ldots, m$: - a. Sample unique IDs with replacement. - b. Calculate $\Delta_{AB}^{(i)}$ from new data. -2. The confidence intervals (CIs) are calculated by finding percentiles of the distribution of bootstrapped preferences $\{\Delta_{AB}^{(1)}, \ldots, \Delta_{AB}^{(m)}\}$ -- e.g. the 2.5th and 97.5th percentiles for a 95% CI. - -## Simulated Data - -This package provides simulated search and click data. The three built-in datasets have simulated users that (1) exhibit no preference, (2) exhibit preference for the ranking function "A", and (3) exhibit preference for the ranking function "B". - -```{r} -data(interleaved_data, package = "wmf") # no preference -data(interleaved_data_a, package = "wmf") # preference for A -data(interleaved_data_b, package = "wmf") # preference for B -``` - -Here are the first few rows of the third dataset: - -```{r, results='asis'} -knitr::kable(head(interleaved_data_b)) -``` - -## Estimation - -```{r} -library(wmf) -``` - -To calculate $\Delta_{AB}$ with `interleaved_preference`, we will need to use the clicks. We also use bootstrapping via `interleaved_bootstraps` which resamples sessions (with replacement) to obtain a distribution of the preference statistic $\Delta_{AB}$. After we plot each bootstrapped sample, we mark the 95% confidence interval bounds. **Note** that `interleaved_confint` outputs the `quantile`-based CI and uses the same bootstrap function internally. 
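Before turning to the built-in datasets, here is a plain-R sketch of the per-session statistic and the bootstrap just described. The helper names (`session_winners`, `delta_ab`, `boot_delta`) are invented for illustration; the package computed these in compiled code (factored out into ortiz as of 0.8.0).

```R
session_winners <- function(sessions, clicks) {
  # per-session winner: +1 if "A" got more clicks, -1 if "B", 0 if tied
  tapply(clicks, sessions, function(k) sign(sum(k == "A") - sum(k == "B")))
}

delta_ab <- function(winners) {
  wins_a <- sum(winners == 1)
  wins_b <- sum(winners == -1)
  ties <- sum(winners == 0)
  (wins_a + ties / 2) / (wins_a + wins_b + ties) - 0.5
}

# bootstrap: resample sessions (i.e. their winners) with replacement
boot_delta <- function(winners, m = 1000) {
  replicate(m, delta_ab(sample(winners, length(winners), replace = TRUE)))
}

# 95% CI from the percentiles, as described above:
# quantile(boot_delta(session_winners(x$session_id, x$ranking_function)),
#          c(0.025, 0.975))
```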
- -### No preference - -When users click on the interleaved results *without* a preference, the resulting preference statistic is close to 0 and the confidence interval covers 0: - -```{r no_pref} -x <- interleaved_data[interleaved_data$event == "click", ] -x <- x[order(x$session_id, x$timestamp), ] -boot_x <- interleaved_bootstraps(x$session_id, x$ranking_function) -hist(boot_x, col = "gray70", border = NA, main = "No preference", xlab = "Bootstrapped preferences") -abline(v = quantile(boot_x, c(0.025, 0.975)), lty = "dashed") -abline(v = interleaved_preference(x$session_id, x$ranking_function), lwd = 2) -``` - -### Preference for A - -When users click on the interleaved results *with* a preference for A, the resulting preference statistic is *positive* and the confidence interval does *not* cover 0: - -```{r a_pref} -y <- interleaved_data_a[interleaved_data_a$event == "click", ] -y <- y[order(y$session_id, y$timestamp), ] -boot_y <- interleaved_bootstraps(y$session_id, y$ranking_function) -hist(boot_y, col = "gray70", border = NA, main = "Preference for A", xlab = "Bootstrapped preferences") -abline(v = quantile(boot_y, c(0.025, 0.975)), lty = "dashed") -abline(v = interleaved_preference(y$session_id, y$ranking_function), lwd = 2) -``` - -### Preference for B - -When users click on the interleaved results *with* a preference for B, the resulting preference statistic is *negative* and the confidence interval does *not* cover 0: - -```{r b_pref} -z <- interleaved_data_b[interleaved_data_b$event == "click", ] -z <- z[order(z$session_id, z$timestamp), ] -boot_z <- interleaved_bootstraps(z$session_id, z$ranking_function) -hist(boot_z, col = "gray70", border = NA, main = "Preference for B", xlab = "Bootstrapped preferences") -abline(v = quantile(boot_z, c(0.025, 0.975)), lty = "dashed") -abline(v = interleaved_preference(z$session_id, z$ranking_function), lwd = 2) -``` - -# References - -- Chapelle, O., Joachims, T., Radlinski, F., & Yue, Y. (2012). Large-scale validation and analysis of interleaved search evaluation. *ACM Transactions on Information Systems*, **30**(1), 1-41. [doi:10.1145/2094072.2094078](https://doi.org/10.1145/2094072.2094078) -- Radlinski, F. and Craswell, N. (2013). [Optimized interleaving for online retrieval evaluation](https://www.microsoft.com/en-us/research/publication/optimized-interleaving-for-online-retrieval-evaluation/). *ACM International Conference on Web Search and Data Mining (WSDM)*. [doi:10.1145/2433396.2433429](https://doi.org/10.1145/2433396.2433429) diff --git a/wmf.Rproj b/wmf.Rproj index f0d6187..270314b 100644 --- a/wmf.Rproj +++ b/wmf.Rproj @@ -1,21 +1,21 @@ Version: 1.0 RestoreWorkspace: Default SaveWorkspace: Default AlwaysSaveHistory: Default EnableCodeIndexing: Yes UseSpacesForTab: Yes NumSpacesForTab: 2 Encoding: UTF-8 RnwWeave: Sweave LaTeX: pdfLaTeX AutoAppendNewline: Yes StripTrailingWhitespace: Yes BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source -PackageRoxygenize: rd,collate,namespace,vignette +PackageRoxygenize: rd,collate,namespace