diff --git a/DESCRIPTION b/DESCRIPTION index e19d86e..f587ec0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,43 +1,43 @@ Package: wmfdata Type: Package -Title: R Code for Wikimedia Foundation Internal Usage +Title: R Tools For Wikimedia Foundation's Analysts And Data Scientists Version: 0.9.0 Date: 2020-08-18 Authors@R: c( person("Mikhail", "Popov", email = "mikhail@wikimedia.org", role = c("aut", "cre")), person("Os", "Keyes", role = "aut", comment = "No longer employed at the Foundation"), person("Chelsy", "Xie", role = "ctb", comment = "No longer employed at the Foundation"), person(family = "Wikimedia Foundation", role = "cph") ) -Description: This package contains functions made for Analysts at Wikimedia - Foundation, but can be used by people outside of the Foundation. +Description: This package contains functions made for Analysts and Scientists at + the Wikimedia Foundation. License: MIT + file LICENSE URL: https://gerrit.wikimedia.org/r/plugins/gitiles/wikimedia/discovery/wmf/ BugReports: https://phabricator.wikimedia.org/maniphest/task/create/?projects=Product-Analytics&assigned=mpopov Depends: R (>= 3.1.0) Imports: dplyr, fs, glue, ggthemes (>= 3.4.0), ggplot2, jsonlite, lubridate, progress, purrr, pwr, readr, RMySQL, tibble, urltools, magrittr, zeallot Suggests: knitr, lintr (>= 2.0.0), roxygen2 (>= 7.0.0), testthat LazyData: TRUE Roxygen: list(markdown = TRUE) Encoding: UTF-8 RoxygenNote: 7.1.1 diff --git a/NAMESPACE b/NAMESPACE index b84c003..6b5cd62 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,44 +1,43 @@ # Generated by roxygen2: do not edit by hand export(build_query) export(chisq_test_effect) export(chisq_test_odds) export(colors_accent) export(colors_base) export(colors_discrete) export(colors_utility) export(connection_details) -export(date_clause) export(display_palettes) export(extract_ymd) export(from_log) export(from_mediawiki) export(geom_flat_violin) export(get_logfile) export(global_query) export(invert_list) export(mysql_close) 
export(mysql_connect) export(mysql_disconnect) export(mysql_exists) export(mysql_read) export(mysql_write) export(null2na) export(percent2) export(pretty_num) export(query_hive) export(read_sampled_log) export(rewrite_conditional) export(set_proxies) export(theme_facet) export(theme_fivethirtynine) export(theme_min) export(to_log) export(to_mediawiki) export(update_shardmap) export(write_conditional) import(ggplot2) import(ggthemes) importFrom(pwr,pwr.chisq.test) importFrom(readr,read_tsv) importFrom(urltools,url_decode) diff --git a/NEWS.md b/NEWS.md index eeaa9b8..7d23036 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,101 +1,102 @@ wmfdata 0.9.0 ============= * New name, to be more consistent with the Python sibling [wmfdata](https://github.com/wikimedia/wmfdata-python). * Updated installation instructions. +* Removed obsolete functions for refining legacy EventLogging data. wmf 0.8.1 ========= * Added `debug` mode to `query_hive()` which prints useful information and does not auto-delete the temporary query and results files. wmf 0.8.0 ========= * Factored out interleaving & exact binomial C++ code into the [ortiz](https://gerrit.wikimedia.org/g/wikimedia/discovery/ortiz/) package * Updated DESCRIPTION * Fixed dependencies wmf 0.7.1 ========= * Updated for new sharded MariaDB replicas ([T212386](https://phabricator.wikimedia.org/T212386)) * Updated for x1 replica (cf. [T172410#4965383](https://phabricator.wikimedia.org/T172410#4965383)) wmf 0.6.0 ========= * Added color palettes based on [Wikimedia Design Style Guide](https://design.wikimedia.org/style-guide/) wmf 0.5.2 ========= * Added `use_beeline` argument in the `query_hive` function to use `beeline` instead of Hive CLI. 
wmf 0.5.0 ========= * Added `invert_list()` for inverting keys and values in named lists * Added formatting helpers: `percent2()` and `pretty_num()` * Added [David Robinson's `geom_flat_violin`](https://gist.github.com/dgrtwo/eb7750e74997891d7c20) wmf 0.4.0 ========= * Added `parse_json()` * Added `refine_eventlogs()` wmf 0.3.1 ========= * Switched host name from db1047.eqiad.wmnet to db1108.eqiad.wmnet per [T156844](https://phabricator.wikimedia.org/T156844) wmf 0.3.0 ========= * C++-based `exact_binomial()` to quickly estimate sample size for exact binomial tests * Functions for working with interleaved search results experiments * See `?interleaved` for details * See `vignette("interleaved", package = "wmf")` for an example * Requires a compiler that supports C++11 * ggplot themes `theme_min()` and `theme_facet()` * Documentation updates * Syntax-checking unit test * MIT licensing wmf 0.2.7 ========= * Changes which host MySQL functions connect to, depending on the database: - "db1047.eqiad.wmnet" for event logging data from "log" db - "analytics-store.eqiad.wmnet" (same as before) for wiki content * See [T176639](https://phabricator.wikimedia.org/T176639) for more details. wmf 0.2.6 ========= * Adds support for more MySQL config filenames since those vary between the different machines * Smarter about choosing a config file wmf 0.2.5 ========= * Fixes Hive query execution to remove messages/warnings. wmf 0.2.4 ========= * Ungroups grouped data frames when rewriting. See [T146422](https://phabricator.wikimedia.org/T146422) for more details. wmf 0.2.3 ========= * Fixes ggplot2 theme margin bug [discovered & fixed](https://github.com/wikimedia/wikimedia-discovery-wmf/pull/1) by Oliver Keyes. 
wmf 0.2.2 ========= * Updates `query_hive()` to support [JAR path overriding](https://wikitech.wikimedia.org/wiki/Analytics/Cluster/Hive/QueryUsingUDF#Testing_changes_to_existing_udf) * Updates the MySQL config file path so the package can now also be used on stat1003 * Updates maintainer contact info in README wmf 0.2.1 ========= * Adds a Contributor Code of Conduct wmf 0.2.0 ========= * Adds compatibility with RMySQL 0.9.4+ wmf 0.1.1 ========= * Fix a bug in global_query wmf 0.1.0 ========= Initial release diff --git a/R/chisq_test.R b/R/chisq_test.R index 59f7b7e..c3844be 100644 --- a/R/chisq_test.R +++ b/R/chisq_test.R @@ -1,168 +1,169 @@ -# nolint start -oddsRatio <- function(p_treatment, p_control) { - return((p_treatment / (1 - p_treatment)) / (p_control / (1 - p_control))) -} -pTreatment <- function(p_control, odds_ratio) { - return((odds_ratio * p_control) / ((p_control * (odds_ratio - 1)) + 1)) -} -pControl <- function(p_treatment, odds_ratio) { - return(1 / ((odds_ratio * ((1 / p_treatment) - 1)) + 1)) -} -# nolint end - #' @title Chi-square Test Sample Size Given Odds Ratio #' @description Calculates sample size for chi-squared test of independence #' given the odds ratio. #' @param odds_ratio The expected odds ratio. That is, the ratio of the odds of #' the outcome in the test group relative to the control group. Optional, #' but see *Details*. #' @param p_control Your guess for prevalence of outcome in the control group. #' Optional but see **Details**. #' @param p_treatment Your guess for prevalence of outcome in the test group. #' Optional but see **Details**. #' @param power The ability of the test to detect an effect where there is one. #' Power = 1 - Prob(Type 2 error). Optional. See **Value** for details. #' @param conf_level Desired confidence level. Defaults to 95%. #' @param sample_ratio Ratio of test group to control group. 1 is even split. 
#' @title Chi-square Test Sample Size Given Odds Ratio
#' @description Calculates sample size for chi-squared test of independence
#'   given the odds ratio.
#' @param odds_ratio The expected odds ratio. That is, the ratio of the odds of
#'   the outcome in the test group relative to the control group. Optional,
#'   but see **Details**.
#' @param p_control Your guess for prevalence of outcome in the control group.
#'   Optional but see **Details**.
#' @param p_treatment Your guess for prevalence of outcome in the test group.
#'   Optional but see **Details**.
#' @param power The ability of the test to detect an effect where there is one.
#'   Power = 1 - Prob(Type 2 error). Optional. See **Value** for details.
#' @param conf_level Desired confidence level. Defaults to 95%.
#' @param sample_ratio Ratio of test group to control group. 1 is even split.
#' @param visualize Whether to plot the sample size as a function of power.
#'   Only possible when `power` is missing or has more than one value;
#'   otherwise a warning is issued and nothing is drawn.
#' @section Details:
#'   The function only needs to know two of the following three: `odds_ratio`,
#'   `p_control`, and `p_treatment`. It will figure out the missing third value
#'   from the other two. If given all three, it checks that `odds_ratio` agrees
#'   with the odds ratio implied by `p_treatment` and `p_control` and warns
#'   when it does not (the supplied `odds_ratio` is still the one used).
#' @section References:
#'   Wang, H., Chow, S.-C., & Li, G. (2002). On sample size calculation based on
#'   odds ratio in clinical trials. *Journal of Biopharmaceutical
#'   Statistics*, **12**(4), 471-483.
#'   [doi:10.1081/BIP-120016231](http://doi.org/10.1081/BIP-120016231)
#' @return The total sample size (both groups combined), rounded up. If `power`
#'   was not provided, a vector with one sample size for each power from 50%
#'   to 99%, named by that power. If several `power` values were provided, one
#'   named element per value. If a single `power` was provided, a single
#'   unnamed sample size.
#' @examples
#' chisq_test_odds(p_treatment = 0.4, p_control = 0.25, power = 0.8)
#' chisq_test_odds(odds_ratio = 2, p_control = 0.4, power = c(0.8, 0.9, 0.95))
#' chisq_test_odds(odds_ratio = 2, p_control = 0.4)
#' chisq_test_odds(odds_ratio = 2, p_control = 0.4, visualize = TRUE)
#' @author Mikhail Popov
#' @seealso [chisq_test_effect()]
#' @export
chisq_test_odds <- function(
  odds_ratio = NULL,
  p_control = NULL,
  p_treatment = NULL,
  power = NULL,
  conf_level = 0.95,
  sample_ratio = 1,
  visualize = FALSE
) {
  # Checks
  power_missing <- is.null(power)
  odds_ratio_missing <- is.null(odds_ratio)
  p_control_missing <- is.null(p_control)
  p_treatment_missing <- is.null(p_treatment)
  n_missing <- odds_ratio_missing + p_control_missing + p_treatment_missing
  if (n_missing > 1) {
    stop("Only one of {odds_ratio, p_control, p_treatment} can be missing.")
  }

  # Conversions between the odds ratio and the two prevalences
  implied_odds_ratio <- function(p_treatment, p_control) {
    (p_treatment / (1 - p_treatment)) / (p_control / (1 - p_control))
  }
  implied_p_treatment <- function(p_control, odds_ratio) {
    (odds_ratio * p_control) / ((p_control * (odds_ratio - 1)) + 1)
  }
  implied_p_control <- function(p_treatment, odds_ratio) {
    1 / ((odds_ratio * ((1 / p_treatment) - 1)) + 1)
  }

  # When nothing is missing, the three inputs over-determine the problem:
  # verify they agree instead of silently mixing inconsistent values.
  if (n_missing == 0) {
    implied <- implied_odds_ratio(p_treatment, p_control)
    if (!isTRUE(all.equal(as.numeric(odds_ratio), as.numeric(implied)))) {
      warning(
        "'odds_ratio' (", paste(format(odds_ratio), collapse = ", "),
        ") does not match the odds ratio implied by 'p_treatment' and ",
        "'p_control' (", paste(format(implied), collapse = ", "),
        "); the supplied 'odds_ratio' is used as-is.",
        call. = FALSE
      )
    }
  }

  # Imputations
  if (power_missing) {
    power <- seq(0.5, 0.99, 0.01)
  }
  if (p_control_missing) {
    p_control <- implied_p_control(p_treatment, odds_ratio)
  } else if (p_treatment_missing) {
    p_treatment <- implied_p_treatment(p_control, odds_ratio)
  } else if (odds_ratio_missing) {
    odds_ratio <- implied_odds_ratio(p_treatment, p_control)
  }

  # Calculations (Wang, Chow & Li, 2002)
  x <- p_treatment * (1 - p_treatment) * sample_ratio
  y <- p_control * (1 - p_control)
  z_alpha <- stats::qnorm((1 - conf_level) / 2)
  z_beta <- stats::qnorm(1 - power)
  n_b <- (1 / x + 1 / y) * (((z_alpha + z_beta) ^ 2) / (log(odds_ratio) ^ 2))
  n_a <- sample_ratio * n_b
  n <- ceiling(n_a + n_b)

  # A curve can only be drawn (and names only make sense) for several powers
  several_powers <- power_missing || length(power) > 1

  # Visualization
  if (visualize) {
    if (several_powers) {
      ticks <- seq(0.5, 1, 0.1)
      graphics::plot(
        power, n, type = "l",
        main = "Sample size as function of statistical power",
        ylab = "N", xlab = "Power to detect effect",
        lwd = 2, xaxt = "n"
      )
      graphics::axis(
        side = 1, at = ticks,
        labels = sprintf("%.0f%%", 100 * ticks)
      )
      graphics::abline(
        v = ticks, lty = "dotted", col = "lightgray",
        lwd = graphics::par("lwd")
      )
    } else {
      warning("All parameters known. Nothing to visualize.")
    }
  }

  # Output
  if (several_powers) {
    names(n) <- sprintf("%.0f%%", power * 100)
  }
  n
}
#' @title Chi-square Test Sample Size Given Effect
#' @description Uses Cohen's w for effect size to calculate sample size for
#'   a chi-squared test of independence.
#' @param w Effect size you want the test to be able to detect. (Optional)
#'   May be a single value or a vector of values, each of which must be
#'   greater than 0.01.
#' @param groups Number of groups. Used in degrees of freedom calculation.
#'   Defaults to 2 (e.g. control group vs treatment group).
#' @param sig_level Probability of Type 1 error. Usually called alpha.
#'   Defaults to 0.05.
#' @param power Ability to detect the effect. (1 - probability of Type 2 error)
#'   A single value in (0.1, 1]. Defaults to 80%.
#' @return The sample size, rounded up. If `w` was not provided, a vector of
#'   sample sizes for the effect sizes 0.05, 0.1, 0.3 and 0.5, named "tiny",
#'   "small", "medium" and "large". If a vector of `w` was provided, one sample
#'   size per value, named by the names of `w` (or by the values themselves
#'   when `w` is unnamed). If a single `w` was provided, a single unnamed
#'   sample size.
#' @examples
#' chisq_test_effect()
#' chisq_test_effect(0.1)
#' chisq_test_effect(w = 0.1, groups = 3, sig_level = 0.001, power = 0.9)
#' chisq_test_effect(w = c(0.2, 0.4))
#' @importFrom pwr pwr.chisq.test
#' @author Mikhail Popov
#' @seealso [chisq_test_odds()]
#' @export
chisq_test_effect <- function(
  w = NULL,
  groups = 2,
  sig_level = 0.05,
  power = 0.8
) {
  # Checks. `w` may be a vector, so every element has to be validated;
  # a scalar `&&` on the raw comparison would only look at (or, on recent R,
  # fail on) a vector.
  w_missing <- is.null(w)
  if (!w_missing && (!is.numeric(w) || anyNA(w) || any(w <= 0.01))) {
    stop("'w' must be > 0.01")
  }
  if (!is.numeric(power) || length(power) != 1 || is.na(power) ||
      power <= 0.1 || power > 1.0) {
    stop("'power' must be in (0.1, 1]")
  }

  # Imputation: default grid of effect sizes with their descriptive labels
  if (w_missing) {
    w <- c(tiny = 0.05, small = 0.1, medium = 0.3, large = 0.5)
  }

  # Calculation
  sample_size_for <- function(effect) {
    ceiling(pwr::pwr.chisq.test(
      w = effect, N = NULL, df = groups - 1,
      sig.level = sig_level, power = power
    )$N)
  }
  n <- vapply(unname(w), sample_size_for, numeric(1))

  # Output: label only multi-value results, and never put the default
  # "tiny"/"small"/... labels on effect sizes the caller chose themselves.
  if (length(w) > 1) {
    names(n) <- if (is.null(names(w))) as.character(w) else names(w)
  }
  n
}
#' @title Query Hadoop cluster with Hive
#' @description Writes the query to a temporary `.hql` file in the working
#'   directory, runs it through the `hive` (or `beeline`) command-line client
#'   and reads the tab-separated results back into R.
#' @param query a Hive query
#' @param heap_size `HADOOP_HEAPSIZE`; default is 1024 (alt: 2048 or 4096)
#' @param use_nice Whether to use `nice` for less greedy CPU usage in a
#'   multi-user environment. The default is `TRUE`.
#' @param use_ionice Whether to use `ionice` for less greedy I/O in a
#'   multi-user environment. The default is `TRUE`.
#' @param use_beeline Whether to use `beeline` to connect with Hive instead of
#'   `hive`. The default is `FALSE`.
#' @param debug Whether to print the query and any messages/info which could be
#'   useful for debugging. In debug mode the client is not run in silent mode
#'   and the temporary query and results files are kept for inspection.
#' @section Escaping:
#' `query_hive` works by running the query you provide through the CLI via a
#' [system()] call. As a result, single escapes for meaningful characters
#' (such as quotes) within the query will not work: R will interpret them
#' only as escaping that character /within R/. Double escaping (\\\) is thus
#' necessary, in the same way that it is for regular expressions.
#' @return A `data.frame` containing the results of the query. The results are
#'   passed through the package's `stop_on_empty()` check before being
#'   returned.
# nolint start
#' @section Handling our hadoop/hive setup:
#' The `webrequests` table is documented [on Wikitech](https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive),
#' which also provides [a set of example queries](https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive/Queries). When it comes to manipulating the rows with Java before they get to you, Nuria has written a
#' [brief tutorial on loading UDFs](https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Hive/QueryUsingUDF)
#' which should help if you want to engage in that.
# nolint end
#' @seealso [lubridate::ymd_hms()] for converting the "dt" column in the
#'   webrequests table to proper datetime, and [mysql_read()] and
#'   [global_query()] for querying our MySQL databases
#' @examples
#' \dontrun{
#' query_hive("USE wmf; DESCRIBE webrequest;")
#' }
#' @export
query_hive <- function(query, heap_size = 1024,
                       use_nice = TRUE, use_ionice = TRUE,
                       use_beeline = FALSE, debug = FALSE) {

  message("Don't forget to authenticate with Kerberos using kinit")

  # Create tempfiles for the query and for the results.
  query_dump <- fs::file_temp(pattern = "temp_query", tmp_dir = ".", ext = ".hql")
  results_dump <- fs::file_temp(pattern = "temp_results", tmp_dir = ".", ext = ".tsv")
  if (!debug) {
    # Registered up front so the files are removed even if a later step fails.
    on.exit(unlink(as.character(c(query_dump, results_dump))), add = TRUE)
  }
  cat(query, file = query_dump)
  if (debug) {
    message("Query written to: ", query_dump)
    message("Results will be written to: ", results_dump)
  }

  # Pick the client; outside of debug mode it is run with its "silent" flag.
  if (use_beeline) {
    cli <- if (debug) "beeline" else "beeline --silent=true"
  } else {
    cli <- if (debug) "hive" else "hive -S"
  }
  if (use_nice) cli <- paste("nice", cli)
  if (use_ionice) cli <- paste("ionice", cli)

  # Query and read in the results. Because `2>&1` comes before `> file`,
  # stdout goes to the results file while stderr is what system() captures.
  cmd <- "export HADOOP_HEAPSIZE={heap_size} && {cli} -f {query_dump} 2>&1"
  cmd <- paste(cmd, "> {results_dump}")
  cmd <- glue::glue(cmd)
  if (debug) message("Command to run: ", cmd)
  std_err <- system(cmd, intern = TRUE)
  if (debug) message("stderr:\n\t", paste(std_err, collapse = "\n\t"))

  if (!fs::file_exists(results_dump)) {
    stop("The file '", results_dump, "' does not exist")
  }
  results <- utils::read.delim(
    results_dump,
    sep = "\t", quote = "", as.is = TRUE, header = TRUE
  )
  if (debug) {
    preview <- utils::capture.output(print(utils::head(results)))
    message(
      "First few rows of read-in results:\n",
      paste(preview, collapse = "\n")
    )
    message("Query and results files were not automatically deleted to allow for inspection.")
    message(glue::glue("Do not forget to clean up using:\nrm {query_dump}\nrm {results_dump}"))
  }

  stop_on_empty(results)
  return(results)
}
-#' @param date if `NULL`, yesterday will be used -#' @return a list containing two elements: "date_clause" and "date"; the -#' returning of the date allows you to include it -#' @seealso [extract_ymd()] -#' @export -date_clause <- function(date) { - warning("Deprecated; recommended to use `c(year, month, day) %<-% wmf::extract_ymd(date)` instead") - if (is.null(date)) { - date <- Sys.Date() - 1 - } - fragment <- sprintf( - "WHERE year = %s AND month = %s AND day = %s ", - lubridate::year(date), - lubridate::month(date), - lubridate::mday(date) - ) - output <- list(date_clause = fragment, date = date) - return(output) -} diff --git a/README.md b/README.md index b23a123..0150bd5 100644 --- a/README.md +++ b/README.md @@ -1,43 +1,43 @@ -# R Tools for Wikimedia Foundation's Analysts +# wmfdata: R tools for Wikimedia Foundation's Analysts & Data Scientists **NOTICE**: while this package is installed from GitHub ([wikimedia/wmfdata-r](https://github.com/wikimedia/wmfdata-r)), that repository is a read-only mirror of the [wikimedia/discovery/wmf](https://gerrit.wikimedia.org/g/wikimedia/discovery/wmf) repository hosted on [Gerrit](https://www.mediawiki.org/wiki/Gerrit). See [mediawiki:Developer account](https://www.mediawiki.org/wiki/Developer_account) for information about creating a Wikimedia Developer account for contributing to this package, MediaWiki, and other Wikimedia projects. Other packages from [Wikimedia Foundation's Product Analytics team](https://www.mediawiki.org/wiki/Product_Analytics) include [wmfdata](https://github.com/wikimedia/wmfdata-python) for working with Wikimedia data in Python, [waxer](https://github.com/wikimedia/waxer) for querying Wikimedia Analytics Query Service in R, and [wmfastr](https://github.com/wikimedia/wmfastr) for speedy dwelltime and search preference metrics calculations in R. 
## Installation ```R # install.packages("remotes", repos = c(CRAN = "https://cran.rstudio.com/")) remotes::install_github("wikimedia/wmfdata-r") ``` To update: ```R remotes::update_packages("wmfdata") ``` ## Highlights - `set_proxies` to set http(s) proxies on the analytics cluster - `global_query` for querying all of our MySQL databases - Utilities for working with logs, including EventLogging data: - `from_mediawiki` and `from_log` (and corresponding `to_*` functions) to convert between time formats - `query_hive` for querying our Hadoop cluster via Hive - `mysql_read` for querying our MariaDB databases - uses automatic shard detection, see `?connection_details` for more info - Sample size calculations: - `chisq_test_odds` estimates sample size for a chi-squared test given an odds ratio - `chisq_test_effect` estimates sample size for a chi-squared test given Cohen's *w* Also includes [Wikimedia Design visual style colors](https://design.wikimedia.org/style-guide/visual-style_colors.html): ![Color palettes included in the package based on Wikimedia Design Style Guide](palettes.png) ## Maintainers - [Mikhail Popov](https://meta.wikimedia.org/wiki/User:MPopov_(WMF)) ## Additional Information Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms. 
diff --git a/man/date_clause.Rd b/man/date_clause.Rd deleted file mode 100644 index 07f8029..0000000 --- a/man/date_clause.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hive.R -\name{date_clause} -\alias{date_clause} -\title{Generate a Date Clause for a Hive query} -\usage{ -date_clause(date) -} -\arguments{ -\item{date}{if \code{NULL}, yesterday will be used} -} -\value{ -a list containing two elements: "date_clause" and "date"; the -returning of the date allows you to include it -} -\description{ -What it says on the tin; generates a -\verb{WHERE year = foo AND month = bar} -that can then be combined with other elements to form a Hive query. -} -\seealso{ -\code{\link[=extract_ymd]{extract_ymd()}} -} diff --git a/man/wmfdata-package.Rd b/man/wmfdata-package.Rd index 1228938..7483f4d 100644 --- a/man/wmfdata-package.Rd +++ b/man/wmfdata-package.Rd @@ -1,35 +1,35 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/wmfdata-package.R \docType{package} \name{wmfdata-package} \alias{wmfdata} \alias{wmfdata-package} -\title{wmfdata: R Code for Wikimedia Foundation Internal Usage} +\title{wmfdata: R Tools For Wikimedia Foundation's Analysts And Data Scientists} \description{ -This package contains functions made for Analysts at Wikimedia - Foundation, but can be used by people outside of the Foundation. +This package contains functions made for Analysts and Scientists at + the Wikimedia Foundation. 
} \seealso{ Useful links: \itemize{ \item \url{https://gerrit.wikimedia.org/r/plugins/gitiles/wikimedia/discovery/wmf/} \item Report bugs at \url{https://phabricator.wikimedia.org/maniphest/task/create/?projects=Product-Analytics&assigned=mpopov} } } \author{ \strong{Maintainer}: Mikhail Popov \email{mikhail@wikimedia.org} Authors: \itemize{ \item Os Keyes (No longer employed at the Foundation) } Other contributors: \itemize{ \item Chelsy Xie (No longer employed at the Foundation) [contributor] \item Wikimedia Foundation [copyright holder] } } \keyword{internal} diff --git a/tests/testthat/mobile_app_data.rds b/tests/testthat/mobile_app_data.rds deleted file mode 100644 index c4485cc..0000000 Binary files a/tests/testthat/mobile_app_data.rds and /dev/null differ diff --git a/tests/testthat/test-queries.R b/tests/testthat/test-queries.R deleted file mode 100644 index 2b68cbf..0000000 --- a/tests/testthat/test-queries.R +++ /dev/null @@ -1,5 +0,0 @@ -context("Queries") - -test_that("date clause", { - expect_equal(date_clause(as.Date("2017-08-01"))$date_clause, "WHERE year = 2017 AND month = 8 AND day = 1 ") -}) diff --git a/tests/testthat/test-refinement.R b/tests/testthat/test-refinement.R deleted file mode 100644 index 4aa36b3..0000000 --- a/tests/testthat/test-refinement.R +++ /dev/null @@ -1,32 +0,0 @@ -context("Refinement") - -mobile_app_data <- readr::read_rds("mobile_app_data.rds") - -test_that("refine_eventlogs (universal parsers)", { - refined_eventlogs <- refine_eventlogs( - mobile_app_data, - dt_cols = c("timestamp", "event_client_dt"), - json_cols = c("userAgent", "event_languages") - ) - expect_true(tibble::is_tibble(refined_eventlogs)) - expect_true(lubridate::is.POSIXct(refined_eventlogs$timestamp)) - expect_true(lubridate::is.POSIXct(refined_eventlogs$client_dt)) - expect_equal(refined_eventlogs[["userAgent"]][[1]]$wmf_app_version, "2.7.235-r-2018-06-21") - expect_equal(refined_eventlogs[1, "languages"][[1]][[1]], "de") -}) - 
-test_that("refine_eventlogs (per-column parsers)", { - refined_eventlogs <- refine_eventlogs( - mobile_app_data, - dt_cols = list( - "timestamp" = from_mediawiki, - "event_client_dt" = lubridate::ymd_hms - ), - json_cols = c("userAgent", "event_languages") - ) - expect_true(tibble::is_tibble(refined_eventlogs)) - expect_true(lubridate::is.POSIXlt(refined_eventlogs$timestamp)) - expect_true(lubridate::is.POSIXct(refined_eventlogs$client_dt)) - expect_equal(refined_eventlogs[["userAgent"]][[1]]$wmf_app_version, "2.7.235-r-2018-06-21") - expect_equal(refined_eventlogs[1, "languages"][[1]][[1]], "de") -})