Page MenuHomePhabricator

WMCH contributors report 2020

Authored By
valerio.bozzolan
Sep 24 2020, 10:37 AM
Size
5 KB
Referenced Files
None
Subscribers
None

WMCH contributors report 2020

#!/usr/bin/php
<?php
// See
// https://phabricator.wikimedia.org/T263253
// load the suckless-php framework configuration file
// git clone https://gitpull.it/source/suckless-php/
require 'load.php';
// must be registered before this date (inclusive)
define( 'REGISTRATION_DATE_BEFORE', '2016-01-15' );
// must have some edits after this date (inclusive)
define( 'MIN_EDITS_RANGE_AFTER', '2020-01-15' );
// must have some edits before this date (inclusive)
define( 'MIN_EDITS_RANGE_BEFORE', '2021-01-15' );
// how much edits must have in that date range
define( 'MIN_EDITS_RANGE', 50 );
/**
* Generate the WMCH user report 2020 for a specific wiki
*
* @param string $wiki_codename Wiki codename like 'itwiki'
* @param string $wiki_database Wiki database name like 'itwiki_p'
*/
function generate_wmch_report_for_wiki( $wiki_codename, $wiki_database ) {
// name of the exported filename
$filename_export = sprintf(
'%s.csv',
$wiki_codename
);
// fields to be exported from the query and put in the CSV file
$EXPORT_FIELDS = [
'user_id',
'user_name',
'user_editcount',
// // these fields are not populated in replicas
// 'user_registration',
// 'user_touched',
];
// Abbiano più di 5 anni di vita Wikipediana (quindi registrati almeno prima del 15 gennaio 2016)
$max_user_registration_timestamp = strtotime( REGISTRATION_DATE_BEFORE );
// Con almeno 50 edit indifferenziati su almeno un singolo progetto tra il 15 gennaio 2020 e il 15 gennaio 2021
// (quindi se ne ha 20 su Wikipedia e 30 su Wikisource non va bene, ne deve avere almeno 50 su Wikipedia ad esempio).
$min_edits_range_timestamp_after = strtotime( MIN_EDITS_RANGE_AFTER );
$min_edits_range_timestamp_before = strtotime( MIN_EDITS_RANGE_BEFORE );
// must have at least one contribution before a specific date
$subquery_old_contributions = ( new Query() )
->from( 'revision' )
->equals( 'rev_actor', 'actor_id' )
->whereInt( 'rev_timestamp', $max_user_registration_timestamp, '<=' );
// search in an efficient way if the contributor has at least some edits in a period
// it selects "1" if the "LIMIT 50, 1" returns something
$subquery_contributions_in_period = ( new Query() )
->from( 'revision' )
->equals( 'rev_actor', 'actor_id' )
->whereInt( 'rev_timestamp', $min_edits_range_timestamp_after, '>=' )
->whereInt( 'rev_timestamp', $min_edits_range_timestamp_before, '<=' )
// Efficient "minimum number of rows" statement:
// We want to discover if this query has at least MIN_EDITS_RANGE rows
// but note that the COUNT(*) function means a full table scan. It's expensive. Don't do that.
// Why? Because COUNTing *all* his/her/its contributions just to discover if they are greater than "$min_edits" is inefficient.
// Instead, we can use the LIMIT clause to skip the "MIN_EDITS_RANGE - 1" rows and return just the next following row if it exists;
// so we will return zero or one row depending if this user has at least MIN_EDITS_RANGE contributions or not,
// and this is freacking efficient because the LIMIT clause uses indexes.
// -- boz
->limit( MIN_EDITS_RANGE - 1, 1 );
// the final query to the user table with all the conditions
$query = ( new Query() )
->select( $EXPORT_FIELDS )
->from( 'user' )
->joinOn( 'INNER', 'actor', 'actor_user', 'user_id' )
// // this does not work because the user_registration field is NULL in Wikimedia Foundation replicas
// ->whereInt( 'user_registration', $max_user_registration_timestamp, '<=' )
// must have at least one old contribution
->whereExists( $subquery_old_contributions )
// must have some number of contributions in the specified period
->whereExists( $subquery_contributions_in_period );
// select the desired database
DB::instance()->selectDB( $wiki_database );
// show the current query
message( "Executing on $wiki_database: {$query->getQuery()}" );
// export in a CSV file
$out = fopen( $filename_export, 'w' );
// put an heading in the file with the exported field names
fputcsv( $out, $EXPORT_FIELDS );
// export the users
$users = $query->queryGenerator();
foreach( $users as $user ) {
// export the user fields in the right order
$data = [];
foreach( $EXPORT_FIELDS as $field ) {
$data[] = $user->{ $field };
}
// write a line
fputcsv( $fp, $data );
}
// close the CSV file
fclose( $out );
}
$languages = [
'it',
'fr',
'en',
'de',
'als',
];
$wikis = [
'wiki',
'wikisource',
'wiktionary',
'wikivoyage',
'wikibooks',
'wikiquote',
'wikiversity',
'wikinews',
];
foreach( $languages as $language ) {
foreach( $wikis as $wiki ) {
// wiki codename like 'itwiki'
$wiki_codename = $language . $wiki;
// wiki database name like 'itwiki_p'
$wiki_database = $wiki_codename . '_p';
// check if the database exists
$wiki_exists = query_row( sprintf(
"SELECT 1 FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = '%s'",
esc_sql( $wiki_database )
) );
// does the database exists?
if( $wiki_exists ) {
message( "Processing $wiki_database" );
// execute the report
generate_wmch_report_for_wiki( $wiki_codename, $wiki_database );
} else {
// nope, it does not exist
// do nothing
print( "Skip unexisting $wiki_database" );
}
}
}
/**
* Print a message
*
* @param string $message
*/
function message( $message ) {
printf(
"[%s] %s\n",
date( 'Y-m-d H:i:s' ),
$message
);
}

File Metadata

Mime Type
text/plain; charset=utf-8
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
8575413
Default Alt Text
WMCH contributors report 2020 (5 KB)

Event Timeline