Page MenuHomePhabricator
Paste P12790

WMCH contributors report 2020
ActivePublic

Authored by valerio.bozzolan on Sep 24 2020, 10:37 AM.
#!/usr/bin/php
<?php
// See
// https://phabricator.wikimedia.org/T263253
// load the suckless-php framework configuration file
// git clone https://gitpull.it/source/suckless-php/
require 'load.php';
// must be registered before this date (inclusive)
define( 'REGISTRATION_DATE_BEFORE', '2016-01-15' );
// must have some edits after this date (inclusive)
define( 'MIN_EDITS_RANGE_AFTER', '2020-01-15' );
// must have some edits before this date (inclusive)
define( 'MIN_EDITS_RANGE_BEFORE', '2021-01-15' );
// how much edits must have in that date range
define( 'MIN_EDITS_RANGE', 50 );
// process this number of users at time
// note that if you have very strict filters this number should be very low
define( 'USERS_BATCH', 10 );
$languages = [
'it',
'fr',
// 'en',
'de',
'als',
];
$projects = [
'wiki',
'wikisource',
'wiktionary',
'wikivoyage',
'wikibooks',
'wikiquote',
'wikiversity',
'wikinews',
];
// for each language
foreach( $languages as $language ) {
// for each project
foreach( $projects as $project ) {
// wiki codename like 'itwiki'
$wiki_codename = $language . $project;
// wiki database name like 'itwiki_p'
$wiki_database = $wiki_codename . '_p';
message( "Processing $wiki_database" );
try {
// try to execute the report
generate_wmch_report_for_wiki( $wiki_codename, $wiki_database );
} catch( Exception $e ) {
message( $e->getMessage() );
}
}
}
/**
* Generate the WMCH user report 2020 for a specific wiki
*
* @param string $wiki_codename Wiki codename like 'itwiki'
* @param string $wiki_database Wiki database name like 'itwiki_p'
*/
function generate_wmch_report_for_wiki( $wiki_codename, $wiki_database ) {
// check if the database exists
$wiki_exists = query_row( sprintf(
"SELECT 1 FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = '%s'",
esc_sql( $wiki_database )
) );
// no wiki no party
if( !$wiki_exists ) {
throw new Exception( "missing database $wiki_database" );
}
// name of the exported filename
$filename_export = sprintf(
'%s.csv',
$wiki_codename
);
// check if the file already exists
$file_exists = file_exists( $filename_export );
// last processed user id
$last_user_id = 0;
// do not process twice a file
if( $file_exists ) {
message( "continuing already existing file $filename_export" );
$last_user_id = last_user_id_from_filename( $filename_export );
}
// fields to be exported from the query and put in the CSV file
$EXPORT_FIELDS = [
'user_id',
'user_name',
'user_editcount',
// // these fields are not populated in replicas
// 'user_registration',
// 'user_touched',
];
// Abbiano più di 5 anni di vita Wikipediana (quindi registrati almeno prima del 15 gennaio 2016)
$max_user_registration_timestamp = date_to_mysql_timestamp( REGISTRATION_DATE_BEFORE );
// Con almeno 50 edit indifferenziati su almeno un singolo progetto tra il 15 gennaio 2020 e il 15 gennaio 2021
// (quindi se ne ha 20 su Wikipedia e 30 su Wikisource non va bene, ne deve avere almeno 50 su Wikipedia ad esempio).
$min_edits_range_timestamp_after = date_to_mysql_timestamp( MIN_EDITS_RANGE_AFTER );
$min_edits_range_timestamp_before = date_to_mysql_timestamp( MIN_EDITS_RANGE_BEFORE );
// must have at least one contribution before a specific date
$subquery_old_contributions = ( new Query() )
->from( 'revision_userindex' )
->equals( 'rev_actor', 'actor_id' )
->whereStr( 'rev_timestamp', $max_user_registration_timestamp, '<=' );
// search in an efficient way if the contributor has at least some edits in a period
// it selects "1" if the "LIMIT 50, 1" returns something
$subquery_contributions_in_period = ( new Query() )
->from( 'revision_userindex' )
->equals( 'rev_actor', 'actor_id' )
->whereStr( 'rev_timestamp', $min_edits_range_timestamp_after, '>=' )
->whereStr( 'rev_timestamp', $min_edits_range_timestamp_before, '<=' )
// Efficient "minimum number of rows" statement:
// We want to discover if this query has at least MIN_EDITS_RANGE rows
// but note that the COUNT(*) function means a full table scan. It's expensive. Don't do that.
// Why? Because COUNTing *all* his/her/its contributions just to discover if they are greater than "$min_edits" is inefficient.
// Instead, we can use the LIMIT clause to skip the "MIN_EDITS_RANGE - 1" rows and return just the next following row if it exists;
// so we will return zero or one row depending if this user has at least MIN_EDITS_RANGE contributions or not,
// and this is freacking efficient because the LIMIT clause uses indexes.
// -- boz
->limit( MIN_EDITS_RANGE - 1, 1 );
// export in a CSV file
$out = fopen( $filename_export, 'a' );
if( $out === false ) {
throw new Exception( "cannot open file $filename_export" );
}
// lock the file as exclusive write mode
// LOCK_EX: exclusive flag
// LOCK_NB: make the call non-blocking
if( !flock( $out, LOCK_EX|LOCK_NB ) ) {
throw new Exception( "cannot lock file $filename_export" );
}
// put an heading in the file with the exported field names
if( !$file_exists ) {
fputcsv( $out, $EXPORT_FIELDS );
}
// process a reduced set of users at time
do {
// the final query to the user table with all the conditions
$query = ( new Query() )
->select( $EXPORT_FIELDS )
->from( 'user' )
->joinOn( 'INNER', 'actor', 'actor_user', 'user_id' )
// // this does not work because the user_registration field is NULL in Wikimedia Foundation replicas
// ->whereInt( 'user_registration', $max_user_registration_timestamp, '<=' )
// continue the last bulk
// this is zero if its the first time and don't care
->whereInt( 'user_id', $last_user_id, '>' )
// must have at least one old contribution
->whereExists( $subquery_old_contributions )
// must have some number of contributions in the specified period
->whereExists( $subquery_contributions_in_period )
// sort by user ID
->orderBy( 'user_id' )
// process this number of users
->limit( USERS_BATCH );
// select the desired database
DB::instance()->selectDB( $wiki_database );
// show the current query
message( "Executing query on $wiki_database after user $last_user_id" );
// query the results
$users = $query->queryGenerator();
// last processed user from the next loop
$user = null;
// loop the users
foreach( $users as $user ) {
// export the user fields in the right order
$data = [];
foreach( $EXPORT_FIELDS as $field ) {
$data[] = $user->{ $field };
}
// write a line
fputcsv( $out, $data );
}
// end loop users
// on the last processed user remember its ID
if( $user ) {
$last_user_id = $user->user_id;
}
} while( $user );
// end query splitter
// close the CSV file
fclose( $out );
}
/**
* Print a message
*
* @param string $message
*/
function message( $message ) {
printf(
"[%s] %s\n",
date( 'Y-m-d H:i:s' ),
$message
);
}
/**
* Read the last line of a file
*
* See https://stackoverflow.com/a/1510204/3451846
*
* @param string $file File pathname
* @return string
*/
function last_file_line( $file ) {
$last_line = '';
$fp = fopen( $file, 'r' );
// start from the end of the file
fseek( $fp, -1, SEEK_END );
$pos = ftell( $fp );
// loop backword util "\n" is found
// but if the last line is empty, skip it
while( ( ( $c = fgetc( $fp ) ) !== "\n" || $last_line === '' ) && $pos > 0 ) {
if( $c !== "\n" ) {
$last_line = $c . $last_line;
}
fseek( $fp, $pos-- );
}
fclose( $fp );
return $last_line;
}
/**
* Try to extract the last user ID from a CSV of User IDs in the first column
*
* It returns zero (falsy) if the user id has no sense.
*
* @param string $filename
* @return int|false
*/
function last_user_id_from_filename( $filename ) {
$user_id = false;
// try to read the last user ID
$last_line = last_file_line( $filename );
if( $last_line ) {
$user_id = (int) str_getcsv( $last_line )[0];
}
return $user_id;
}
function date_to_mysql_timestamp( $date ) {
$time = strtotime( $date );
return date( 'YmdHis', $time );
}