WMCH contributors report 2020
ActivePublic
Actions

Authored by valerio.bozzolan on Sep 24 2020, 10:37 AM.

Tags

Referenced Files

	F32362179: WMCH contributors report 2020
	Sep 24 2020, 2:57 PM

	F32361937: WMCH contributors report 2020
	Sep 24 2020, 10:37 AM

Subscribers

None

	#!/usr/bin/php
	<?php

	// See
	// https://phabricator.wikimedia.org/T263253

	// load the suckless-php framework configuration file
	// git clone https://gitpull.it/source/suckless-php/
	require 'load.php';

	// must be registered before this date (inclusive)
	define( 'REGISTRATION_DATE_BEFORE', '2016-01-15' );

	// must have some edits after this date (inclusive)
	define( 'MIN_EDITS_RANGE_AFTER', '2020-01-15' );

	// must have some edits before this date (inclusive)
	define( 'MIN_EDITS_RANGE_BEFORE', '2021-01-15' );

	// how much edits must have in that date range
	define( 'MIN_EDITS_RANGE', 50 );

	// process this number of users at time
	// note that if you have very strict filters this number should be very low
	define( 'USERS_BATCH', 10 );

	$languages = [
	'it',
	'fr',
	// 'en',
	'de',
	'als',
	];

	$projects = [
	'wiki',
	'wikisource',
	'wiktionary',
	'wikivoyage',
	'wikibooks',
	'wikiquote',
	'wikiversity',
	'wikinews',
	];

	// for each language
	foreach( $languages as $language ) {

	// for each project
	foreach( $projects as $project ) {

	// wiki codename like 'itwiki'
	$wiki_codename = $language . $project;

	// wiki database name like 'itwiki_p'
	$wiki_database = $wiki_codename . '_p';

	message( "Processing $wiki_database" );

	try {
	// try to execute the report
	generate_wmch_report_for_wiki( $wiki_codename, $wiki_database );
	} catch( Exception $e ) {
	message( $e->getMessage() );
	}
	}
	}

	/**
	* Generate the WMCH user report 2020 for a specific wiki
	*
	* @param string $wiki_codename Wiki codename like 'itwiki'
	* @param string $wiki_database Wiki database name like 'itwiki_p'
	*/
	function generate_wmch_report_for_wiki( $wiki_codename, $wiki_database ) {

	// check if the database exists
	$wiki_exists = query_row( sprintf(
	"SELECT 1 FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = '%s'",
	esc_sql( $wiki_database )
	) );

	// no wiki no party
	if( !$wiki_exists ) {
	throw new Exception( "missing database $wiki_database" );
	}

	// name of the exported filename
	$filename_export = sprintf(
	'%s.csv',
	$wiki_codename
	);

	// check if the file already exists
	$file_exists = file_exists( $filename_export );

	// last processed user id
	$last_user_id = 0;

	// do not process twice a file
	if( $file_exists ) {

	message( "continuing already existing file $filename_export" );

	$last_user_id = last_user_id_from_filename( $filename_export );
	}

	// fields to be exported from the query and put in the CSV file
	$EXPORT_FIELDS = [
	'user_id',
	'user_name',
	'user_editcount',

	// // these fields are not populated in replicas
	// 'user_registration',
	// 'user_touched',
	];

	// Abbiano più di 5 anni di vita Wikipediana (quindi registrati almeno prima del 15 gennaio 2016)
	$max_user_registration_timestamp = date_to_mysql_timestamp( REGISTRATION_DATE_BEFORE );

	// Con almeno 50 edit indifferenziati su almeno un singolo progetto tra il 15 gennaio 2020 e il 15 gennaio 2021
	// (quindi se ne ha 20 su Wikipedia e 30 su Wikisource non va bene, ne deve avere almeno 50 su Wikipedia ad esempio).
	$min_edits_range_timestamp_after = date_to_mysql_timestamp( MIN_EDITS_RANGE_AFTER );
	$min_edits_range_timestamp_before = date_to_mysql_timestamp( MIN_EDITS_RANGE_BEFORE );

	// must have at least one contribution before a specific date
	$subquery_old_contributions = ( new Query() )
	->from( 'revision_userindex' )
	->equals( 'rev_actor', 'actor_id' )
	->whereStr( 'rev_timestamp', $max_user_registration_timestamp, '<=' );

	// search in an efficient way if the contributor has at least some edits in a period
	// it selects "1" if the "LIMIT 50, 1" returns something
	$subquery_contributions_in_period = ( new Query() )
	->from( 'revision_userindex' )
	->equals( 'rev_actor', 'actor_id' )
	->whereStr( 'rev_timestamp', $min_edits_range_timestamp_after, '>=' )
	->whereStr( 'rev_timestamp', $min_edits_range_timestamp_before, '<=' )

	// Efficient "minimum number of rows" statement:
	// We want to discover if this query has at least MIN_EDITS_RANGE rows
	// but note that the COUNT(*) function means a full table scan. It's expensive. Don't do that.
	// Why? Because COUNTing all his/her/its contributions just to discover if they are greater than "$min_edits" is inefficient.
	// Instead, we can use the LIMIT clause to skip the "MIN_EDITS_RANGE - 1" rows and return just the next following row if it exists;
	// so we will return zero or one row depending if this user has at least MIN_EDITS_RANGE contributions or not,
	// and this is freacking efficient because the LIMIT clause uses indexes.
	// -- boz
	->limit( MIN_EDITS_RANGE - 1, 1 );

	// export in a CSV file
	$out = fopen( $filename_export, 'a' );
	if( $out === false ) {
	throw new Exception( "cannot open file $filename_export" );
	}

	// lock the file as exclusive write mode
	// LOCK_EX: exclusive flag
	// LOCK_NB: make the call non-blocking
	if( !flock( $out, LOCK_EX\|LOCK_NB ) ) {
	throw new Exception( "cannot lock file $filename_export" );
	}

	// put an heading in the file with the exported field names
	if( !$file_exists ) {
	fputcsv( $out, $EXPORT_FIELDS );
	}

	// process a reduced set of users at time
	do {

	// the final query to the user table with all the conditions
	$query = ( new Query() )
	->select( $EXPORT_FIELDS )
	->from( 'user' )
	->joinOn( 'INNER', 'actor', 'actor_user', 'user_id' )

	// // this does not work because the user_registration field is NULL in Wikimedia Foundation replicas
	// ->whereInt( 'user_registration', $max_user_registration_timestamp, '<=' )

	// continue the last bulk
	// this is zero if its the first time and don't care
	->whereInt( 'user_id', $last_user_id, '>' )

	// must have at least one old contribution
	->whereExists( $subquery_old_contributions )

	// must have some number of contributions in the specified period
	->whereExists( $subquery_contributions_in_period )

	// sort by user ID
	->orderBy( 'user_id' )

	// process this number of users
	->limit( USERS_BATCH );

	// select the desired database
	DB::instance()->selectDB( $wiki_database );

	// show the current query
	message( "Executing query on $wiki_database after user $last_user_id" );

	// query the results
	$users = $query->queryGenerator();

	// last processed user from the next loop
	$user = null;

	// loop the users
	foreach( $users as $user ) {

	// export the user fields in the right order
	$data = [];
	foreach( $EXPORT_FIELDS as $field ) {
	$data[] = $user->{ $field };
	}

	// write a line
	fputcsv( $out, $data );
	}
	// end loop users

	// on the last processed user remember its ID
	if( $user ) {
	$last_user_id = $user->user_id;
	}

	} while( $user );
	// end query splitter

	// close the CSV file
	fclose( $out );
	}

	/**
	* Print a message
	*
	* @param string $message
	*/
	function message( $message ) {
	printf(
	"[%s] %s\n",
	date( 'Y-m-d H:i:s' ),
	$message
	);
	}

	/**
	* Read the last line of a file
	*
	* See https://stackoverflow.com/a/1510204/3451846
	*
	* @param string $file File pathname
	* @return string
	*/
	function last_file_line( $file ) {

	$last_line = '';

	$fp = fopen( $file, 'r' );

	// start from the end of the file
	fseek( $fp, -1, SEEK_END );
	$pos = ftell( $fp );

	// loop backword util "\n" is found
	// but if the last line is empty, skip it
	while( ( ( $c = fgetc( $fp ) ) !== "\n" \|\| $last_line === '' ) && $pos > 0 ) {

	if( $c !== "\n" ) {
	$last_line = $c . $last_line;
	}

	fseek( $fp, $pos-- );
	}

	fclose( $fp );

	return $last_line;
	}

	/**
	* Try to extract the last user ID from a CSV of User IDs in the first column
	*
	* It returns zero (falsy) if the user id has no sense.
	*
	* @param string $filename
	* @return int\|false
	*/
	function last_user_id_from_filename( $filename ) {

	$user_id = false;

	// try to read the last user ID
	$last_line = last_file_line( $filename );
	if( $last_line ) {
	$user_id = (int) str_getcsv( $last_line )[0];
	}

	return $user_id;
	}

	function date_to_mysql_timestamp( $date ) {

	$time = strtotime( $date );

	return date( 'YmdHis', $time );

	}

Event Timeline

valerio.bozzolan created this paste.Sep 24 2020, 10:37 AM

valerio.bozzolan mentioned this in T263253: WMCH 2020 contributors report.

valerio.bozzolan edited the content of this paste. (Show Details)Sep 24 2020, 2:57 PM

valerio.bozzolan mentioned this in T271499: WMCH January-December 2020 contributors report.Jan 8 2021, 12:18 AM

WMCH contributors report 2020ActivePublicActions

Event Timeline

WMCH contributors report 2020
ActivePublic
Actions