Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F32362179
WMCH contributors report 2020
No One
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Authored By
valerio.bozzolan
Sep 24 2020, 2:57 PM
2020-09-24 14:57:10 (UTC+0)
Size
7 KB
Referenced Files
None
Subscribers
None
WMCH contributors report 2020
View Options
#!/usr/bin/php
<?php
// See
// https://phabricator.wikimedia.org/T263253

// Load the suckless-php framework configuration file.
// git clone https://gitpull.it/source/suckless-php/
require 'load.php';

// Must be registered before this date (inclusive).
define( 'REGISTRATION_DATE_BEFORE', '2016-01-15' );

// Must have some edits after this date (inclusive).
define( 'MIN_EDITS_RANGE_AFTER', '2020-01-15' );

// Must have some edits before this date (inclusive).
define( 'MIN_EDITS_RANGE_BEFORE', '2021-01-15' );

// How many edits the user must have in that date range.
define( 'MIN_EDITS_RANGE', 50 );

// Process this number of users at a time.
// Note that if you have very strict filters this number should be very low.
define( 'USERS_BATCH', 10 );

// Wiki language codes to be scanned.
$languages = [
    'it',
    'fr',
    // 'en',
    'de',
    'als',
];

// Wiki project families to be scanned.
$projects = [
    'wiki',
    'wikisource',
    'wiktionary',
    'wikivoyage',
    'wikibooks',
    'wikiquote',
    'wikiversity',
    'wikinews',
];

// Run the report over every language / project combination.
foreach ( $languages as $language ) {
    foreach ( $projects as $project ) {

        // Wiki codename like 'itwiki' and its database name like 'itwiki_p'.
        $wiki_codename = $language . $project;
        $wiki_database = $wiki_codename . '_p';

        message( "Processing $wiki_database" );

        // Try to execute the report. A failure on a single wiki
        // (e.g. a missing database) must not stop the whole batch.
        try {
            generate_wmch_report_for_wiki( $wiki_codename, $wiki_database );
        } catch ( Exception $e ) {
            message( $e->getMessage() );
        }
    }
}
/**
 * Generate the WMCH user report 2020 for a specific wiki
 *
 * The matching users are appended to a '<codename>.csv' file.
 * If that file already exists, the export resumes after the last
 * user ID it contains.
 *
 * @param string $wiki_codename Wiki codename like 'itwiki'
 * @param string $wiki_database Wiki database name like 'itwiki_p'
 * @throws Exception If the database does not exist, or the CSV file
 *                   cannot be opened or locked
 */
function generate_wmch_report_for_wiki( $wiki_codename, $wiki_database ) {

    // check if the database exists
    $wiki_exists = query_row( sprintf(
        "SELECT 1 FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = '%s'",
        esc_sql( $wiki_database )
    ) );

    // no wiki no party
    if ( !$wiki_exists ) {
        throw new Exception( "missing database $wiki_database" );
    }

    // name of the exported filename
    $filename_export = sprintf( '%s.csv', $wiki_codename );

    // check if the file already exists
    $file_exists = file_exists( $filename_export );

    // last processed user id
    $last_user_id = 0;

    // do not process twice a file: resume from the last exported user ID
    if ( $file_exists ) {
        message( "continuing already existing file $filename_export" );
        $last_user_id = last_user_id_from_filename( $filename_export );
    }

    // fields to be exported from the query and put in the CSV file
    $EXPORT_FIELDS = [
        'user_id',
        'user_name',
        'user_editcount',
        // // these fields are not populated in replicas
        // 'user_registration',
        // 'user_touched',
    ];

    // Must have more than 5 years of wiki activity
    // (so, registered at least before 2016-01-15).
    $max_user_registration_timestamp = date_to_mysql_timestamp( REGISTRATION_DATE_BEFORE );

    // Must have at least MIN_EDITS_RANGE undifferentiated edits on at least one
    // single project between 2020-01-15 and 2021-01-15 (so 20 edits on Wikipedia
    // plus 30 on Wikisource is not enough: at least 50 on e.g. Wikipedia alone).
    $min_edits_range_timestamp_after  = date_to_mysql_timestamp( MIN_EDITS_RANGE_AFTER );
    $min_edits_range_timestamp_before = date_to_mysql_timestamp( MIN_EDITS_RANGE_BEFORE );

    // must have at least one contribution before a specific date
    $subquery_old_contributions =
        ( new Query() )
            ->from( 'revision_userindex' )
            ->equals( 'rev_actor', 'actor_id' )
            ->whereStr( 'rev_timestamp', $max_user_registration_timestamp, '<=' );

    // search in an efficient way if the contributor has at least some edits in a period
    // it selects "1" if the "LIMIT 50, 1" returns something
    $subquery_contributions_in_period =
        ( new Query() )
            ->from( 'revision_userindex' )
            ->equals( 'rev_actor', 'actor_id' )
            ->whereStr( 'rev_timestamp', $min_edits_range_timestamp_after,  '>=' )
            ->whereStr( 'rev_timestamp', $min_edits_range_timestamp_before, '<=' )
            // Efficient "minimum number of rows" statement:
            // We want to discover if this query has at least MIN_EDITS_RANGE rows
            // but note that the COUNT(*) function means a full table scan. It's expensive. Don't do that.
            // Why? Because COUNTing *all* his/her/its contributions just to discover if they are greater than "$min_edits" is inefficient.
            // Instead, we can use the LIMIT clause to skip the "MIN_EDITS_RANGE - 1" rows and return just the next following row if it exists;
            // so we will return zero or one row depending if this user has at least MIN_EDITS_RANGE contributions or not,
            // and this is freacking efficient because the LIMIT clause uses indexes.
            // -- boz
            ->limit( MIN_EDITS_RANGE - 1, 1 );

    // export in a CSV file (append mode, to allow resuming)
    $out = fopen( $filename_export, 'a' );
    if ( $out === false ) {
        throw new Exception( "cannot open file $filename_export" );
    }

    // lock the file as exclusive write mode
    // LOCK_EX: exclusive flag
    // LOCK_NB: make the call non-blocking
    if ( !flock( $out, LOCK_EX | LOCK_NB ) ) {
        // fix: do not leak the open file handle when the lock fails
        fclose( $out );
        throw new Exception( "cannot lock file $filename_export" );
    }

    // put an heading in the file with the exported field names
    if ( !$file_exists ) {
        fputcsv( $out, $EXPORT_FIELDS );
    }

    // process a reduced set of users at time
    do {
        // the final query to the user table with all the conditions
        $query =
            ( new Query() )
                ->select( $EXPORT_FIELDS )
                ->from( 'user' )
                ->joinOn( 'INNER', 'actor', 'actor_user', 'user_id' )
                // // this does not work because the user_registration field is NULL in Wikimedia Foundation replicas
                // ->whereInt( 'user_registration', $max_user_registration_timestamp, '<=' )
                // continue the last bulk
                // this is zero if it's the first time and don't care
                ->whereInt( 'user_id', $last_user_id, '>' )
                // must have at least one old contribution
                ->whereExists( $subquery_old_contributions )
                // must have some number of contributions in the specified period
                ->whereExists( $subquery_contributions_in_period )
                // sort by user ID
                ->orderBy( 'user_id' )
                // process this number of users
                ->limit( USERS_BATCH );

        // select the desired database
        DB::instance()->selectDB( $wiki_database );

        // show the current query
        message( "Executing query on $wiki_database after user $last_user_id" );

        // query the results
        $users = $query->queryGenerator();

        // last processed user from the next loop
        $user = null;

        // loop the users
        foreach ( $users as $user ) {

            // export the user fields in the right order
            $data = [];
            foreach ( $EXPORT_FIELDS as $field ) {
                $data[] = $user->{$field};
            }

            // write a line
            fputcsv( $out, $data );
        }
        // end loop users

        // on the last processed user remember its ID
        if ( $user ) {
            $last_user_id = $user->user_id;
        }
    } while ( $user );
    // end query splitter

    // close the CSV file (this also releases the lock)
    fclose( $out );
}
/**
 * Print a timestamped message on standard output
 *
 * @param string $message Text of the message
 */
function message( $message ) {
    // prefix each line with the current date and time
    $now = date( 'Y-m-d H:i:s' );
    printf( "[%s] %s\n", $now, $message );
}
/**
 * Read the last non-empty line of a file
 *
 * The file is scanned backward from its end, so this is efficient even on
 * very large files. Any run of trailing newlines is skipped, so a file
 * written by fputcsv() (which always ends with "\n") gives its last data line.
 *
 * See https://stackoverflow.com/a/1510204/3451846
 *
 * Fixes over the previous implementation:
 * - the very last byte is not read twice anymore (it was duplicated in the
 *   result when the file did not end with a newline)
 * - the first bytes of the file are not skipped anymore (the result was
 *   truncated when the last line was also the first line)
 * - an unreadable file no longer triggers warnings: it behaves as empty
 *
 * @param string $file File pathname
 * @return string The last line without its newline, or an empty string
 */
function last_file_line( $file ) {

    $last_line = '';

    $fp = fopen( $file, 'r' );
    if ( $fp === false ) {
        // unreadable or missing file: behave as an empty one
        return $last_line;
    }

    // size of the file in bytes
    $size = fstat( $fp )['size'];

    // loop backward until "\n" is found
    // but if the last line(s) are empty, skip them
    for ( $pos = $size - 1; $pos >= 0; $pos-- ) {
        fseek( $fp, $pos );
        $c = fgetc( $fp );
        if ( $c === "\n" ) {
            // found the newline terminating the previous line: done
            if ( $last_line !== '' ) {
                break;
            }
            // trailing newline(s): skip
            continue;
        }
        // prepend, since we are walking backward
        $last_line = $c . $last_line;
    }

    fclose( $fp );

    return $last_line;
}
/**
 * Try to extract the last user ID from a CSV of User IDs in the first column
 *
 * It returns zero (falsy) if the user id has no sense.
 *
 * @param string $filename
 * @return int|false
 */
function last_user_id_from_filename( $filename ) {

    // try to read the last user ID from the last line of the file
    $last_line = last_file_line( $filename );

    // no last line, no user ID
    if ( !$last_line ) {
        return false;
    }

    // the user ID is the first CSV column (zero when it's not numeric)
    $columns = str_getcsv( $last_line );
    return (int) $columns[0];
}
/**
 * Convert a date expression to a MySQL/MediaWiki binary timestamp
 *
 * Example: '2016-01-15' becomes '20160115000000'.
 *
 * The expression is parsed and formatted in the default timezone, so a plain
 * date always maps to its own local midnight.
 *
 * @param string $date Date expression parsable by strtotime(), like '2016-01-15'
 * @return string Timestamp in 'YmdHis' format
 * @throws Exception If the date expression cannot be parsed
 */
function date_to_mysql_timestamp( $date ) {

    $time = strtotime( $date );

    // fix: do not silently turn garbage input into an epoch timestamp
    if ( $time === false ) {
        throw new Exception( "invalid date expression $date" );
    }

    return date( 'YmdHis', $time );
}
File Metadata
Details
Attached
Mime Type
text/plain; charset=utf-8
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
8575523
Default Alt Text
WMCH contributors report 2020 (7 KB)
Attached To
Mode
P12790 WMCH contributors report 2020
Attached
Detach File
Event Timeline
Log In to Comment