Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F32361937
WMCH contributors report 2020
No One
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Authored By
valerio.bozzolan
Sep 24 2020, 10:37 AM
2020-09-24 10:37:25 (UTC+0)
Size
5 KB
Referenced Files
None
Subscribers
None
WMCH contributors report 2020
View Options
#!/usr/bin/php
<?php
// See
// https://phabricator.wikimedia.org/T263253
// load the suckless-php framework configuration file
// git clone https://gitpull.it/source/suckless-php/
require
'load.php'
;
// must be registered before this date (inclusive)
define
(
'REGISTRATION_DATE_BEFORE'
,
'2016-01-15'
);
// must have some edits after this date (inclusive)
define
(
'MIN_EDITS_RANGE_AFTER'
,
'2020-01-15'
);
// must have some edits before this date (inclusive)
define
(
'MIN_EDITS_RANGE_BEFORE'
,
'2021-01-15'
);
// how much edits must have in that date range
define
(
'MIN_EDITS_RANGE'
,
50
);
/**
* Generate the WMCH user report 2020 for a specific wiki
*
* @param string $wiki_codename Wiki codename like 'itwiki'
* @param string $wiki_database Wiki database name like 'itwiki_p'
*/
function
generate_wmch_report_for_wiki
(
$wiki_codename
,
$wiki_database
)
{
// name of the exported filename
$filename_export
=
sprintf
(
'%s.csv'
,
$wiki_codename
);
// fields to be exported from the query and put in the CSV file
$EXPORT_FIELDS
=
[
'user_id'
,
'user_name'
,
'user_editcount'
,
// // these fields are not populated in replicas
// 'user_registration',
// 'user_touched',
];
// Abbiano più di 5 anni di vita Wikipediana (quindi registrati almeno prima del 15 gennaio 2016)
$max_user_registration_timestamp
=
strtotime
(
REGISTRATION_DATE_BEFORE
);
// Con almeno 50 edit indifferenziati su almeno un singolo progetto tra il 15 gennaio 2020 e il 15 gennaio 2021
// (quindi se ne ha 20 su Wikipedia e 30 su Wikisource non va bene, ne deve avere almeno 50 su Wikipedia ad esempio).
$min_edits_range_timestamp_after
=
strtotime
(
MIN_EDITS_RANGE_AFTER
);
$min_edits_range_timestamp_before
=
strtotime
(
MIN_EDITS_RANGE_BEFORE
);
// must have at least one contribution before a specific date
$subquery_old_contributions
=
(
new
Query
()
)
->
from
(
'revision'
)
->
equals
(
'rev_actor'
,
'actor_id'
)
->
whereInt
(
'rev_timestamp'
,
$max_user_registration_timestamp
,
'<='
);
// search in an efficient way if the contributor has at least some edits in a period
// it selects "1" if the "LIMIT 50, 1" returns something
$subquery_contributions_in_period
=
(
new
Query
()
)
->
from
(
'revision'
)
->
equals
(
'rev_actor'
,
'actor_id'
)
->
whereInt
(
'rev_timestamp'
,
$min_edits_range_timestamp_after
,
'>='
)
->
whereInt
(
'rev_timestamp'
,
$min_edits_range_timestamp_before
,
'<='
)
// Efficient "minimum number of rows" statement:
// We want to discover if this query has at least MIN_EDITS_RANGE rows
// but note that the COUNT(*) function means a full table scan. It's expensive. Don't do that.
// Why? Because COUNTing *all* his/her/its contributions just to discover if they are greater than "$min_edits" is inefficient.
// Instead, we can use the LIMIT clause to skip the "MIN_EDITS_RANGE - 1" rows and return just the next following row if it exists;
// so we will return zero or one row depending if this user has at least MIN_EDITS_RANGE contributions or not,
// and this is freacking efficient because the LIMIT clause uses indexes.
// -- boz
->
limit
(
MIN_EDITS_RANGE
-
1
,
1
);
// the final query to the user table with all the conditions
$query
=
(
new
Query
()
)
->
select
(
$EXPORT_FIELDS
)
->
from
(
'user'
)
->
joinOn
(
'INNER'
,
'actor'
,
'actor_user'
,
'user_id'
)
// // this does not work because the user_registration field is NULL in Wikimedia Foundation replicas
// ->whereInt( 'user_registration', $max_user_registration_timestamp, '<=' )
// must have at least one old contribution
->
whereExists
(
$subquery_old_contributions
)
// must have some number of contributions in the specified period
->
whereExists
(
$subquery_contributions_in_period
);
// select the desired database
DB
::
instance
()->
selectDB
(
$wiki_database
);
// show the current query
message
(
"Executing on $wiki_database: {$query->getQuery()}"
);
// export in a CSV file
$out
=
fopen
(
$filename_export
,
'w'
);
// put an heading in the file with the exported field names
fputcsv
(
$out
,
$EXPORT_FIELDS
);
// export the users
$users
=
$query
->
queryGenerator
();
foreach
(
$users
as
$user
)
{
// export the user fields in the right order
$data
=
[];
foreach
(
$EXPORT_FIELDS
as
$field
)
{
$data
[]
=
$user
->{
$field
};
}
// write a line
fputcsv
(
$fp
,
$data
);
}
// close the CSV file
fclose
(
$out
);
}
$languages
=
[
'it'
,
'fr'
,
'en'
,
'de'
,
'als'
,
];
$wikis
=
[
'wiki'
,
'wikisource'
,
'wiktionary'
,
'wikivoyage'
,
'wikibooks'
,
'wikiquote'
,
'wikiversity'
,
'wikinews'
,
];
foreach
(
$languages
as
$language
)
{
foreach
(
$wikis
as
$wiki
)
{
// wiki codename like 'itwiki'
$wiki_codename
=
$language
.
$wiki
;
// wiki database name like 'itwiki_p'
$wiki_database
=
$wiki_codename
.
'_p'
;
// check if the database exists
$wiki_exists
=
query_row
(
sprintf
(
"SELECT 1 FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = '%s'"
,
esc_sql
(
$wiki_database
)
)
);
// does the database exists?
if
(
$wiki_exists
)
{
message
(
"Processing $wiki_database"
);
// execute the report
generate_wmch_report_for_wiki
(
$wiki_codename
,
$wiki_database
);
}
else
{
// nope, it does not exist
// do nothing
print
(
"Skip unexisting $wiki_database"
);
}
}
}
/**
* Print a message
*
* @param string $message
*/
function
message
(
$message
)
{
printf
(
"[%s] %s
\n
"
,
date
(
'Y-m-d H:i:s'
),
$message
);
}
File Metadata
Details
Attached
Mime Type
text/plain; charset=utf-8
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
8575413
Default Alt Text
WMCH contributors report 2020 (5 KB)
Attached To
Mode
P12790 WMCH contributors report 2020
Attached
Detach File
Event Timeline
Log In to Comment