<?php /*
// DON'T ADOPT THIS SCRIPT :D :D IT'S VERY RAW! (Even if it works) __ ___ ___ _ ___ __ _
// This script is to be saved in the maintenance directory of MediaWiki. / |/ /__ ____/ (_)___ | | / (_) /__(_)
// This script was used to delete 700K+ pages of spam and 1+ millions of spammed revisions. / /|_/ / _ \/ __ / / __ `/ | /| / / / //_/ /
// Author: Valerio Bozzolan, / / / / __/ /_/ / / /_/ /| |/ |/ / / ,< / /
/_/ /_/\___/\__,_/_/\__,_/ |__/|__/_/_/|_/_/
_____ ____ ___ __ _____ _____________
/ ___// __ \/ | / |/ / |/ / ____/ __ \
\__ \/ /_/ / /| | / /|_/ / /|_/ / __/ / /_/ /
___/ / ____/ ___ |/ / / / / / / /___/ _, _/
/____/_/ /_/ |_/_/ /_/_/ /_/_____/_/ |_|
____ __ ___ _ __
/ __ \/ /_ / (_) __(_)___ _/ /____ _____
/ / / / __ \/ / / | / / / __ `/ __/ _ \/ ___/
/ /_/ / /_/ / / /| |/ / / /_/ / /_/ __/ /
\____/_.___/_/_/ |___/_/\__,_/\__/\___/_/ - by bozz \o/
DON'T ADOPT THIS SCRIPT :D :D IT'S VERY RAW! (even if "it worked on my computer")
This script is supposed to be saved in the maintenance directory of MediaWiki.
This script was used to delete 700K+ spam pages and over 1 million spam revisions.
This script does not support any options. It does not even support --wiki since, in my case,
I just needed to execute something like:
HTTP_HOST=es.dicoado.org ./boz-obliviate-spammers.php
So, have fun, at your own risk.
Source code:
https://phabricator.wikimedia.org/P56671
Context:
https://phabricator.wikimedia.org/T308969
Author: Valerio Bozzolan, WMCH
License: GNU GPL v3+
*/
use MediaWiki\MediaWikiServices;
// How many rows are requested from the database for each batch.
$BATCH = 5000;

// File where the ID of every obliviated user is appended (one per line).
define( 'OBLIVIATED_USERS_LOG', 'obliviated-users.txt' );

// FALSE: you don't want interaction. You can still edit the spam keywords and the bot will reload it.
// TRUE: you want to review every page, manually.
define( 'INTERACTION', false );

// Spam time cut line. After this timestamp, there is spam.
// TODO: make an argument.
$AFTER_TIMESTAMP = '20160501000000';

require_once __DIR__ . "/commandLine.inc";

// User passed to the page deletion as the one performing it.
// TODO: create argument for this
$SYSTEM_USERNAME = 'WikiSysopMini';
$SYSTEM_USER = User::newFromName( $SYSTEM_USERNAME );
if ( !$SYSTEM_USER ) {
	throw new Exception( "Invalid username" );
}

// User IDs already obliviated during this run (used as a set: ID => true).
// Rows of an already-fetched batch may still belong to a user that was just
// nuked; there is no point in logging and querying them again.
$obliviated_user_ids = [];

// NOTE(review): batches are paged with OFFSET while pages are deleted inside
// this same loop, so later batches may skip rows that were never inspected.
// Presumably re-running the script until nothing is found covers this - confirm.
$paging = 0;
do {
	echo "Querying $BATCH results (paging $paging)...\n";
	$found_something = false;

	// Find spammed revisions. Confirm. Obliviate authors.
	foreach ( search_some_spam_pages_with_rev_user( $BATCH, $paging ) as $spammed_page ) {
		$found_something = true;

		$rev_user_id = (int) $spammed_page->rev_user;

		// obliviate_user() rejects a zero user ID with an exception: skip such
		// rows instead of aborting the whole run (presumably these are
		// anonymous edits - confirm).
		if ( !$rev_user_id ) {
			echo "Skip revision without a valid user ID\n";
			continue;
		}

		if ( isset( $obliviated_user_ids[ $rev_user_id ] ) ) {
			echo "Skip already obliviated user $rev_user_id\n";
			continue;
		}

		if ( is_user_trusted( $rev_user_id ) ) {
			echo "Skip trusted user {$spammed_page->rev_user}\n";
			continue;
		}

		// Short excerpt shown to the operator.
		$text_short = trim( $spammed_page->old_text );
		$text_short = mb_strimwidth( $text_short, 0, 500, '...' );
		if ( $text_short === '' ) {
			echo "Skipped empty page\n";
			continue;
		}

		// The keywords are reloaded for every page on purpose, so the file can
		// be edited while the script is running.
		$raw_keywords = file_get_contents( '/tmp/spam-keywords.txt' );
		if ( $raw_keywords === false ) {
			echo "[WARNING] Cannot read /tmp/spam-keywords.txt: no keyword will match\n";
			$raw_keywords = '';
		}
		$surely_spam_keywords = explode( "\n", trim( $raw_keywords ) );

		// Confirm one by one.
		echo "\n----------------\n";
		echo "$text_short\n\n";
		echo "----------------\n";
		echo 'https://' . $_SERVER['HTTP_HOST'] . '/wiki/Special:PermaLink/' . $spammed_page->rev_id . "\n";
		echo "\n";

		$obliviate = false;
		$found_term = contains_terms( $spammed_page->old_text, $surely_spam_keywords );
		if ( $found_term ) {
			echo "SUSPICIOUS TERM FOUND FROM UNTRUSTED USER: <$found_term> AND THE DATE MATCHES THE SPECIFIED SPAM TIME WINDOW >= $AFTER_TIMESTAMP\n";
			$obliviate = true;
		} elseif ( INTERACTION ) {
			$obliviate = yesno( "Is this SPAM? Delete its user and ALL their edits?" );
		}

		if ( $obliviate ) {
			echo "OBLIVIATING >:D\n";
			obliviate_user( $rev_user_id );
			$obliviated_user_ids[ $rev_user_id ] = true;
		} else {
			echo "Skipping...\n Consider putting user as autopatrolled if you trust that.\n";
		}
	}

	$paging++;
} while ( $found_something );

echo "DONE! Let's play again soon :D :D :D\n";
echo "Now please run:\n";
echo " ./deleteOrphanedRevisions.php\n";
echo " ./deleteArchivedRevisions.php\n";
/**
 * Wipe the contributions of one MediaWiki user, identified by numeric User ID.
 *
 * Steps, in order:
 *  1. the ID is appended to the OBLIVIATED_USERS_LOG file;
 *  2. every page returned by search_pages_where_user_is_author() is dropped;
 *  3. every revision returned by search_revisions_where_user_is_author()
 *     is handed to drop_revisions(), 50 revisions at a time.
 *
 * @param int $user_id
 */
function obliviate_user( $user_id ) {
	// Leave a trace of who has been obliviated.
	file_put_contents( OBLIVIATED_USERS_LOG, "{$user_id}\n", FILE_APPEND );

	// First pass: the pages authored by the spammer go away one by one.
	$authored_pages = search_pages_where_user_is_author( $user_id );
	foreach ( $authored_pages as $page_row ) {
		drop_page( $page_row->page_id );
	}

	// Second pass: collect what is left, that is the spammer's other edits.
	$pending_revision_ids = [];
	$authored_revisions = search_revisions_where_user_is_author( $user_id );
	foreach ( $authored_revisions as $revision_row ) {
		$pending_revision_ids[] = $revision_row->rev_id;
	}

	// Feed the deletion in slices of 50, consuming the list from its head.
	while ( count( $pending_revision_ids ) > 0 ) {
		drop_revisions( array_splice( $pending_revision_ids, 0, 50 ) );
	}
}
/**
 * Find the pages whose earliest revision was made by the given user.
 *
 * A revision is selected when it belongs to the user and its timestamp equals
 * the minimum revision timestamp of its page.
 *
 * @param int $user_id Numeric User ID; anything that casts to zero is refused.
 * @return mixed Whatever query_objects() returns for the query.
 * @throws Exception If the User ID casts to zero.
 */
function search_pages_where_user_is_author( $user_id ) {
	// The integer cast is what makes the value safe to embed in the SQL.
	$author_id = (int) $user_id;
	if ( $author_id === 0 ) {
		throw new Exception( "Not a valid User ID: {$user_id}" );
	}

	// The query text is kept flush-left so the string sent to the database is unchanged.
	return query_objects( "
SELECT
page_id,
rev_page,
page_namespace,
page_title
FROM
revision AS mainrev
JOIN page ON page_id = rev_page
WHERE rev_user = {$author_id}
AND rev_timestamp =
( SELECT MIN(rev_timestamp)
FROM revision AS childrev
WHERE childrev.rev_page = mainrev.rev_page
)
" );
}
/**
 * List the IDs of all the revisions made by the given user.
 *
 * @param int $user_id Numeric User ID; anything that casts to zero is refused.
 * @return mixed Whatever query_objects() returns for the query.
 * @throws Exception If the User ID casts to zero.
 */
function search_revisions_where_user_is_author( $user_id ) {
	// The integer cast is what makes the value safe to embed in the SQL.
	$author_id = (int) $user_id;
	if ( $author_id === 0 ) {
		throw new Exception( "Not a valid User ID: {$user_id}" );
	}

	// The query text is kept flush-left so the string sent to the database is unchanged.
	return query_objects( "
SELECT
rev_id
FROM
revision
WHERE rev_user = {$author_id}
" );
}
/**
 * Fetch one batch of candidate pages, with the author and text of their latest revision.
 *
 * Only pages in namespaces 0 and 2 whose latest revision is newer than the
 * $AFTER_TIMESTAMP global are selected. The timestamp is quoted with the
 * database's addQuotes(); the limit and the page number are forced to integers.
 *
 * Rows are ordered by page_id: without an ORDER BY, the database is free to
 * return rows in any order, so LIMIT/OFFSET batches could overlap or skip rows.
 *
 * No keyword filtering happens here: every selected revision is checked at
 * runtime by the caller. An SQL-based filter (old_text LIKE ...) was considered
 * but is probably slower when there is so much spam.
 *
 * @param int $limit How many results for each batch.
 * @param int $page Batch number, from zero to infinite.
 * @return mixed Whatever query_objects() returns; each row exposes rev_id, page_id, rev_user and old_text.
 */
function search_some_spam_pages_with_rev_user( $limit, $page ) {
	global $AFTER_TIMESTAMP;

	// These values are embedded in the SQL as they are: force them to integers.
	$limit_safe = (int) $limit;
	$offset_safe = $limit_safe * (int) $page;

	$timestamp_safe = wfGetDB( DB_MASTER )->addQuotes( $AFTER_TIMESTAMP );

	// Select last version
	$sql_spammed_pages = "
		SELECT
			rev_id,
			page_id,
			rev_user,
			old_text
		FROM page
		JOIN revision
			ON (page_latest = rev_id)
		JOIN text
			ON (rev_text_id = old_id)
		WHERE rev_timestamp > $timestamp_safe
			AND
			page_namespace IN( 0, 2 )
		ORDER BY page_id
		LIMIT $limit_safe
		OFFSET $offset_safe
	";

	return query_objects( $sql_spammed_pages );
}
/**
 * Delete some revisions by running the deleteRevision.php script in a sub-process.
 *
 * Please do not specify too many revisions at once, since they all end up in a
 * single command line.
 *
 * Every ID is validated as a positive integer before the command is built,
 * because the IDs are concatenated into a shell command.
 *
 * @param array $revision_ids Revision IDs (integers or strings of digits). An empty list is a no-op.
 * @throws Exception If an ID is not a positive integer, or if the sub-process
 *                   cannot be run or exits with a non-zero code.
 */
function drop_revisions( $revision_ids ) {
	if ( !$revision_ids ) {
		return;
	}

	// Validate everything first, so nothing is executed when one ID is bogus.
	$revision_ids_safe = [];
	foreach ( $revision_ids as $revision_id ) {
		$looks_like_id = ( is_int( $revision_id ) || is_string( $revision_id ) )
			&& preg_match( '/\A[1-9][0-9]*\z/', (string) $revision_id ) === 1;
		if ( !$looks_like_id ) {
			throw new Exception( "Not a valid revision ID: " . var_export( $revision_id, true ) );
		}
		$revision_ids_safe[] = (int) $revision_id;
	}

	$revision_ids_str = implode( " ", $revision_ids_safe );
	echo "Deleting revisions created by known spammer: $revision_ids_str\n";

	// passthru() reports a launch failure with false, while a failure of the
	// script itself is only visible in its exit code.
	$exit_code = 0;
	$result = passthru( sprintf( 'php ./deleteRevision.php %s', $revision_ids_str ), $exit_code );
	if ( $result === false || $exit_code !== 0 ) {
		throw new Exception( "Failed deleting revisions (exit code $exit_code): $revision_ids_str" );
	}
}
/**
 * Delete one page by its numeric page ID, on behalf of the $SYSTEM_USER global.
 *
 * When the page is in the file namespace, the related local file is deleted
 * too; a failure there only prints a warning.
 *
 * @param int $page_id
 * @throws Exception If the ID casts to zero, if no title or wiki page can be
 *                   obtained for it, or if the deletion fails.
 */
function drop_page( $page_id ) {
	global $SYSTEM_USER;

	// Deletion reason, shared by the file deletion and the page deletion.
	$reason = 'Spam';

	// Keep the original value untouched, to show it in the error message.
	$page_id_safe = (int) $page_id;
	if ( !$page_id_safe ) {
		throw new Exception( "Not a valid page ID: $page_id" );
	}

	$title = Title::newFromID( $page_id_safe );
	if ( !$title ) {
		throw new Exception( "Wrong page ID $page_id_safe" );
	}

	echo "Dropping page $title\n";

	// Nothing to do
	if ( !$title->exists() ) {
		return;
	}

	if ( $title->getNamespace() == NS_FILE ) {
		$img = wfFindFile( $title, [ 'ignoreRedirect' => true ] );
		if ( $img && $img->isLocal() && !$img->delete( $reason ) ) {
			echo "[WARNING] Failed to delete local file\n";
		}
	}

	$wikipage = WikiPage::factory( $title );
	if ( !$wikipage ) {
		throw new Exception( "Wrong Wikipage $title" );
	}

	// $error is handed to doDeleteArticle() and reported if the deletion fails.
	$error = '';
	$success = $wikipage->doDeleteArticle( $reason, false, 0, true, $error, $SYSTEM_USER );
	if ( !$success ) {
		throw new Exception( "Cannot drop page $title: $error" );
	}

	echo "Dropping page... done!\n";
}
/**
 * Check whether the user with this ID is allowed the 'autopatrol' right.
 *
 * @param int $user_id
 * @return mixed The result of User::isAllowed() - presumably a boolean, to be confirmed.
 */
function is_user_trusted( $user_id ) {
	return User::newFromId( $user_id )->isAllowed( 'autopatrol' );
}
/**
 * Run a raw SQL query on the DB_MASTER connection.
 *
 * Newlines are turned into spaces and the statement is trimmed before being sent.
 *
 * @param string $query
 * @return mixed Whatever the database's query() returns. The callers iterate
 *               it and read columns as object properties - presumably a
 *               result wrapper rather than a plain array, to be confirmed.
 */
function query_objects( $query ) {
	$single_line_query = trim( str_replace( "\n", " ", $query ) );
	$database = wfGetDB( DB_MASTER );
	return $database->query( $single_line_query );
}
/**
 * Look for the first term contained in a text, ignoring letter case.
 *
 * The comparison is Unicode-aware when the mbstring extension is available
 * (the text is assumed to be UTF-8 - confirm), so that a keyword with
 * non-ASCII letters also matches its upper-case spelling.
 *
 * Empty terms (and the term '0', which PHP considers false) are ignored.
 *
 * @param string $text Text to be inspected.
 * @param array $terms Terms to look for.
 * @return string|false The first matching term, lower-cased, or false if none matches.
 */
function contains_terms( $text, $terms ) {
	// Byte-wise strtolower() does not fold non-ASCII letters: prefer mbstring.
	$to_lower = static function ( $string ) {
		return function_exists( 'mb_strtolower' )
			? mb_strtolower( (string) $string, 'UTF-8' )
			: strtolower( (string) $string );
	};

	$text = $to_lower( $text );
	foreach ( $terms as $term ) {
		$term = $to_lower( $term );
		if ( $term && false !== strpos( $text, $term ) ) {
			return $term;
		}
	}

	return false;
}
/**
 * Ask for a 'Y' or a 'n'. Yes is default.
 *
 * When no answer can be read at all (readline() returns false, for example
 * at the end of the input) the answer is "no": the absence of an operator must
 * never confirm a destructive action.
 *
 * @param string $question Creative question text.
 * @return bool True if the user answered 'Y', 'y' or just pressed enter.
 */
function yesno( $question ) {
	echo "$question [Y/n]\n";

	$answer = readline();
	if ( $answer === false ) {
		// Nothing could be read: this is not the same as pressing enter.
		return false;
	}

	$answer = trim( $answer );
	return $answer === '' || $answer === 'Y' || $answer === 'y';
}