Page MenuHomePhabricator
Paste P56671

MediaWiki SPAMMER Obliviater
ActivePublic

Authored by ValerioBoz-WMCH on Feb 11 2024, 8:12 PM.
Referenced Files
F41863317: MediaWiki SPAMMER Obliviater
Feb 12 2024, 8:00 AM
F41863258: MediaWiki SPAMMER Killer
Feb 12 2024, 7:50 AM
F41862987: MediaWiki SPAMMER Killer
Feb 12 2024, 7:19 AM
F41856772: MediaWiki SPAMMER Killer
Feb 11 2024, 8:12 PM
Subscribers
<?php /*
__ ___ ___ _ ___ __ _
/ |/ /__ ____/ (_)___ | | / (_) /__(_)
/ /|_/ / _ \/ __ / / __ `/ | /| / / / //_/ /
/ / / / __/ /_/ / / /_/ /| |/ |/ / / ,< / /
/_/ /_/\___/\__,_/_/\__,_/ |__/|__/_/_/|_/_/
_____ ____ ___ __ _____ _____________
/ ___// __ \/ | / |/ / |/ / ____/ __ \
\__ \/ /_/ / /| | / /|_/ / /|_/ / __/ / /_/ /
___/ / ____/ ___ |/ / / / / / / /___/ _, _/
/____/_/ /_/ |_/_/ /_/_/ /_/_____/_/ |_|
____ __ ___ _ __
/ __ \/ /_ / (_) __(_)___ _/ /____ _____
/ / / / __ \/ / / | / / / __ `/ __/ _ \/ ___/
/ /_/ / /_/ / / /| |/ / / /_/ / /_/ __/ /
\____/_.___/_/_/ |___/_/\__,_/\__/\___/_/ - by bozz \o/
FEATURES:
- keyword-based (to be added by you)
- find spammers, not just spam (!)
- then, OBLIVIATE SPAMMERS >:]
- dropping all pages created by that spammer
- dropping all revisions created by that spammer
THINGS THAT ARE MISSING:
- user ban is done separately (feel free to contribute to do it here)
RATIONALE
DON'T ADOPT THIS SPAMMER OBLIVIATER SCRIPT! THIS SCRIPT IS VERY RAW!
Even if "it worked on my computer" this script can nuke your wiki!
Batteries not included! Do a full backup of your wiki! Do it twice! Take a full snapshot!
Lock the garage! Keep children away! Give vitamins to children! and stay hydrated!
This script is supposed to be saved in the maintenance directory of MediaWiki.
This script was used to delete 700K+ spam pages and more than 1 million spammed revisions.
This script does not support any argument. It does not even support --wiki since in my case
I just need to execute something like:
HTTP_HOST=es.dicoado.org ./boz-obliviate-spammers.php
So, have fun, at your own risk.
Source code:
https://phabricator.wikimedia.org/P56671
Context:
https://phabricator.wikimedia.org/T308969
Author: Valerio Bozzolan, WMCH
License: GNU GPL v3+
*/
use MediaWiki\MediaWikiServices;
// Page size: how many candidate pages are requested per query while paginating.
$BATCH = 5000;

//
// SPAM KEYWORDS (input file)
//
// One keyword per line, lines separated by a single newline.
// Spaces are significant: a line may be "xxx", or " xxx " to match a separate word.
// Lines that are empty or made only of whitespace are ignored.
// The keyword list is not shipped with this script: write your own.
define( 'XXX_SPAM_KEYWORDS', '/tmp/spam-keywords.txt' );

//
// SPAMMERS FOUND (output file)
//
// One spammer user ID is appended per line.
define( 'OBLIVIATED_USERS_LOG', 'obliviated-users.txt' );

// false: never prompt. The keyword file is re-read for every inspected page,
//        so it can still be edited while the script runs.
// true:  additionally ask for a manual decision on pages without a keyword match.
define( 'INTERACTION', false );

// Derived flag: true when running unattended (the opposite of INTERACTION).
define( 'OH_NO_A_CHAD', !INTERACTION );

// Spam cut-off: only pages whose latest revision is newer than this timestamp are inspected.
// TODO: make this a command line argument.
$AFTER_TIMESTAMP = '20160501000000';

require_once __DIR__ . "/commandLine.inc";

// User passed to the page deletion call in drop_page().
// TODO: make this a command line argument.
$SYSTEM_USERNAME = 'WikiSysopMini';
$SYSTEM_USER = User::newFromName( $SYSTEM_USERNAME );
if ( !$SYSTEM_USER ) {
	throw new Exception( "Invalid username" );
}

if ( !INTERACTION ) {
	echo "------------------------------------\n",
		"[WARNING] INTERACTION is disabled!!!\n",
		"------------------------------------\n";
}

// Print the effective settings.
echo "Current XXX_SPAM_KEYWORDS: ", XXX_SPAM_KEYWORDS, "\n";
echo "Current AFTER_TIMESTAMP: ", $AFTER_TIMESTAMP, "\n";
echo "Current SYSTEM_USERNAME: ", $SYSTEM_USERNAME, "\n";

// Unattended mode: leave a few seconds to abort before anything is touched.
if ( OH_NO_A_CHAD ) {
	echo "TIME TO KILL DAMN SPAMMERZ. How? In the only known way: the HARD way.\n",
		"Starting...\n",
		"[Press CTRL+C to quit]\n";
	sleep( 5 );
}
// Set of user IDs already obliviated during this run (user ID => true).
$to_be_obliviated_user_ids = [];
$paging = 0;

// Walk the candidate pages batch by batch until a query returns no row at all.
// NOTE(review): pages are deleted while paginating with a growing OFFSET, so the
// rows shift and some candidates may be skipped. Re-run the script until it
// obliviates nothing more.
do {
	echo "Querying $BATCH results (paging $paging)...\n";
	$found_something = false;

	// Find spammed revisions. Confirm. Obliviate authors.
	foreach ( search_some_spam_pages_with_rev_user( $BATCH, $paging ) as $spammed_page ) {
		$found_something = true;

		// Rows of the current batch may still belong to a user that was obliviated
		// a moment ago: do not process (and log) the same user twice.
		if ( isset( $to_be_obliviated_user_ids[ $spammed_page->rev_user ] ) ) {
			continue;
		}

		if ( is_user_trusted( $spammed_page->rev_user ) ) {
			echo "Skip trusted user {$spammed_page->rev_user}\n";
			continue;
		}

		// Short excerpt shown to the operator.
		$text_short = trim( $spammed_page->old_text );
		$text_short = mb_strimwidth( $text_short, 0, 500, '...' );
		if ( $text_short === '' ) {
			echo "Skipped empty page\n";
			continue;
		}

		// Confirm one by one.
		echo "\n----------------\n";
		echo "$text_short\n\n";
		echo "----------------\n";
		echo 'https://' . $_SERVER['HTTP_HOST'] . '/wiki/Special:PermaLink/' . $spammed_page->rev_id . "\n";
		echo "\n";

		$obliviate = false;

		// Strict comparison: a matched keyword such as "0" is falsy but still a match.
		$found_term = contains_spam_keyword( $spammed_page->old_text );
		if ( $found_term !== false ) {
			echo "SUSPICIOUS TERM FOUND FROM UNTRUSTED USER: <$found_term> AND THE DATE MATCHES THE SPECIFIED SPAM TIME WINDOW >= $AFTER_TIMESTAMP\n";
			$obliviate = true;
		} elseif ( INTERACTION ) {
			$obliviate = yesno( "Is this SPAM? Delete its user and ALL their edits?" );
		}

		if ( $obliviate ) {
			echo "OBLIVIATING >:D\n";
			obliviate_user( $spammed_page->rev_user );
			$to_be_obliviated_user_ids[ $spammed_page->rev_user ] = true;
		} else {
			echo "Skipping...\n Consider putting user as autopatrolled if you trust that.\n";
		}
	}

	$paging++;
} while ( $found_something );

echo "DONE! Let's play again soon :D :D :D\n";
echo "Now please run:\n";
echo " ./deleteOrphanedRevisions.php\n";
echo " ./deleteArchivedRevisions.php\n";
/**
 * Wipe the contributions of one MediaWiki user, identified by its numeric User ID.
 *
 * Steps, in order:
 *  1. the ID is appended to the OBLIVIATED_USERS_LOG file;
 *  2. every page returned by search_pages_where_user_is_author() is dropped;
 *  3. every revision returned by search_revisions_where_user_is_author() is
 *     handed to drop_revisions(), 50 revisions at a time.
 *
 * The user account itself is not modified here.
 *
 * @param int $user_id
 */
function obliviate_user( $user_id ) {
	// Leave a trace of who has been obliviated.
	file_put_contents( OBLIVIATED_USERS_LOG, $user_id . "\n", FILE_APPEND );

	// Step 1: pages started by the spammer are deleted one by one.
	foreach ( search_pages_where_user_is_author( $user_id ) as $page_row ) {
		drop_page( $page_row->page_id );
	}

	// Step 2: collect the IDs of the revisions still attributed to the spammer.
	$revision_ids = [];
	foreach ( search_revisions_where_user_is_author( $user_id ) as $revision_row ) {
		$revision_ids[] = $revision_row->rev_id;
	}

	// Step 3: delete them in small chunks, to keep each command line short.
	foreach ( array_chunk( $revision_ids, 50 ) as $revision_chunk ) {
		drop_revisions( $revision_chunk );
	}
}
/**
 * Find the pages created by a user.
 *
 * A page counts as created by the user when the user authored a revision whose
 * timestamp is the oldest one among all the revisions of that page.
 *
 * DISTINCT matters: if the user has two revisions sharing that oldest timestamp,
 * the page would otherwise be returned twice, and the second drop_page() call on
 * an already deleted page throws.
 *
 * @param int $user_id Numeric User ID (must be non-zero).
 * @return mixed The result of query_objects(): iterable rows exposing
 *               page_id, rev_page, page_namespace and page_title.
 * @throws Exception If the User ID does not cast to a non-zero integer.
 */
function search_pages_where_user_is_author( $user_id ) {
	// The integer cast is what makes the value safe to interpolate in SQL.
	$user_id_safe = (int) $user_id;
	if ( !$user_id_safe ) {
		throw new Exception( "Not a valid User ID: $user_id" );
	}

	$sql_first_revision_of_user = "
SELECT DISTINCT
page_id,
rev_page,
page_namespace,
page_title
FROM
revision AS mainrev
JOIN page ON page_id = rev_page
WHERE rev_user = $user_id_safe
AND rev_timestamp =
( SELECT MIN(rev_timestamp)
FROM revision AS childrev
WHERE childrev.rev_page = mainrev.rev_page
)
";

	return query_objects( $sql_first_revision_of_user );
}
/**
 * List the revisions attributed to a user.
 *
 * @param int $user_id Numeric User ID (must be non-zero).
 * @return mixed The result of query_objects(): iterable rows exposing rev_id.
 * @throws Exception If the User ID does not cast to a non-zero integer.
 */
function search_revisions_where_user_is_author( $user_id ) {
	// Only an integer is ever interpolated in the query below.
	$author_id = (int) $user_id;
	if ( $author_id === 0 ) {
		throw new Exception( "Not a valid User ID: $user_id");
	}

	return query_objects( "
SELECT
rev_id
FROM
revision
WHERE rev_user = $author_id
" );
}
/**
 * Fetch one batch of candidate pages together with the author and the text
 * of their latest revision.
 *
 * Only pages in namespaces 0 and 2 whose latest revision is newer than the
 * global $AFTER_TIMESTAMP are returned. No keyword filtering happens in SQL:
 * the text is inspected by the caller at runtime (filtering with LIKE would
 * be possible, but was judged probably slower on a wiki full of spam).
 *
 * @param int $limit How many results for each batch (at least 1).
 * @param int $page  Batch number, from zero upwards.
 * @return mixed The result of query_objects(): iterable rows exposing
 *               rev_id, page_id, rev_user and old_text.
 * @throws Exception If $limit is lower than 1 or $page is negative.
 */
function search_some_spam_pages_with_rev_user( $limit, $page ) {
	global $AFTER_TIMESTAMP;

	// Both values end up in the SQL text: allow nothing but sane integers.
	$limit_safe = (int) $limit;
	if ( $limit_safe < 1 ) {
		throw new Exception( "Not a valid limit: $limit" );
	}
	$page_safe = (int) $page;
	if ( $page_safe < 0 ) {
		throw new Exception( "Not a valid page number: $page" );
	}
	$offset_safe = $limit_safe * $page_safe;

	$timestamp_safe = wfGetDB( DB_MASTER )->addQuotes( $AFTER_TIMESTAMP );

	// Select the last version of each page.
	// ORDER BY makes the LIMIT/OFFSET pagination deterministic: without it the
	// database is free to return the rows in a different order at every query.
	$sql_spammed_pages = "
SELECT
rev_id,
page_id,
rev_user,
old_text
FROM page
JOIN revision
ON (page_latest = rev_id)
JOIN text
ON (rev_text_id = old_id)
WHERE rev_timestamp > $timestamp_safe
AND
page_namespace IN( 0, 2 )
ORDER BY page_id
LIMIT $limit_safe
OFFSET $offset_safe
";

	return query_objects( $sql_spammed_pages );
}
/**
 * Delete some revisions by running the deleteRevision.php maintenance script
 * (looked up in the current working directory) as a child process.
 *
 * Do not pass too many IDs at once: they all travel on one command line.
 * Every ID is validated here before anything is executed, since the IDs are
 * concatenated into a shell command.
 *
 * @param array $revision_ids Revision IDs: integers or strings of decimal digits.
 * @throws Exception If an ID is not a plain non-negative integer, if the command
 *                   cannot be executed, or if it exits with a non-zero status.
 */
function drop_revisions( $revision_ids ) {
	if ( !$revision_ids ) {
		return;
	}

	// Validate everything up front: nothing runs if a single ID is suspicious.
	$revision_ids_safe = [];
	foreach ( $revision_ids as $revision_id ) {
		if ( !is_scalar( $revision_id ) || !preg_match( '/\A[0-9]+\z/', (string) $revision_id ) ) {
			throw new Exception( "Not a valid revision ID: " . var_export( $revision_id, true ) );
		}
		$revision_ids_safe[] = (int) $revision_id;
	}

	$revision_ids_str = implode( " ", $revision_ids_safe );
	echo "Deleting revisions created by known spammer: $revision_ids_str\n";

	// passthru() gives false when the command cannot be executed; the exit status
	// of the child script is only available through the second argument.
	$exit_code = 0;
	$result = passthru( sprintf( 'php ./deleteRevision.php %s', $revision_ids_str ), $exit_code );
	if ( $result === false || $exit_code !== 0 ) {
		throw new Exception( "Failed deleting revisions: $revision_ids_str" );
	}
}
/**
 * Delete one page, given its numeric page ID.
 *
 * For a page in the File namespace, the local file is deleted too (a failure
 * there only prints a warning). The page deletion itself is requested with
 * the reason "Spam" and the global $SYSTEM_USER.
 *
 * @param int $page_id Numeric page ID (must be non-zero).
 * @throws Exception If the ID is not valid, if no title or WikiPage can be
 *                   obtained for it, or if the deletion reports a failure.
 */
function drop_page( $page_id ) {
	global $SYSTEM_USER;

	// Deletion reason, shared by the file deletion and the page deletion.
	$reason = "Spam";

	// Keep the raw input untouched, so the error message can show what was received.
	$page_id_safe = (int) $page_id;
	if ( !$page_id_safe ) {
		throw new Exception( "Not a valid page ID: $page_id" );
	}

	$title = Title::newFromID( $page_id_safe );
	if ( !$title ) {
		throw new Exception( "Wrong page ID $page_id_safe" );
	}

	echo "Dropping page $title\n";

	// Nothing to do
	if ( !$title->exists() ) {
		return;
	}

	// A file page also has an uploaded file to get rid of.
	if ( $title->getNamespace() == NS_FILE ) {
		$img = wfFindFile( $title, [ 'ignoreRedirect' => true ] );
		if ( $img && $img->isLocal() && !$img->delete( $reason ) ) {
			echo "[WARNING] Failed to delete local file\n";
		}
	}

	$wikipage = WikiPage::factory( $title );
	if ( !$wikipage ) {
		throw new Exception( "Wrong Wikipage $title" );
	}

	// $error is filled by reference and reported when the deletion fails.
	$error = '';
	$success = $wikipage->doDeleteArticle( $reason, false, 0, true, $error, $SYSTEM_USER );
	if ( !$success ) {
		throw new Exception( "Cannot drop page $title: $error" );
	}

	echo "Dropping page... done!\n";
}
/**
 * Tell whether a user is trusted, meaning that it is allowed the 'autopatrol' right.
 *
 * @param int $user_id Numeric User ID.
 * @return mixed Whatever User::isAllowed() answers for 'autopatrol' (presumably a bool).
 */
function is_user_trusted( $user_id ) {
	return User::newFromId( $user_id )->isAllowed( 'autopatrol' );
}
/**
 * Run a raw SQL query on the DB_MASTER connection.
 *
 * Newlines in the query are turned into spaces and the query is trimmed
 * before being executed.
 *
 * @param string $query
 * @return mixed Whatever the connection's query() method returns. Callers in
 *               this file iterate it with foreach and read columns as properties.
 */
function query_objects( $query ) {
	$single_line_sql = trim( str_replace( "\n", " ", $query ) );
	$dbw = wfGetDB( DB_MASTER );
	return $dbw->query( $single_line_sql );
}
/**
 * Check whether an input string contains one of the known spam keywords.
 *
 * The keywords are read from the XXX_SPAM_KEYWORDS file at every call, one per
 * line, so the file can be edited while the script is running.
 *  - Spaces around a keyword are respected, on every line of the file.
 *  - A line that is empty or made only of whitespace is ignored.
 *  - A trailing carriage return (file saved with CRLF line endings) is dropped.
 *  - The comparison is case-insensitive, also for non-ASCII letters.
 *  - If the file cannot be read, no keyword is known and nothing matches.
 *
 * @param string $text
 * @return false|string False if not found, or, the found keyword (lowercased).
 */
function contains_spam_keyword( $text ) {
	$terms_raw = file_get_contents( XXX_SPAM_KEYWORDS );
	if ( $terms_raw === false ) {
		return false;
	}

	// Multibyte-aware lowercasing, to stay consistent with mb_strpos() below.
	$text_lower = mb_strtolower( (string) $text );

	// The file is deliberately not trimmed as a whole: that would eat the
	// significant spaces of the first and of the last keyword.
	foreach ( explode( "\n", $terms_raw ) as $term ) {
		$term = rtrim( $term, "\r" );

		// A line consisting of just spaces would match nearly every page.
		if ( trim( $term ) === '' ) {
			continue;
		}

		$term_lower = mb_strtolower( $term );
		if ( mb_strpos( $text_lower, $term_lower ) !== false ) {
			return $term_lower;
		}
	}

	return false;
}
/**
 * Ask for a 'Y' or a 'n'. Yes is the default when just pressing enter.
 *
 * End of input (for example CTRL+D, or a closed standard input) counts as a
 * "no": the answer confirms a destructive action, so it must be explicit.
 *
 * @param string $question Creative question text.
 * @return bool True if the user answered 'Y', 'y' or just pressed enter.
 */
function yesno( $question ) {
	echo "$question [Y/n]\n";

	$line = readline();
	if ( $line === false ) {
		// No line could be read at all: never take that as a confirmation.
		return false;
	}

	$answer = trim( $line );
	return $answer === '' || $answer === 'Y' || $answer === 'y';
}