Page MenuHomePhabricator

Unindexed pages

Authored By
freephile
Dec 31 2018, 6:41 PM
Size
4 KB
Referenced Files
None
Subscribers
None

Unindexed pages

<?php
/**
* A maintenance script that allows you to see if any documents in an
* Elasticsearch index which are marked 'OK' actually do not have content
* in their associated entry within Elasticsearch. This can happen if the
* index is somehow corrupted (during upgrade?). If this type of corruption
* happens, you'll experience a lack of search results for content that is known
* to be in the wiki.
*
* Elasticsearch will not index that 'missing' content until a new edit is
* made to the page marked as 'OK'. This script
* is possibly the only way (short of some jedi es query) for a MediaWiki admin
* to see which wiki articles are affected by this "black hole" of search.
*
* To avoid changing any content, we attempt to make a 'null edit' for pages
* affected by the situation. These edits do appear in RecentChanges, and can be
* managed with other MediaWiki administrative tools.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup Maintenance
*/
/**
* This is a maintenance script for the CirrusSearch extension.
*/
namespace CirrusSearch;
use CirrusSearch\Maintenance\Maintenance;
use Title;
use WikiPage;
use MediaWiki\MediaWikiServices;
// Resolve the MediaWiki installation directory: the MW_INSTALL_PATH environment
// variable takes precedence; when it is unset, fall back to three directories
// above this script.
$IP = getenv( 'MW_INSTALL_PATH' );
$IP = ( $IP === false ) ? __DIR__ . '/../../..' : $IP;

// Load the core maintenance framework first, then the CirrusSearch maintenance
// base class that the script below extends.
require_once "$IP/maintenance/Maintenance.php";
require_once __DIR__ . '/../includes/Maintenance/Maintenance.php';
/**
 * Maintenance script to check articles to see if they are indexed by Elasticsearch.
 *
 * Every row of the `page` table is looked up in Elasticsearch by document id.
 * Pages for which the lookup succeeds but returns no document are counted as
 * "hidden" and, unless --dry-run is given, re-saved with their current content
 * (a 'null edit') with the intent of getting them indexed again.
 *
 * @ingroup Maintenance
 */
class PurgeUnindexedPages extends Maintenance {
	/**
	 * @var bool True when --dry-run was given: report only, never edit.
	 * Kept public because it used to be created as a (public) dynamic property.
	 */
	public $mNope = false;

	/**
	 * @var bool True when --verbose was given: list every unindexed title.
	 * Kept public because it used to be created as a (public) dynamic property.
	 */
	public $mVerbose = false;

	/**
	 * Registers the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Check articles to see if they are indexed by Elasticsearch' );
		// addOption( $name, $description, $required = false, $withArg = false,
		//            $shortName = false, $multiOccurrence = false )
		$this->addOption(
			'dry-run',
			'Do not perform any corrections/edits with "-n or --dry-run"',
			false,
			false,
			'n'
		);
		$this->addOption(
			'verbose',
			'List the titles with "-v or --verbose"',
			false,
			false,
			'v'
		);
		// Parent method; also adds the --batch-size option.
		$this->setBatchSize( 1000 );
	}

	/**
	 * Walks the page table in batches, checks each page against Elasticsearch
	 * and null-edits the pages that have no document (unless --dry-run).
	 *
	 * Prints the affected titles (with --verbose), a line per corrected page,
	 * and a final summary of how many pages were examined and found missing.
	 */
	public function execute() {
		global $wgUser;

		$this->mNope = $this->hasOption( 'dry-run' );
		$this->mVerbose = $this->hasOption( 'verbose' );

		$config = MediaWikiServices::getInstance()->getConfigFactory()->makeConfig( 'CirrusSearch' );
		$conn = new Connection( $config );
		$searcher = new Searcher( $conn, 0, 0, $config, [], $wgUser );
		$db = wfGetDB( DB_REPLICA );

		$lastPageId = 0; // highest page_id already processed
		$numArticles = 0; // how many articles are there?
		$numBad = 0; // how many bad articles did we find?
		$numErrors = 0; // how many Elasticsearch lookups failed outright?

		do {
			// Page through the table by page_id. It is unique, so no row can be
			// skipped at a batch boundary. Paging by page_title alone (as this
			// script used to) drops pages that share a title across namespaces
			// whenever such a group straddles two batches.
			$res = $db->select(
				'page',
				[ 'page_id', 'page_namespace', 'page_title' ],
				[ 'page_id > ' . (int)$lastPageId ],
				__METHOD__,
				[ 'ORDER BY' => 'page_id', 'LIMIT' => $this->getBatchSize() ]
			);

			foreach ( $res as $row ) {
				$numArticles++;
				$lastPageId = (int)$row->page_id;

				$title = Title::makeTitleSafe( $row->page_namespace, $row->page_title );
				if ( $title === null ) {
					$this->output( "unable to create title object from " .
						"{$row->page_namespace}: {$row->page_title}\n" );
					continue;
				}

				// The row already carries the page id, so no extra lookup is needed.
				$docId = $config->makeId( (int)$row->page_id );
				$esSources = $searcher->get( [ $docId ], true );

				// A failed lookup tells us nothing about whether the page is
				// indexed, so do not treat it as missing (and do not edit it).
				$found = $esSources->isOK() ? $esSources->getValue() : null;
				if ( !is_array( $found ) ) {
					$numErrors++;
					$this->error( "unable to query Elasticsearch for " . $title->getPrefixedText() );
					continue;
				}

				// We erroneously relied on if ( !$esSources->isOK() )
				// until it was discovered that
				// the bad articles were already marked 'OK'.
				// The real signal is an OK status with an empty result list.
				if ( count( $found ) > 0 ) {
					continue;
				}

				$numBad++;
				// Use the prefixed title: the scan covers every namespace, so the
				// bare text would be ambiguous.
				$name = $title->getPrefixedText();
				if ( $this->mVerbose ) {
					$this->output( $name . "\n" );
				}
				if ( $this->mNope ) {
					continue;
				}

				$page = new WikiPage( $title );
				$content = $page->getContent();
				if ( $content === null ) {
					// Nothing to re-save.
					$this->output( $name . " has no content; skipped\n" );
					continue;
				}
				$editStatus = $page->doEditContent(
					$content, 'This changes nothing', EDIT_UPDATE, false, $wgUser
				);
				if ( $editStatus->isOK() ) {
					$this->output( $name . " fixed\n" );
				} else {
					$this->error( "null edit failed for " . $name );
				}
			}
		} while ( $res->numRows() );

		$this->output( "Found $numBad hidden articles out of $numArticles.\n\n" );
		if ( $numErrors > 0 ) {
			$this->output( "$numErrors articles could not be checked because the Elasticsearch lookup failed.\n\n" );
		}
	}
}
// Name of the maintenance class to run; ::class expands to the fully
// qualified name CirrusSearch\PurgeUnindexedPages.
$maintClass = PurgeUnindexedPages::class;
// RUN_MAINTENANCE_IF_MAIN is defined outside this file; presumably it points at
// the runner that executes $maintClass when this script is the CLI entry point.
require_once RUN_MAINTENANCE_IF_MAIN;

File Metadata

Mime Type
text/plain; charset=utf-8
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
6906826
Default Alt Text
Unindexed pages (4 KB)

Event Timeline