Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F27765895
Unindexed pages
No One
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Authored By
freephile
Dec 31 2018, 6:41 PM
2018-12-31 18:41:25 (UTC+0)
Size
4 KB
Referenced Files
None
Subscribers
None
Unindexed pages
View Options
<?php
/**
* A maintenance script that allows you to see if any documents in an
* Elasticsearch index which are marked 'OK' actually do not have content
* in their associated entry within Elasticsearch. This can happen if the
* index is somehow corrupted (during upgrade?). If this type of corruption
* happens, you'll experience a lack of search results for content that is known
* to be in the wiki.
*
* Elasticsearch will not index that 'missing' content until a new edit is
* made to the page marked as 'OK'. This script
* is possibly the only way (short of some jedi es query) for a MediaWiki admin
* to see which wiki articles are affected by this "black hole" of search.
*
* To avoid changing any content, we attempt to make a 'null edit' for pages
* affected by the situation. These edits do appear in RecentChanges, and can be
* managed with other MediaWiki administrative tools.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup Maintenance
*/
/**
* This script is a maintenance script for CirrusSearch.
*/
namespace
CirrusSearch
;
use
CirrusSearch\Maintenance\Maintenance
;
use
Title
;
use
WikiPage
;
use
MediaWiki\MediaWikiServices
;
// Resolve the MediaWiki installation root: take MW_INSTALL_PATH from the
// environment when it is set, otherwise fall back to three directories above
// this script.
$IP = getenv( 'MW_INSTALL_PATH' );
$IP = ( $IP === false ) ? __DIR__ . '/../../..' : $IP;

// Load the Maintenance.php under the installation root first, then the
// Maintenance.php shipped alongside this script's extension (the parent of
// the class defined in this file).
require_once $IP . '/maintenance/Maintenance.php';
require_once __DIR__ . '/../includes/Maintenance/Maintenance.php';
/**
 * Maintenance script to check articles to see if they are indexed by Elasticsearch.
 *
 * Every row of the `page` table is looked up in the search index by document
 * ID. A page for which the lookup succeeds but returns no document is counted
 * as "hidden"; unless --dry-run is given, such a page is re-saved with its
 * current content (a null edit).
 *
 * @ingroup Maintenance
 */
class PurgeUnindexedPages extends Maintenance {

	/** @var bool True when --dry-run was given: report only, never edit a page */
	protected $mNope = false;

	/** @var bool True when --verbose was given: print the title of every hidden page */
	protected $mVerbose = false;

	/**
	 * Register the script description, its command-line options and the
	 * default batch size.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Check articles to see if they are indexed by Elasticsearch' );
		// $name, $description, $required = false, $withArg = false, $shortName = false, $multiOccurrence = false
		$this->addOption(
			'dry-run',
			'Do not perform any corrections/edits with "-n or --dry-run"',
			false,
			false,
			'n'
		);
		$this->addOption(
			'verbose',
			'List the titles with "-v or --verbose"',
			false,
			false,
			'v'
		);
		// parent method; adds --batch-size option
		$this->setBatchSize( 1000 );
	}

	/**
	 * Walk the whole `page` table in batches, count the pages that have no
	 * document in the search index and, unless --dry-run is set, null-edit them.
	 *
	 * Batches are keyed on page_id. Paging on page_title alone (as this script
	 * used to) skips rows: titles are only unique per namespace, so when a batch
	 * ended on a title, same-titled pages in other namespaces were never
	 * visited. As a consequence pages are now reported in page_id order.
	 */
	public function execute() {
		global $wgUser;

		// page_id of the last row seen; the next batch starts after it
		$lastPageId = 0;
		// how many articles are there?
		$numArticles = 0;
		// how many bad articles did we find?
		$numBad = 0;

		// do not edit anything
		$this->mNope = $this->hasOption( 'dry-run' );
		// print a list (and stats)
		$this->mVerbose = $this->hasOption( 'verbose' );

		$config = MediaWikiServices::getInstance()
			->getConfigFactory()
			->makeConfig( 'CirrusSearch' );
		$conn = new Connection( $config );
		$searcher = new Searcher( $conn, 0, 0, $config, [], $wgUser );
		$db = wfGetDB( DB_REPLICA );

		do {
			// $table, $vars, $conds = '', $fname = __METHOD__, $options = [], $join_conds = []
			$res = $db->select(
				'page',
				[ 'page_id', 'page_namespace', 'page_title' ],
				[ 'page_id > ' . (int)$lastPageId ],
				__METHOD__,
				[
					'ORDER BY' => 'page_id',
					'LIMIT' => $this->getBatchSize()
				]
			);

			foreach ( $res as $row ) {
				$numArticles++;
				$lastPageId = (int)$row->page_id;

				$title = Title::makeTitleSafe( $row->page_namespace, $row->page_title );
				if ( $title === null ) {
					$this->output( "unable to create title object from "
						. "{$row->page_namespace}: {$row->page_title}\n" );
					continue;
				}

				// The row already carries the page ID, so no extra lookup
				// through the Title object is needed.
				$docId = $config->makeId( (int)$row->page_id );
				$esSources = $searcher->get( [ $docId ], true );

				// A failed lookup says nothing about whether the page is
				// indexed, so it is reported and skipped rather than counted
				// (and edited) as missing. NOTE(review): assumes a failed
				// Status carries no usable value - confirm against Searcher::get().
				if ( !$esSources->isOK() ) {
					$this->error( "unable to query the search index for "
						. $title->getPrefixedText() );
					continue;
				}

				// We erroneously relied on if ( !$esSources->isOK() )
				// until it was discovered that
				// the bad articles were already marked 'OK'.
				// So a "bad" article is one whose lookup succeeded but
				// returned no document at all.
				$sources = $esSources->value;
				if ( is_array( $sources ) && count( $sources ) > 0 ) {
					continue;
				}

				$numBad++;
				if ( $this->mVerbose ) {
					$this->output( $title->getText() . "\n" );
				}
				if ( $this->mNope ) {
					continue;
				}

				$page = new WikiPage( $title );
				$content = $page->getContent();
				if ( $content === null ) {
					// Nothing to re-save (no accessible current revision).
					$this->error( "unable to load content of " . $title->getPrefixedText() );
					continue;
				}

				// Re-save the current content unchanged (null edit).
				$status = $page->doEditContent(
					$content,
					'This changes nothing',
					EDIT_UPDATE,
					false,
					$wgUser
				);
				if ( !$status->isOK() ) {
					$this->error( "unable to edit " . $title->getPrefixedText() );
					continue;
				}
				$this->output( $title->getText() . " fixed\n" );
			}
		} while ( $res->numRows() );

		$this->output( "Found $numBad hidden articles out of $numArticles.\n\n" );
	}
}
// Publish the fully-qualified name of the script class in $maintClass, then
// include the file named by RUN_MAINTENANCE_IF_MAIN. That constant is defined
// outside this file (presumably by the core Maintenance.php required above).
$maintClass = PurgeUnindexedPages::class;
require_once RUN_MAINTENANCE_IF_MAIN;
File Metadata
Details
Attached
Mime Type
text/plain; charset=utf-8
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
6906826
Default Alt Text
Unindexed pages (4 KB)
Attached To
Mode
P7943 Unindexed pages
Attached
Detach File
Event Timeline
Log In to Comment