Page MenuHomePhabricator
Paste P5989

Proposed patch for alternate watchlist query
ActivePublic

Authored by Bawolff on Sep 12 2017, 10:39 AM.
Tags
None
Referenced Files
F9478521: Proposed patch for alternate watchlist query
Sep 12 2017, 10:39 AM
Subscribers
None
From c172448fbfdc7bb9494b10173da5ebaeabe8c781 Mon Sep 17 00:00:00 2001
From: Brian Wolff <bawolff+wn@gmail.com>
Date: Tue, 12 Sep 2017 09:08:30 +0000
Subject: [PATCH] Attempt to optimize watchlist query
Switch watchlist (in the common case) to use two queries intead
of one. In the first stage it simply gets a list of rc_ids to
display. The second stage fetches the information about those ids.
The implementation is hacky, however I did not want to change
the interface that extensions see, and I was also wary of
changing the query for some of the more exotic filtering
options.
Its hoped this will improve performance in the case where the
query plan generates a temporary table, by minimizing the
record length size of the temporary table, making the temp table
smaller. Additionally the new query uses a covering index for
the watchlist table access, and index condition pushdown for
detecting if a recentchange entry is too old to be displayed.
Bug: T171027
Change-Id: I41f0a0cbe35e247cfdb861d6e9bd106b4110a2ce
---
includes/specials/SpecialWatchlist.php | 221 ++++++++++++++++++++++++++++++---
1 file changed, 206 insertions(+), 15 deletions(-)
diff --git a/includes/specials/SpecialWatchlist.php b/includes/specials/SpecialWatchlist.php
index 7049744..629e340 100644
--- a/includes/specials/SpecialWatchlist.php
+++ b/includes/specials/SpecialWatchlist.php
@@ -23,6 +23,7 @@
use MediaWiki\MediaWikiServices;
use Wikimedia\Rdbms\ResultWrapper;
+use Wikimedia\Rdbms\FakeResultWrapper;
use Wikimedia\Rdbms\IDatabase;
/**
@@ -408,18 +409,9 @@ class SpecialWatchlist extends ChangesListSpecialPage {
// Log entries with DELETED_ACTION must not show up unless the user has
// the necessary rights.
- if ( !$user->isAllowed( 'deletedhistory' ) ) {
- $bitmask = LogPage::DELETED_ACTION;
- } elseif ( !$user->isAllowedAny( 'suppressrevision', 'viewsuppressed' ) ) {
- $bitmask = LogPage::DELETED_ACTION | LogPage::DELETED_RESTRICTED;
- } else {
- $bitmask = 0;
- }
- if ( $bitmask ) {
- $conds[] = $dbr->makeList( [
- 'rc_type != ' . RC_LOG,
- $dbr->bitAnd( 'rc_deleted', $bitmask ) . " != $bitmask",
- ], LIST_OR );
+ $revDel = $this->makeRevDelFilteringCond( $dbr, $user );
+ if ( $revDel ) {
+ $conds[] = $revDel;
}
$tagFilter = $opts['tagfilter'] ? explode( '|', $opts['tagfilter'] ) : [];
@@ -455,14 +447,213 @@ class SpecialWatchlist extends ChangesListSpecialPage {
// they so desire, override the ORDER BY / LIMIT condition(s)
$query_options = array_merge( $orderByAndLimit, $query_options );
- return $dbr->select(
+ if ( $this->canSplitIntoMultiQuery(
+ $tables, $fields, $conds, $query_options, $join_conds
+ ) ) {
+ return $this->selectAsMultiStep(
+ $dbr,
+ $tables,
+ $fields,
+ $conds,
+ $query_options,
+ $join_conds
+ );
+ } else {
+ return $dbr->select(
+ $tables,
+ $fields,
+ $conds,
+ __METHOD__,
+ $query_options,
+ $join_conds
+ );
+ }
+ }
+
+ /**
+ * Make the where condition to filter rev deleted stuff.
+ *
+ * Log entries with DELETED_ACTION must not show up.
+ *
+ * @param IDatabse $dbr
+ * @param User $user The user to do permission checks with.
+ */
+ private function makeRevDelFilteringCond( IDatabase $dbr, User $user ) {
+ if ( !$user->isAllowed( 'deletedhistory' ) ) {
+ $bitmask = LogPage::DELETED_ACTION;
+ } elseif ( !$user->isAllowedAny( 'suppressrevision', 'viewsuppressed' ) ) {
+ $bitmask = LogPage::DELETED_ACTION | LogPage::DELETED_RESTRICTED;
+ } else {
+ $bitmask = 0;
+ }
+ if ( $bitmask ) {
+ return $dbr->makeList( [
+ 'rc_type != ' . RC_LOG,
+ $dbr->bitAnd( 'rc_deleted', $bitmask ) . " != $bitmask",
+ ], LIST_OR );
+ }
+ }
+
+ /**
+ * Detect if the query is recognizable enough that it can be split
+ *
+ * This query rewriting system is really hacky...
+ *
+ * @return boolean
+ */
+ private function canSplitIntoMultiQuery(
+ $tables,
+ $fields,
+ $conds,
+ $query_options,
+ $join_conds
+ ) {
+ // First make sure nothing was added to the select fields, since
+ // we are selecting them separate from the complex join.
+ foreach( $fields as $alias => $def ) {
+ if (
+ $alias === 'ts_tags' ||
+ substr( $def, 0, 3 ) === 'rc_' ||
+ $def === 'wl_notificationtimestamp' ||
+ $def === 'page_latest' ||
+ $def === 'NULL'
+ ) {
+ continue;
+ }
+ // Some field from some join was added.
+ return false;
+ }
+
+ if (
+ $query_options['ORDER BY'] !== 'rc_timestamp DESC' ||
+ count( $query_options ) !== 2 /* LIMIT and ORDER BY */
+ ) {
+ // We mess around with the ordering, so bail if not default.
+ // If there are other options, probably best not to mess around.
+ // In particular this disqualifies multi-tag filtering.
+ return false;
+ }
+ return true;
+ }
+
+ /**
+ * Do the query in multiple steps.
+ *
+ * This first selects what the rc_id are, and gets details
+ * in a second query.
+ *
+ * The intention of these optimizations are as follows (See T171027):
+ * * Use a covering index for watchlist by fetching wl_notificationtimestamp later
+ * * Minimize the fields in the SELECT list, to minimize the record lengths in the
+ * temporary table when the query plan uses a temporary table. Smaller table will
+ * hopefully move less data around and be faster.
+ * * Use rc_id in place of rc_timestamp where possible - Sorting by rc_id may
+ * reduce size of temporary table slightly. Using rc_id instead of rc_timestamp in
+ * the WHERE condition can allow us to use index condition pushdown with the
+ * name_title index on recentchanges.
+ *
+ * This method assumes arguments have already been checked by canSplitIntoMultiQuery()
+ */
+ private function selectAsMultiStep(
+ IDatabase $dbr,
+ $tables,
+ $fields,
+ $conds,
+ $query_options,
+ $join_conds
+ ) {
+ // Replace rc_timestamp > foo with rc_id > some number.
+ // rc_id monotonically increases so this should be equivalent,
+ // and allows index condition pushdown to be used when using
+ // the index (name_title) we assume we will use in this case.
+ // TODO: Maybe we should just always use rc_id and change it at
+ // the source? I'm unsure how that would affect other variations
+ // of this query that don't use this multi-stage select method.
+ foreach( $conds as $index => &$cond ) {
+ if ( !is_numeric( $index ) || !is_string( $cond ) ) {
+ continue;
+ }
+
+ $m = [];
+ // Hacky... Also may not work on all db backends
+ // But should fallback safely on the db backends it doesn't work on.
+ if ( preg_match( '/^rc_timestamp > ["\'](\d{14})["\']$/', $cond, $m ) ) {
+ $startingId = $dbr->selectField(
+ 'recentchanges',
+ 'rc_id',
+ 'rc_timestamp > ' . $dbr->addQuotes( $m[1] ),
+ __METHOD__ . '-id to ts',
+ [ 'LIMIT' => 1, 'ORDER BY' => 'rc_timestamp ASC' ]
+ );
+ if ( $startingId === false ) {
+ // no results? Lets just leave it alone in that case.
+ continue;
+ }
+ $cond = 'rc_id >= ' . ((int)$startingId);
+ }
+ }
+
+ $query_options['ORDER BY'] = 'rc_id DESC';
+
+ $rcIds = $dbr->selectFieldValues(
$tables,
- $fields,
+ 'rc_id',
$conds,
- __METHOD__,
+ __METHOD__ . '-filter',
$query_options,
$join_conds
);
+ if ( !$rcIds ) {
+ return false;
+ }
+
+ // This doesn't just use $join_cond
+ // as extensions add a bunch of extra joins to that
+ // that we don't need here. All we do here are the
+ // joins that are needed to get the SELECT fields
+ // (page_latest and wl_notificationtimestamp)
+ $detailJoinCond = [
+ 'watchlist' => [
+ 'INNER JOIN',
+ [
+ 'wl_user' => $this->getUser()->getId(),
+ 'wl_namespace=rc_namespace',
+ 'wl_title=rc_title'
+ ],
+ ],
+ 'page' => [
+ 'LEFT JOIN',
+ 'rc_cur_id=page_id'
+ ]
+ ];
+
+ $results = [];
+ // Split into chunks so we don't send a query that
+ // has rc_id IN ( 10000 values here ).
+ // TODO: Is 100 a good chunk size? Does it even
+ // make sense to split into chunks?
+ $chunks = array_chunk( $rcIds, 100 );
+ foreach ( $chunks as $chunk ) {
+ $detailConds = [ 'rc_id' => $chunk ];
+ // As paranoia measure, recheck permissions.
+ $revDel = $this->makeRevDelFilteringCond( $dbr, $this->getUser() );
+ if ( $revDel ) {
+ $detailConds[] = $revDel;
+ }
+
+ $res = $dbr->select(
+ [ 'recentchanges', 'page', 'watchlist' ],
+ $fields,
+ $detailConds,
+ __METHOD__ . '-details',
+ [ 'ORDER BY' => 'rc_id DESC' ],
+ $detailJoinCond
+ );
+ foreach( $res as $row ) {
+ $results[] = $row;
+ }
+ }
+ return new FakeResultWrapper( $results );
}
protected function runMainQueryHook( &$tables, &$fields, &$conds, &$query_options,
--
1.9.5 (Apple Git-50.3)