From c172448fbfdc7bb9494b10173da5ebaeabe8c781 Mon Sep 17 00:00:00 2001 From: Brian Wolff Date: Tue, 12 Sep 2017 09:08:30 +0000 Subject: [PATCH] Attempt to optimize watchlist query Switch watchlist (in the common case) to use two queries intead of one. In the first stage it simply gets a list of rc_ids to display. The second stage fetches the information about those ids. The implementation is hacky, however I did not want to change the interface that extensions see, and I was also wary of changing the query for some of the more exotic filtering options. Its hoped this will improve performance in the case where the query plan generates a temporary table, by minimizing the record length size of the temporary table, making the temp table smaller. Additionally the new query uses a covering index for the watchlist table access, and index condition pushdown for detecting if a recentchange entry is too old to be displayed. Bug: T171027 Change-Id: I41f0a0cbe35e247cfdb861d6e9bd106b4110a2ce --- includes/specials/SpecialWatchlist.php | 221 ++++++++++++++++++++++++++++++--- 1 file changed, 206 insertions(+), 15 deletions(-) diff --git a/includes/specials/SpecialWatchlist.php b/includes/specials/SpecialWatchlist.php index 7049744..629e340 100644 --- a/includes/specials/SpecialWatchlist.php +++ b/includes/specials/SpecialWatchlist.php @@ -23,6 +23,7 @@ use MediaWiki\MediaWikiServices; use Wikimedia\Rdbms\ResultWrapper; +use Wikimedia\Rdbms\FakeResultWrapper; use Wikimedia\Rdbms\IDatabase; /** @@ -408,18 +409,9 @@ class SpecialWatchlist extends ChangesListSpecialPage { // Log entries with DELETED_ACTION must not show up unless the user has // the necessary rights. - if ( !$user->isAllowed( 'deletedhistory' ) ) { - $bitmask = LogPage::DELETED_ACTION; - } elseif ( !$user->isAllowedAny( 'suppressrevision', 'viewsuppressed' ) ) { - $bitmask = LogPage::DELETED_ACTION | LogPage::DELETED_RESTRICTED; - } else { - $bitmask = 0; - } - if ( $bitmask ) { - $conds[] = $dbr->makeList( [ - 'rc_type != ' . RC_LOG, - $dbr->bitAnd( 'rc_deleted', $bitmask ) . " != $bitmask", - ], LIST_OR ); + $revDel = $this->makeRevDelFilteringCond( $dbr, $user ); + if ( $revDel ) { + $conds[] = $revDel; } $tagFilter = $opts['tagfilter'] ? explode( '|', $opts['tagfilter'] ) : []; @@ -455,14 +447,213 @@ class SpecialWatchlist extends ChangesListSpecialPage { // they so desire, override the ORDER BY / LIMIT condition(s) $query_options = array_merge( $orderByAndLimit, $query_options ); - return $dbr->select( + if ( $this->canSplitIntoMultiQuery( + $tables, $fields, $conds, $query_options, $join_conds + ) ) { + return $this->selectAsMultiStep( + $dbr, + $tables, + $fields, + $conds, + $query_options, + $join_conds + ); + } else { + return $dbr->select( + $tables, + $fields, + $conds, + __METHOD__, + $query_options, + $join_conds + ); + } + } + + /** + * Make the where condition to filter rev deleted stuff. + * + * Log entries with DELETED_ACTION must not show up. + * + * @param IDatabse $dbr + * @param User $user The user to do permission checks with. + */ + private function makeRevDelFilteringCond( IDatabase $dbr, User $user ) { + if ( !$user->isAllowed( 'deletedhistory' ) ) { + $bitmask = LogPage::DELETED_ACTION; + } elseif ( !$user->isAllowedAny( 'suppressrevision', 'viewsuppressed' ) ) { + $bitmask = LogPage::DELETED_ACTION | LogPage::DELETED_RESTRICTED; + } else { + $bitmask = 0; + } + if ( $bitmask ) { + return $dbr->makeList( [ + 'rc_type != ' . RC_LOG, + $dbr->bitAnd( 'rc_deleted', $bitmask ) . " != $bitmask", + ], LIST_OR ); + } + } + + /** + * Detect if the query is recognizable enough that it can be split + * + * This query rewriting system is really hacky... + * + * @return boolean + */ + private function canSplitIntoMultiQuery( + $tables, + $fields, + $conds, + $query_options, + $join_conds + ) { + // First make sure nothing was added to the select fields, since + // we are selecting them separate from the complex join. + foreach( $fields as $alias => $def ) { + if ( + $alias === 'ts_tags' || + substr( $def, 0, 3 ) === 'rc_' || + $def === 'wl_notificationtimestamp' || + $def === 'page_latest' || + $def === 'NULL' + ) { + continue; + } + // Some field from some join was added. + return false; + } + + if ( + $query_options['ORDER BY'] !== 'rc_timestamp DESC' || + count( $query_options ) !== 2 /* LIMIT and ORDER BY */ + ) { + // We mess around with the ordering, so bail if not default. + // If there are other options, probably best not to mess around. + // In particular this disqualifies multi-tag filtering. + return false; + } + return true; + } + + /** + * Do the query in multiple steps. + * + * This first selects what the rc_id are, and gets details + * in a second query. + * + * The intention of these optimizations are as follows (See T171027): + * * Use a covering index for watchlist by fetching wl_notificationtimestamp later + * * Minimize the fields in the SELECT list, to minimize the record lengths in the + * temporary table when the query plan uses a temporary table. Smaller table will + * hopefully move less data around and be faster. + * * Use rc_id in place of rc_timestamp where possible - Sorting by rc_id may + * reduce size of temporary table slightly. Using rc_id instead of rc_timestamp in + * the WHERE condition can allow us to use index condition pushdown with the + * name_title index on recentchanges. + * + * This method assumes arguments have already been checked by canSplitIntoMultiQuery() + */ + private function selectAsMultiStep( + IDatabase $dbr, + $tables, + $fields, + $conds, + $query_options, + $join_conds + ) { + // Replace rc_timestamp > foo with rc_id > some number. + // rc_id monotonically increases so this should be equivalent, + // and allows index condition pushdown to be used when using + // the index (name_title) we assume we will use in this case. + // TODO: Maybe we should just always use rc_id and change it at + // the source? I'm unsure how that would affect other variations + // of this query that don't use this multi-stage select method. + foreach( $conds as $index => &$cond ) { + if ( !is_numeric( $index ) || !is_string( $cond ) ) { + continue; + } + + $m = []; + // Hacky... Also may not work on all db backends + // But should fallback safely on the db backends it doesn't work on. + if ( preg_match( '/^rc_timestamp > ["\'](\d{14})["\']$/', $cond, $m ) ) { + $startingId = $dbr->selectField( + 'recentchanges', + 'rc_id', + 'rc_timestamp > ' . $dbr->addQuotes( $m[1] ), + __METHOD__ . '-id to ts', + [ 'LIMIT' => 1, 'ORDER BY' => 'rc_timestamp ASC' ] + ); + if ( $startingId === false ) { + // no results? Lets just leave it alone in that case. + continue; + } + $cond = 'rc_id >= ' . ((int)$startingId); + } + } + + $query_options['ORDER BY'] = 'rc_id DESC'; + + $rcIds = $dbr->selectFieldValues( $tables, - $fields, + 'rc_id', $conds, - __METHOD__, + __METHOD__ . '-filter', $query_options, $join_conds ); + if ( !$rcIds ) { + return false; + } + + // This doesn't just use $join_cond + // as extensions add a bunch of extra joins to that + // that we don't need here. All we do here are the + // joins that are needed to get the SELECT fields + // (page_latest and wl_notificationtimestamp) + $detailJoinCond = [ + 'watchlist' => [ + 'INNER JOIN', + [ + 'wl_user' => $this->getUser()->getId(), + 'wl_namespace=rc_namespace', + 'wl_title=rc_title' + ], + ], + 'page' => [ + 'LEFT JOIN', + 'rc_cur_id=page_id' + ] + ]; + + $results = []; + // Split into chunks so we don't send a query that + // has rc_id IN ( 10000 values here ). + // TODO: Is 100 a good chunk size? Does it even + // make sense to split into chunks? + $chunks = array_chunk( $rcIds, 100 ); + foreach ( $chunks as $chunk ) { + $detailConds = [ 'rc_id' => $chunk ]; + // As paranoia measure, recheck permissions. + $revDel = $this->makeRevDelFilteringCond( $dbr, $this->getUser() ); + if ( $revDel ) { + $detailConds[] = $revDel; + } + + $res = $dbr->select( + [ 'recentchanges', 'page', 'watchlist' ], + $fields, + $detailConds, + __METHOD__ . '-details', + [ 'ORDER BY' => 'rc_id DESC' ], + $detailJoinCond + ); + foreach( $res as $row ) { + $results[] = $row; + } + } + return new FakeResultWrapper( $results ); } protected function runMainQueryHook( &$tables, &$fields, &$conds, &$query_options, -- 1.9.5 (Apple Git-50.3)