Page MenuHomePhabricator

T124404-REL1_29.patch

Authored By
Bawolff
Nov 13 2017, 6:24 PM
Size
5 KB
Referenced Files
None
Subscribers
None

T124404-REL1_29.patch

From ed8bc226841e508ac07af4c62f1f2cb6cc131935 Mon Sep 17 00:00:00 2001
From: Brian Wolff <bawolff+wn@gmail.com>
Date: Sun, 24 Jan 2016 05:29:10 -0500
Subject: [PATCH] SECURITY: XSS in langconverter when regex hits
pcre.backtrack_limit
Adjust regexes for what not to convert to avoid backtracking by
preferring possesive quantifiers
Add check that we really have matched to the end of the string, and
log error if the regex hits some sort of error preventing the
entire string from being matched. Should the regex not match to the
end, then language conversion is disabled for the string.
Bug: T124404
Change-Id: I4f0c171c7da804e9c1508ef1f59556665a318f6a
---
languages/LanguageConverter.php | 57 +++++++++++++++++------
tests/phpunit/languages/LanguageConverterTest.php | 19 ++++++++
2 files changed, 61 insertions(+), 15 deletions(-)
diff --git a/languages/LanguageConverter.php b/languages/LanguageConverter.php
index 4c3e5be..e5a437f 100644
--- a/languages/LanguageConverter.php
+++ b/languages/LanguageConverter.php
@@ -20,6 +20,8 @@
*/
use MediaWiki\MediaWikiServices;
+use MediaWiki\Logger\LoggerFactory;
+
/**
* Base class for language conversion.
* @ingroup Language
@@ -359,24 +361,30 @@ class LanguageConverter {
}
/* we convert everything except:
- * 1. HTML markups (anything between < and >)
- * 2. HTML entities
- * 3. placeholders created by the parser
- */
- $marker = '|' . Parser::MARKER_PREFIX . '[\-a-zA-Z0-9]+';
+ 1. HTML markups (anything between < and >)
+ 2. HTML entities
+ 3. placeholders created by the parser
+ IMPORTANT: Beware of failure from pcre.backtrack_limit (T124404).
+ Minimize use of backtracking where possible.
+ */
+ $marker = '|' . Parser::MARKER_PREFIX . '[^\x7f]++\x7f';
// this one is needed when the text is inside an HTML markup
- $htmlfix = '|<[^>]+$|^[^<>]*>';
+ $htmlfix = '|<[^>\004]++(?=\004$)|^[^<>]*+>';
+
+ // Optimize for the common case where these tags have
+ // few or no children. Thus try and possesively get as much as
+ // possible, and only engage in backtracking when we hit a '<'.
// disable convert to variants between <code> tags
- $codefix = '<code>.+?<\/code>|';
+ $codefix = '<code>[^<]*+(?:(?:(?!<\/code>).)[^<]*+)*+<\/code>|';
// disable conversion of <script> tags
- $scriptfix = '<script.*?>.*?<\/script>|';
+ $scriptfix = '<script[^>]*+>[^<]*+(?:(?:(?!<\/script>).)[^<]*+)*+<\/script>|';
// disable conversion of <pre> tags
- $prefix = '<pre.*?>.*?<\/pre>|';
+ $prefix = '<pre[^>]*+>[^<]*+(?:(?:(?!<\/pre>).)[^<]*+)*+<\/pre>|';
$reg = '/' . $codefix . $scriptfix . $prefix .
- '<[^>]+>|&[a-zA-Z#][a-z0-9]+;' . $marker . $htmlfix . '/s';
+ '<[^>]++>|&[a-zA-Z#][a-z0-9]++;' . $marker . $htmlfix . '|\004$/s';
$startPos = 0;
$sourceBlob = '';
$literalBlob = '';
@@ -387,15 +395,34 @@ class LanguageConverter {
$markupMatches = null;
$elementMatches = null;
+
+ // We add a marker (\004) at the end of text, to ensure we always match the
+ // entire text (Otherwise, pcre.backtrack_limit might cause silent failure)
while ( $startPos < strlen( $text ) ) {
- if ( preg_match( $reg, $text, $markupMatches, PREG_OFFSET_CAPTURE, $startPos ) ) {
+ if ( preg_match( $reg, $text . "\004", $markupMatches, PREG_OFFSET_CAPTURE, $startPos ) ) {
$elementPos = $markupMatches[0][1];
$element = $markupMatches[0][0];
+ if ( $element === "\004" ) {
+ // We hit the end.
+ $elementPos = strlen( $text );
+ $element = '';
+ }
} else {
- $elementPos = strlen( $text );
- $element = '';
+ // If we hit here, then Language Converter could be tricked
+ // into doing an XSS, so we refuse to translate.
+ // If non-crazy input manages to reach this code path,
+ // we should consider it a bug.
+ $log = LoggerFactory::getInstance( 'languageconverter' );
+ $log->error( "Hit pcre.backtrack_limit in " . __METHOD__
+ . ". Disabling language conversion for this page.",
+ array(
+ "method" => __METHOD__,
+ "variant" => $toVariant,
+ "startOfText" => substr( $text, 0, 500 )
+ )
+ );
+ return $text;
}
-
// Queue the part before the markup for translation in a batch
$sourceBlob .= substr( $text, $startPos, $elementPos - $startPos ) . "\000";
@@ -404,7 +431,7 @@ class LanguageConverter {
// Translate any alt or title attributes inside the matched element
if ( $element !== ''
- && preg_match( '/^(<[^>\s]*)\s([^>]*)(.*)$/', $element, $elementMatches )
+ && preg_match( '/^(<[^>\s]*+)\s([^>]*+)(.*+)$/', $element, $elementMatches )
) {
$attrs = Sanitizer::decodeTagAttributes( $elementMatches[2] );
$changed = false;
diff --git a/tests/phpunit/languages/LanguageConverterTest.php b/tests/phpunit/languages/LanguageConverterTest.php
index 81184aa..331368e 100644
--- a/tests/phpunit/languages/LanguageConverterTest.php
+++ b/tests/phpunit/languages/LanguageConverterTest.php
@@ -157,6 +157,25 @@ class LanguageConverterTest extends MediaWikiLangTestCase {
$wgRequest->setVal( 'variant', null );
$this->assertEquals( 'tg', $this->lc->getPreferredVariant() );
}
+
+ /**
+ * Test exhausting pcre.backtrack_limit
+ */
+ public function testAutoConvertT124404() {
+ $testString = '';
+ for ( $i = 0; $i < 1000; $i++ ) {
+ $testString .= 'xxx xxx xxx';
+ }
+ $testString .= "\n<big id='в'></big>";
+ $old = ini_set('pcre.backtrack_limit', 200 );
+ $result = $this->lc->autoConvert( $testString, 'tg-latn' );
+ ini_set( 'pcre.backtrack_limit', $old );
+ // The в in the id attribute should not get converted to a v
+ $this->assertFalse(
+ strpos( $result, 'v' ),
+ "в converted to v despite being in attribue"
+ );
+ }
}
/**
--
1.9.5 (Apple Git-50.3)

File Metadata

Mime Type
text/x-diff
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
5087912
Default Alt Text
T124404-REL1_29.patch (5 KB)

Event Timeline