Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F12217
0001-SECURITY-UtfNormal-add-remaining-noncharacters.patch
acl*security
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Authored By
•
bzimport
Nov 22 2014, 2:22 AM
2014-11-22 02:22:30 (UTC+0)
Size
4 KB
Referenced Files
None
Subscribers
None
0001-SECURITY-UtfNormal-add-remaining-noncharacters.patch
View Options
From 840a28c7044b3a2d4e5fdfa31858599d04065783 Mon Sep 17 00:00:00 2001
From: Kevin Israel <pleasestand@live.com>
Date: Fri, 11 Oct 2013 22:13:06 -0400
Subject: [PATCH] SECURITY: UtfNormal: add remaining noncharacters
It may be wise to replace all noncharacters with U+FFFD, not just
U+FFFE and U+FFFF. Some libraries (e.g. PCRE 8.32) and other
applications raise errors when they encounter noncharacters, and
those errors might not be handled properly.
Bug: 55548
Change-Id: I63ca3217a882fb4b156e0706285b1216319906f0
---
includes/normal/RandomTest.php | 15 ++++++++++++---
includes/normal/Utf8Test.php | 4 ++--
includes/normal/UtfNormal.php | 43 ++++++++++++++++++++++++++++--------------
3 files changed, 43 insertions(+), 19 deletions(-)
diff --git a/includes/normal/RandomTest.php b/includes/normal/RandomTest.php
index 0602986..c1ed0b9 100644
--- a/includes/normal/RandomTest.php
+++ b/includes/normal/RandomTest.php
@@ -48,9 +48,18 @@ function randomString( $length, $nullOk, $ascii = false ) {
/* Duplicate of the cleanUp() path for ICU usage */
function donorm( $str ) {
# We exclude a few chars that ICU would not.
- $str = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $str );
- $str = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $str );
- $str = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $str );
+ $str = preg_replace( '/
+
+ # Control characters illegal in XML
+ [\x00-\x08\x0b\x0c\x0e-\x1f] |
+
+ # U+FDD0..U+FDEF, U+FFFE, U+FFFF
+ \xef(?:\xb7[\x90-\xaf]|\xbf[\xbe\xbf]) |
+
+ # U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, ..., U+10FFFE, U+10FFFF
+ [\xf0-\xf4][\x8f\x9f\xaf\xbf]\xbf[\xbe\xbf]
+
+ /Sx', UTF8_REPLACEMENT, $str );
# UnicodeString constructor fails if the string ends with a head byte.
# Add a junk char at the end, we'll strip it off
diff --git a/includes/normal/Utf8Test.php b/includes/normal/Utf8Test.php
index c5c1be5..734ff7d 100644
--- a/includes/normal/Utf8Test.php
+++ b/includes/normal/Utf8Test.php
@@ -74,8 +74,8 @@ $exceptions = array(
# sequences beyond what is now considered legal.
'2.1.5', '2.1.6', '2.2.4', '2.2.5', '2.2.6', '2.3.5',
- # Literal 0xffff, which is illegal
- '2.2.3' );
+ # Noncharacters, which we have chosen to replace with U+FFFD
+ '2.2.3', '2.3.4' );
$longTests = array(
# These tests span multiple lines
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index 5a091af..6809bb8 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -382,14 +382,24 @@ class UtfNormal {
|| ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
|| ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)
- # U+FFFE and U+FFFF are explicitly forbidden in Unicode.
- || ($n == 0xef &&
- ($sequence == UTF8_FFFE)
- || ($sequence == UTF8_FFFF) )
+ # Noncharacters are reserved for internal use. The Unicode Standard
+ # doesn't require that we do anything with them, though we replace
+ # U+FFFE and U+FFFF because they are forbidden in XML, and the rest
+ # because some libraries have trouble with them too (e.g. PCRE 8.32).
+ #
+ # Also, Unicode has been limited to 21 bits; longer sequences
+ # (those greater than UTF8_MAX) are not allowed.
- # Unicode has been limited to 21 bits; longer
- # sequences are not allowed.
- || ($n >= 0xf0 && $sequence > UTF8_MAX) ) {
+ || ($n == 0xef && (
+ ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF)
+ || ($sequence == UTF8_FFFE)
+ || ($sequence == UTF8_FFFF) ) )
+
+ || ($n >= 0xf0 && (
+ ($sequence & "\xf0\x8f\xbf\xbe") == "\xf0\x8f\xbf\xbe"
+ || ($sequence > UTF8_MAX) ) )
+
+ ) {
$replace[] = array( UTF8_REPLACEMENT,
$base + $i + 1 - strlen( $sequence ),
@@ -766,12 +776,17 @@ class UtfNormal {
* @return String String with the character codes replaced.
*/
private static function replaceForNativeNormalize( $string ) {
- $string = preg_replace(
- '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
- UTF8_REPLACEMENT,
- $string );
- $string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
- $string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );
- return $string;
+ return preg_replace( '/
+
+ # Control characters illegal in XML
+ [\x00-\x08\x0b\x0c\x0e-\x1f] |
+
+ # U+FDD0..U+FDEF, U+FFFE, U+FFFF
+ \xef(?:\xb7[\x90-\xaf]|\xbf[\xbe\xbf]) |
+
+ # U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, ..., U+10FFFE, U+10FFFF
+ [\xf0-\xf4][\x8f\x9f\xaf\xbf]\xbf[\xbe\xbf]
+
+ /Sx', UTF8_REPLACEMENT, $string );
}
}
--
1.8.4.2
File Metadata
Details
Attached
Mime Type
text/x-diff
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
11674
Default Alt Text
0001-SECURITY-UtfNormal-add-remaining-noncharacters.patch (4 KB)
Attached To
Mode
T57548: Html::expandAttributes can be tricked into omitting necessary quotes
Attached
Detach File
Event Timeline
Log In to Comment