Page MenuHomePhabricator

bug71394-tokenizer.patch

Authored By
bzimport
Nov 22 2014, 3:46 AM
Size
53 KB
Referenced Files
None
Subscribers
None

bug71394-tokenizer.patch

From b2bf756c1483c0dc65110bdd1b63710e9cdd6edc Mon Sep 17 00:00:00 2001
From: csteipp <csteipp@wikimedia.org>
Date: Wed, 5 Nov 2014 15:42:20 -0800
Subject: [PATCH] SECURITY: Properly remove html from conversion text
When converting a text to a variant, html should not be converted. This
patch parses the text as html5, and protects any html from translation.
Change-Id: I268fdb9be3c9f7f020aab3a0200db6b7a0beddaa
---
includes/AutoLoader.php | 1 +
includes/Html5Tokenizer.php | 1364 +++++++++++++++++++++++++
languages/LanguageConverter.php | 89 +-
tests/phpunit/includes/Html5TokenizerTest.php | 132 +++
4 files changed, 1559 insertions(+), 27 deletions(-)
create mode 100644 includes/Html5Tokenizer.php
create mode 100644 tests/phpunit/includes/Html5TokenizerTest.php
diff --git a/includes/AutoLoader.php b/includes/AutoLoader.php
index 172bd49..460730a 100644
--- a/includes/AutoLoader.php
+++ b/includes/AutoLoader.php
@@ -82,6 +82,7 @@ $wgAutoloadLocalClasses = array(
'HistoryBlobStub' => 'includes/HistoryBlob.php',
'Hooks' => 'includes/Hooks.php',
'Html' => 'includes/Html.php',
+ 'Html5Tokenizer' => 'includes/Html5Tokenizer.php',
'HtmlFormatter' => 'includes/HtmlFormatter.php',
'HTMLApiField' => 'includes/htmlform/HTMLApiField.php',
'HTMLAutoCompleteSelectField' => 'includes/htmlform/HTMLAutoCompleteSelectField.php',
diff --git a/includes/Html5Tokenizer.php b/includes/Html5Tokenizer.php
new file mode 100644
index 0000000..e551c44
--- /dev/null
+++ b/includes/Html5Tokenizer.php
@@ -0,0 +1,1364 @@
+<?php
+/**
+ * HTML Tokenizer for %MediaWiki. Parses a string according to the html 5
+ * tokenizing spec [http://dev.w3.org/html5/spec-preview/tokenization.html],
+ * except for when we could optimize for this specific MediaWiki task.
+ *
+ * This is based heavily on the html5lib-php project
+ * (https://github.com/html5lib/html5lib-php), licensed as:
+ *
+ * Copyright 2007 Jeroen van der Meer <http://jero.net/>
+ * Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/>
+ * Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * @file
+ */
+
+// In general:
+// /* */ indicates verbatim text from the HTML 5 specification
+// // indicates regular comments
+
+class Html5Tokenizer {
+
+ /**
+ * @var HTML5_InputStream the stream we parse to find each chunk of html
+ */
+ private $stream;
+
+ /**
+ * Offset into the data where the next html chunk starts; parse() sets it from the stream position
+ */
+ public $elementStart;
+
+ /**
+ * The next html chunk; parse() resets this to null at the start of each call
+ */
+ public $element;
+
+ /**
+ * @var array Names of elements for which we return the entire
+ * content. Usually array( 'pre', 'code', 'style', 'script' );
+ */
+ private $entireElements;
+
+ /**
+ * @var bool Do we return /^[^<]*>/ as an html token?
+ */
+ private $flagCloseOnly;
+
+ /**
+ * @var bool Do we return /<[^>]*$/ as an html token?
+ */
+ private $flagOpenOnly;
+
+ /**
+ * @var array|null Current token that is being built, but not yet emitted. Also
+ * is the last token emitted, if applicable.
+ */
+ protected $token;
+
+ // Token type constants; parse() stores one of these under a token's 'type' key
+ const DOCTYPE = 0;
+ const STARTTAG = 1;
+ const ENDTAG = 2;
+ const COMMENT = 3;
+ const CHARACTER = 4;
+ const SPACECHARACTER = 5;
+ const EOF = 6;
+ const PARSEERROR = 7;
+
+ // Character-class strings; some are passed to the stream's charsWhile()/charsUntil() in parse()
+ const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
+ const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
+ const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
+ const DIGIT = '0123456789';
+ const HEX = '0123456789ABCDEFabcdef';
+ const WHITESPACE = "\t\n\x0c ";
+
+ /**
+ * @param string $data the html string to tokenize
+ * @param array $entireElements element names (matched against lowercased tag names) to get the entire contents of
+ */
+ public function __construct( $data, array $entireElements = array() ) {
+ $this->stream = new HTML5_InputStream( $data );
+ $this->entireElements = $entireElements;
+ $this->flagCloseOnly = false; // fragment matching stays off until setFlagCloseOnly() enables it
+ $this->flagOpenOnly = false; // likewise for setFlagOpenOnly()
+ }
+
+ /**
+ * @param bool $flag whether parse() returns text up to a '>' with no '<' before it (/^[^<]*>/) as an html token
+ */
+ public function setFlagCloseOnly( $flag ) {
+ $this->flagCloseOnly = $flag;
+ }
+
+ /**
+ * @param bool $flag whether parse(), on hitting end of input after a '<', returns that tail (/<[^>]*$/) as an html token
+ */
+ public function setFlagOpenOnly( $flag ) {
+ $this->flagOpenOnly = $flag;
+ }
+
+ /**
+ * @return array|null the token parse() built most recently; null if no token has been built yet
+ */
+ public function getLastToken() {
+ return $this->token;
+ }
+
+
+ public function checkEntireElementMatching( $matchingEntireElement ) {
+ if ( !$matchingEntireElement ) {
+ return true; // no entire-element match pending, so the finished token ends the chunk
+ }
+ $isClosingToken = $this->token['type'] === self::ENDTAG || !empty( $this->token['self-closing'] );
+ return $isClosingToken && $this->token['name'] === $matchingEntireElement;
+ }
+
+ /**
+ * Performs the actual parsing of the document. Each call will return
+ * the next chunk of html in the string. We only handle PCDATA content model.
+ *
+ * Access the html chunk and its offset in the string through the public $element
+ * and $elementStart members of this class.
+ *
+ * @return bool true if we identified a chunk of html in the remaining string
+ */
+ public function parse() {
+ // Current state
+ $state = 'data';
+
+ // This is used to avoid having to have look-behind in the data state.
+ $lastFourChars = '';
+
+ /**
+ * Escape flag as specified by the HTML5 specification: "used to
+ * control the behavior of the tokeniser. It is either true or
+ * false, and initially must be set to the false state."
+ */
+ $escape = false;
+
+ // Have we started marking an html tag to return?
+ $haveElement = false;
+
+ // Are we matching the entire body of a specific element?
+ $matchingEntireElement = false;
+
+ // In case we need to handle flagCloseOnly
+ $this->elementStart = $this->stream->getPos();
+
+ $this->element = null;
+
+ while( $state !== null ) {
+
+ switch($state) {
+
+ case 'data':
+ /* Consume the next input character */
+ $char = $this->stream->char();
+ $lastFourChars .= $char;
+ if ( strlen( $lastFourChars ) > 4 ) {
+ $lastFourChars = substr( $lastFourChars, -4 );
+ }
+
+ /* U+003C LESS-THAN SIGN (<) */
+ if ( $char === '<' ) {
+ /* When the content model flag is set to the PCDATA state: switch
+ to the tag open state. */
+ $state = 'tag open';
+ if ( !$matchingEntireElement ) {
+ $this->elementStart = $this->stream->getPos() - 1;
+ $haveElement = true;
+ }
+
+ } elseif ( $char === '>'
+ && !$haveElement
+ && $this->flagCloseOnly
+ ) {
+ // For MediaWiki, an unopened closing tag could mean
+ // the beginning of this fragment is in an element context
+ if ( !$matchingEntireElement ) {
+ $this->element = $this->stream->getSubstr(
+ $this->elementStart,
+ $this->stream->getPos()
+ );
+ return true;
+ }
+
+ } elseif ( $char === false ) {
+ /* EOF : Emit an end-of-file token. */
+ $state = null;
+ if ( $haveElement && $this->flagOpenOnly
+ || $matchingEntireElement
+ ) {
+ $this->element = $this->stream->getSubstr(
+ $this->elementStart,
+ $this->stream->getPos()
+ );
+ return true;
+ }
+
+ return false;
+
+ } elseif ( $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' ) {
+ // Directly after emitting a token you switch back to the "data
+ // state". At that point spaceCharacters are important so they are
+ // emitted separately.
+ $chars = $this->stream->charsWhile( self::WHITESPACE );
+ $lastFourChars .= $chars;
+ if ( strlen( $lastFourChars ) > 4 ) {
+ $lastFourChars = substr( $lastFourChars, -4 );
+ }
+
+ } else {
+ /* Anything else
+ THIS IS AN OPTIMIZATION: Get as many character that
+ otherwise would also be treated as a character token and emit it
+ as a single character token. Stay in the data state. */
+ $chars = $this->stream->charsUntil( '<>' );
+ $lastFourChars .= $chars;
+ if ( strlen( $lastFourChars ) > 4 ) {
+ $lastFourChars = substr( $lastFourChars, -4 );
+ }
+ $state = 'data';
+ }
+ break;
+
+ case 'tag open':
+ $char = $this->stream->char();
+
+ if ( $char === '!' ) {
+ /* U+0021 EXCLAMATION MARK (!)
+ Switch to the markup declaration open state. */
+ $state = 'markup declaration open';
+
+ } elseif ( $char === '/' ) {
+ /* U+002F SOLIDUS (/)
+ Switch to the close tag open state. */
+ $state = 'close tag open';
+
+ } elseif( 'A' <= $char && $char <= 'Z' ) {
+ /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
+ Create a new start tag token, set its tag name to the lowercase
+ version of the input character (add 0x0020 to the character's code
+ point), then switch to the tag name state. (Don't emit the token
+ yet; further details will be filled in before it is emitted.) */
+ $this->token = array(
+ 'name' => strtolower( $char ),
+ 'type' => self::STARTTAG,
+ 'attr' => array()
+ );
+
+ $state = 'tag name';
+
+ } elseif( 'a' <= $char && $char <= 'z' ) {
+ /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
+ Create a new start tag token, set its tag name to the input
+ character, then switch to the tag name state. (Don't emit
+ the token yet; further details will be filled in before it
+ is emitted.) */
+ $this->token = array(
+ 'name' => $char,
+ 'type' => self::STARTTAG,
+ 'attr' => array()
+ );
+
+ $state = 'tag name';
+
+ } elseif ( $char === '>' ) {
+ /* U+003E GREATER-THAN SIGN (>)
+ Parse error. Emit a U+003C LESS-THAN SIGN character token and a
+ U+003E GREATER-THAN SIGN character token. Switch to the data state. */
+ // For MediaWiki, we don't care about returning "<>"
+ $state = 'data';
+
+ } elseif ( $char === '?' ) {
+ /* U+003F QUESTION MARK (?)
+ Parse error. Switch to the bogus comment state. */
+ $this->token = array(
+ 'data' => '?',
+ 'type' => self::COMMENT
+ );
+ $state = 'bogus comment';
+
+ } else {
+ /* Anything else
+ Parse error. Emit a U+003C LESS-THAN SIGN character token and
+ reconsume the current input character in the data state. */
+ $state = 'data';
+ $this->stream->unget();
+ }
+ break;
+
+ case 'close tag open':
+ $char = $this->stream->char();
+
+ if ( 'A' <= $char && $char <= 'Z' ) {
+ /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
+ Create a new end tag token, set its tag name to the lowercase version
+ of the input character (add 0x0020 to the character's code point), then
+ switch to the tag name state. (Don't emit the token yet; further details
+ will be filled in before it is emitted.) */
+ $this->token = array(
+ 'name' => strtolower( $char ),
+ 'type' => self::ENDTAG
+ );
+ $state = 'tag name';
+
+ } elseif ( 'a' <= $char && $char <= 'z' ) {
+ /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
+ Create a new end tag token, set its tag name to the
+ input character, then switch to the tag name state.
+ (Don't emit the token yet; further details will be
+ filled in before it is emitted.) */
+ $this->token = array(
+ 'name' => $char,
+ 'type' => self::ENDTAG
+ );
+ $state = 'tag name';
+
+ } elseif ( $char === '>' ) {
+ /* U+003E GREATER-THAN SIGN (>)
+ Parse error. Switch to the data state. */
+ // e.g., </>. For MediaWiki, we want to return this
+ $state = 'data';
+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+ $this->element = $this->stream->getSubstr(
+ $this->elementStart,
+ $this->stream->getPos()
+ );
+ return true;
+ }
+
+ } elseif ( $char === false ) {
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Parse error. Switch to the bogus comment state. */
+ $this->token = array(
+ 'data' => $char,
+ 'type' => self::COMMENT
+ );
+ $state = 'bogus comment';
+ }
+ break;
+
+ case 'tag name':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if ( $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Switch to the before attribute name state. */
+ $state = 'before attribute name';
+
+ } elseif ( $char === '/' ) {
+ /* U+002F SOLIDUS (/)
+ Switch to the self-closing start tag state. */
+ $state = 'self-closing start tag';
+
+ } elseif ( $char === '>' ) {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the current tag token. Switch to the data state. */
+ $state = 'data';
+ if ( $this->checkEntireElementMatching( $matchingEntireElement) ) {
+ $this->element = $this->stream->getSubstr(
+ $this->elementStart,
+ $this->stream->getPos()
+ );
+ return true;
+ }
+
+ } elseif ( 'A' <= $char && $char <= 'Z' ) {
+ /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
+ Append the lowercase version of the current input
+ character (add 0x0020 to the character's code point) to
+ the current tag token's tag name. Stay in the tag name state. */
+ $chars = $this->stream->charsWhile( self::UPPER_ALPHA );
+ $this->token['name'] .= strtolower($char . $chars);
+ $state = 'tag name';
+
+ } elseif ( $char === false ) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Append the current input character to the current tag token's tag name.
+ Stay in the tag name state. */
+ $chars = $this->stream->charsUntil( "\t\n\x0C />" . self::UPPER_ALPHA );
+ $this->token['name'] .= $char . $chars;
+ $state = 'tag name';
+ }
+
+ if ( $this->token['name'] && in_array( $this->token['name'], $this->entireElements ) ) {
+ $matchingEntireElement = $this->token['name'];
+ }
+ break;
+
+ case 'before attribute name':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if( $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' ) {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Stay in the before attribute name state. */
+ $state = 'before attribute name';
+
+ } elseif ( $char === '/' ) {
+ /* U+002F SOLIDUS (/)
+ Switch to the self-closing start tag state. */
+ $state = 'self-closing start tag';
+
+ } elseif ( $char === '>' ) {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the current tag token. Switch to the data state. */
+ $state = 'data';
+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+ $this->element = $this->stream->getSubstr(
+ $this->elementStart,
+ $this->stream->getPos()
+ );
+ return true;
+ }
+
+ } elseif ('A' <= $char && $char <= 'Z' ) {
+ /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
+ Start a new attribute in the current tag token. Set that
+ attribute's name to the lowercase version of the current
+ input character (add 0x0020 to the character's code
+ point), and its value to the empty string. Switch to the
+ attribute name state.*/
+ $this->token['attr'][] = array(
+ 'name' => strtolower( $char ),
+ 'value' => ''
+ );
+
+ $state = 'attribute name';
+
+ } elseif ( $char === false ) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* U+0022 QUOTATION MARK (")
+ U+0027 APOSTROPHE (')
+ U+003C LESS-THAN SIGN (<)
+ U+003D EQUALS SIGN (=)
+ Parse error. Treat it as per the "anything else" entry
+ below.
+
+ Anything else
+ Start a new attribute in the current tag token. Set that attribute's
+ name to the current input character, and its value to the empty string.
+ Switch to the attribute name state. */
+ $this->token['attr'][] = array(
+ 'name' => $char,
+ 'value' => ''
+ );
+
+ $state = 'attribute name';
+ }
+ break;
+
+ case 'attribute name':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ // this conditional is optimized, check bottom
+ if ( $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' ) {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Switch to the after attribute name state. */
+ $state = 'after attribute name';
+
+ } elseif ( $char === '/' ) {
+ /* U+002F SOLIDUS (/)
+ Switch to the self-closing start tag state. */
+ $state = 'self-closing start tag';
+
+ } elseif ( $char === '=' ) {
+ /* U+003D EQUALS SIGN (=)
+ Switch to the before attribute value state. */
+ $state = 'before attribute value';
+
+ } elseif ( $char === '>' ) {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the current tag token. Switch to the data state. */
+ $state = 'data';
+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+ $this->element = $this->stream->getSubstr(
+ $this->elementStart,
+ $this->stream->getPos()
+ );
+ return true;
+ }
+
+ } elseif ( 'A' <= $char && $char <= 'Z' ) {
+ /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
+ Append the lowercase version of the current input
+ character (add 0x0020 to the character's code point) to
+ the current attribute's name. Stay in the attribute name
+ state. */
+ $chars = $this->stream->charsWhile( self::UPPER_ALPHA );
+ $last = count( $this->token['attr'] ) - 1;
+ $this->token['attr'][$last]['name'] .= strtolower( $char . $chars );
+
+ $state = 'attribute name';
+
+ } elseif ( $char === false ) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* U+0022 QUOTATION MARK (")
+ U+0027 APOSTROPHE (')
+ U+003C LESS-THAN SIGN (<)
+ Parse error. Treat it as per the "anything else"
+ entry below.
+
+ Anything else
+ Append the current input character to the current attribute's name.
+ Stay in the attribute name state. */
+ $chars = $this->stream->charsUntil( "\t\n\x0C /=>\"'" . self::UPPER_ALPHA );
+ $last = count( $this->token['attr'] ) - 1;
+ $this->token['attr'][$last]['name'] .= $char . $chars;
+
+ $state = 'attribute name';
+ }
+
+ break;
+
+ case 'after attribute name':
+ // Consume the next input character:
+ $char = $this->stream->char();
+
+ // this is an optimized conditional, check the bottom
+ if ( $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' ) {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Stay in the after attribute name state. */
+ $state = 'after attribute name';
+
+ } elseif ( $char === '/' ) {
+ /* U+002F SOLIDUS (/)
+ Switch to the self-closing start tag state. */
+ $state = 'self-closing start tag';
+
+ } elseif ( $char === '=' ) {
+ /* U+003D EQUALS SIGN (=)
+ Switch to the before attribute value state. */
+ $state = 'before attribute value';
+
+ } elseif ( $char === '>' ) {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the current tag token. Switch to the data state. */
+ $state = 'data';
+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+ $this->element = $this->stream->getSubstr(
+ $this->elementStart,
+ $this->stream->getPos()
+ );
+ return true;
+ }
+
+ } elseif ( 'A' <= $char && $char <= 'Z' ) {
+ /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
+ Start a new attribute in the current tag token. Set that
+ attribute's name to the lowercase version of the current
+ input character (add 0x0020 to the character's code
+ point), and its value to the empty string. Switch to the
+ attribute name state. */
+ $this->token['attr'][] = array(
+ 'name' => strtolower( $char ),
+ 'value' => ''
+ );
+ $state = 'attribute name';
+
+ } elseif ( $char === false ) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* U+0022 QUOTATION MARK (")
+ U+0027 APOSTROPHE (')
+ U+003C LESS-THAN SIGN(<)
+ Parse error. Treat it as per the "anything else"
+ entry below.
+
+ Anything else
+ Start a new attribute in the current tag token. Set that attribute's
+ name to the current input character, and its value to the empty string.
+ Switch to the attribute name state. */
+ $this->token['attr'][] = array(
+ 'name' => $char,
+ 'value' => ''
+ );
+
+ $state = 'attribute name';
+ }
+ break;
+
+ case 'before attribute value':
+ // Consume the next input character:
+ $char = $this->stream->char();
+
+ // this is an optimized conditional
+ if( $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' ) {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Stay in the before attribute value state. */
+ $state = 'before attribute value';
+
+ } elseif ( $char === '"' ) {
+ /* U+0022 QUOTATION MARK (")
+ Switch to the attribute value (double-quoted) state. */
+ $state = 'attribute value (double-quoted)';
+
+ } elseif ( $char === '&' ) {
+ /* U+0026 AMPERSAND (&)
+ Switch to the attribute value (unquoted) state and reconsume
+ this input character. */
+ $this->stream->unget();
+ $state = 'attribute value (unquoted)';
+
+ } elseif ($char === '\'' ) {
+ /* U+0027 APOSTROPHE (')
+ Switch to the attribute value (single-quoted) state. */
+ $state = 'attribute value (single-quoted)';
+
+ } elseif ( $char === '>' ) {
+ /* U+003E GREATER-THAN SIGN (>)
+ Parse error. Emit the current tag token. Switch to the data state. */
+ $state = 'data';
+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+ $this->element = $this->stream->getSubstr(
+ $this->elementStart,
+ $this->stream->getPos()
+ );
+ return true;
+ }
+
+ } elseif ( $char === false ) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* U+003D EQUALS SIGN (=)
+ * U+003C LESS-THAN SIGN (<)
+ Parse error. Treat it as per the "anything else" entry below.
+
+ Anything else
+ Append the current input character to the current attribute's value.
+ Switch to the attribute value (unquoted) state. */
+ $last = count( $this->token['attr'] ) - 1;
+ $this->token['attr'][$last]['value'] .= $char;
+ $state = 'attribute value (unquoted)';
+ }
+ break;
+
+ case 'attribute value (double-quoted)':
+ // Consume the next input character:
+ $char = $this->stream->char();
+
+ if( $char === '"' ) {
+ /* U+0022 QUOTATION MARK (")
+ Switch to the after attribute value (quoted) state. */
+ $state = 'after attribute value (quoted)';
+
+ } elseif ( $char === false ) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Append the current input character to the current attribute's value.
+ Stay in the attribute value (double-quoted) state. */
+ $chars = $this->stream->charsUntil( '"' );
+ $last = count( $this->token['attr'] ) - 1;
+ $this->token['attr'][$last]['value'] .= $char . $chars;
+ $state = 'attribute value (double-quoted)';
+ }
+ break;
+
+ case 'attribute value (single-quoted)':
+ // Consume the next input character:
+ $char = $this->stream->char();
+
+ if( $char === "'" ) {
+ /* U+0027 APOSTROPHE (')
+ Switch to the after attribute value state. */
+ $state = 'after attribute value (quoted)';
+
+ } elseif ( $char === false ) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Append the current input character to the current attribute's value.
+ Stay in the attribute value (single-quoted) state. */
+ $chars = $this->stream->charsUntil( "'" );
+ $last = count( $this->token['attr'] ) - 1;
+ $this->token['attr'][$last]['value'] .= $char . $chars;
+ $state = 'attribute value (single-quoted)';
+ }
+ break;
+
+ case 'attribute value (unquoted)':
+ // Consume the next input character:
+ $char = $this->stream->char();
+
+ if( $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' ) {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Switch to the before attribute name state. */
+ $state = 'before attribute name';
+
+ } elseif ( $char === '>' ) {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the current tag token. Switch to the data state. */
+ $state = 'data';
+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+ $this->element = $this->stream->getSubstr(
+ $this->elementStart,
+ $this->stream->getPos()
+ );
+ return true;
+ }
+
+ } elseif ($char === false) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->stream->unget();
+ $state = 'data';
+ } else {
+ /* U+0022 QUOTATION MARK (")
+ U+0027 APOSTROPHE (')
+ U+003C LESS-THAN SIGN (<)
+ U+003D EQUALS SIGN (=)
+ Parse error. Treat it as per the "anything else"
+ entry below.
+
+ Anything else
+ Append the current input character to the current attribute's value.
+ Stay in the attribute value (unquoted) state. */
+ $chars = $this->stream->charsUntil( "\t\n\x0c &>\"'=" );
+
+ $last = count( $this->token['attr'] ) - 1;
+ $this->token['attr'][$last]['value'] .= $char . $chars;
+
+ $state = 'attribute value (unquoted)';
+ }
+ break;
+
+ case 'after attribute value (quoted)':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if ( $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' ) {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Switch to the before attribute name state. */
+ $state = 'before attribute name';
+
+ } elseif ( $char === '/' ) {
+ /* U+002F SOLIDUS (/)
+ Switch to the self-closing start tag state. */
+ $state = 'self-closing start tag';
+
+ } elseif ( $char === '>' ) {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the current tag token. Switch to the data state. */
+ $state = 'data';
+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+ $this->element = $this->stream->getSubstr(
+ $this->elementStart,
+ $this->stream->getPos()
+ );
+ return true;
+ }
+
+ } elseif ( $char === false ) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Parse error. Reconsume the character in the before attribute
+ name state. */
+ $this->stream->unget();
+ $state = 'before attribute name';
+ }
+ break;
+
+ case 'self-closing start tag':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if ( $char === '>' ) {
+ /* U+003E GREATER-THAN SIGN (>)
+ Set the self-closing flag of the current tag token.
+ Emit the current tag token. Switch to the data state. */
+ // not sure if this is the name we want
+ $this->token['self-closing'] = true;
+ $state = 'data';
+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+ $this->element = $this->stream->getSubstr(
+ $this->elementStart,
+ $this->stream->getPos()
+ );
+ return true;
+ }
+
+ } elseif ( $char === false ) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Parse error. Reconsume the character in the before attribute name state. */
+ $this->stream->unget();
+ $state = 'before attribute name';
+ }
+ break;
+
+ case 'bogus comment':
+ /* Consume every character up to the first U+003E GREATER-THAN SIGN
+ character (>) or the end of the file (EOF), whichever comes first. Emit
+ a comment token whose data is the concatenation of all the characters
+ starting from and including the character that caused the state machine
+ to switch into the bogus comment state, up to and including the last
+ consumed character before the U+003E character, if any, or up to the
+ end of the file otherwise. (If the comment was started by the end of
+ the file (EOF), the token is empty.) */
+ $this->token['data'] .= (string) $this->stream->charsUntil( '>' );
+ $this->stream->char();
+
+ /* Switch to the data state. */
+ $state = 'data';
+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+ $this->element = $this->stream->getSubstr(
+ $this->elementStart,
+ $this->stream->getPos()
+ );
+ return true;
+ }
+ break;
+
+ case 'markup declaration open':
+ // Consume for below
+ $hyphens = $this->stream->charsWhile( '-', 2 );
+ if ( $hyphens === '-' ) {
+ $this->stream->unget();
+ }
+ if ( $hyphens !== '--' ) {
+ $alpha = $this->stream->charsWhile( self::ALPHA, 7 );
+ }
+
+ /* If the next two characters are both U+002D HYPHEN-MINUS (-)
+ characters, consume those two characters, create a comment token whose
+ data is the empty string, and switch to the comment state. */
+ if ( $hyphens === '--' ) {
+ $state = 'comment start';
+ $this->token = array(
+ 'data' => '',
+ 'type' => self::COMMENT
+ );
+
+ /* Otherwise if the next seven characters are a case-insensitive match
+ for the word "DOCTYPE", then consume those characters and switch to the
+ DOCTYPE state. */
+ } elseif ( strtoupper( $alpha ) === 'DOCTYPE' ) {
+ # $state = 'DOCTYPE';
+ // For MediaWiki, we're simplifying and saying DOCTYPE
+ // is just another self-closing tag
+ $state = 'attribute name';
+ $this->token = array(
+ 'name' => '!DOCTYPE',
+ 'data' => '',
+ 'type' => self::STARTTAG,
+ 'self-closing' => true,
+ );
+
+ // XXX not implemented
+ /* Otherwise, if the insertion mode is "in foreign content"
+ and the current node is not an element in the HTML namespace
+ and the next seven characters are an ASCII case-sensitive
+ match for the string "[CDATA[" (the five uppercase letters
+ "CDATA" with a U+005B LEFT SQUARE BRACKET character before
+ and after), then consume those characters and switch to the
+ CDATA section state (which is unrelated to the content model
+ flag's CDATA state).
+
+ Otherwise, this is a parse error. Switch to the bogus comment state.
+ The next character that is consumed, if any, is the first character
+ that will be in the comment. */
+ } else {
+ $this->token = array(
+ 'data' => (string) $alpha,
+ 'type' => self::COMMENT
+ );
+ $state = 'bogus comment';
+ }
+ break;
+
+ case 'comment start':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if ( $char === '-' ) {
+ /* U+002D HYPHEN-MINUS (-)
+ Switch to the comment start dash state. */
+ $state = 'comment start dash';
+ } elseif ( $char === '>' ) {
+ /* U+003E GREATER-THAN SIGN (>)
+ Parse error. Emit the comment token. Switch to the
+ data state. */
+ // E.g., <!-->. For MediaWiki we should return this
+ $state = 'data';
+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+ $this->element = $this->stream->getSubstr(
+ $this->elementStart,
+ $this->stream->getPos()
+ );
+ return true;
+ }
+
+ } elseif ( $char === false ) {
+ /* EOF
+ Parse error. Emit the comment token. Reconsume the
+ EOF character in the data state. */
+ $this->stream->unget();
+ $state = 'data';
+ } else {
+ /* Anything else
+ Append the input character to the comment token's
+ data. Switch to the comment state. */
+ $this->token['data'] .= $char;
+ $state = 'comment';
+ }
+ break;
+
+ case 'comment start dash':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+ if ( $char === '-' ) {
+ /* U+002D HYPHEN-MINUS (-)
+ Switch to the comment end state */
+ $state = 'comment end';
+ } elseif ( $char === '>' ) {
+ /* U+003E GREATER-THAN SIGN (>)
+ Parse error. Emit the comment token. Switch to the
+ data state. */
+ // E.g., <!--->. For MediaWiki, we return this
+ $state = 'data';
+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+ $this->element = $this->stream->getSubstr(
+ $this->elementStart,
+ $this->stream->getPos()
+ );
+ return true;
+ }
+
+ } elseif ( $char === false ) {
+ /* Parse error. Emit the comment token. Reconsume the
+ EOF character in the data state. */
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ $this->token['data'] .= '-' . $char;
+ $state = 'comment';
+ }
+ break;
+
+ case 'comment':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if( $char === '-' ) {
+ /* U+002D HYPHEN-MINUS (-)
+ Switch to the comment end dash state */
+ $state = 'comment end dash';
+
+ } elseif ( $char === false ) {
+ /* EOF
+ Parse error. Emit the comment token. Reconsume the EOF character
+ in the data state. */
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Append the input character to the comment token's data. Stay in
+ the comment state. */
+ $chars = $this->stream->charsUntil( '-' );
+ $this->token['data'] .= $char . $chars;
+ }
+ break;
+
+ case 'comment end dash':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if( $char === '-' ) {
+ /* U+002D HYPHEN-MINUS (-)
+ Switch to the comment end state */
+ $state = 'comment end';
+
+ } elseif( $char === false ) {
+ /* EOF
+ Parse error. Emit the comment token. Reconsume the EOF character
+ in the data state. */
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Append a U+002D HYPHEN-MINUS (-) character and the input
+ character to the comment token's data. Switch to the comment state. */
+ $this->token['data'] .= '-' . $char;
+ $state = 'comment';
+ }
+ break;
+
+ case 'comment end':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if( $char === '>' ) {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the comment token. Switch to the data state. */
+ $state = 'data';
+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+ $this->element = $this->stream->getSubstr(
+ $this->elementStart,
+ $this->stream->getPos()
+ );
+ return true;
+ }
+
+ } elseif ( $char === '-' ) {
+ /* U+002D HYPHEN-MINUS (-)
+ Parse error. Append a U+002D HYPHEN-MINUS (-) character
+ to the comment token's data. Stay in the comment end
+ state. */
+ $this->token['data'] .= '-';
+
+ } elseif ( $char === "\t" || $char === "\n" || $char === "\x0a" || $char === ' ' ) {
+ $this->token['data'] .= '--' . $char;
+ $state = 'comment end space';
+
+ } elseif ( $char === '!' ) {
+
+ $state = 'comment end bang';
+
+ } elseif ( $char === false ) {
+ /* EOF
+ Parse error. Emit the comment token. Reconsume the
+ EOF character in the data state. */
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Parse error. Append two U+002D HYPHEN-MINUS (-)
+ characters and the input character to the comment token's
+ data. Switch to the comment state. */
+
+ $this->token['data'] .= '--' . $char;
+ $state = 'comment';
+ }
+ break;
+
+ case 'comment end bang':
+ $char = $this->stream->char();
+ if ( $char === '>' ) {
+ $state = 'data';
+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+ $this->element = $this->stream->getSubstr(
+ $this->elementStart,
+ $this->stream->getPos()
+ );
+ return true;
+ }
+ } elseif ( $char === "-" ) {
+ $this->token['data'] .= '--!';
+ $state = 'comment end dash';
+ } elseif ( $char === false ) {
+ $this->stream->unget();
+ $state = 'data';
+ } else {
+ $this->token['data'] .= '--!' . $char;
+ $state = 'comment';
+ }
+ break;
+
+ case 'comment end space':
+ $char = $this->stream->char();
+ if ( $char === '>' ) {
+ $state = 'data';
+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+ $this->element = $this->stream->getSubstr(
+ $this->elementStart,
+ $this->stream->getPos()
+ );
+ return true;
+ }
+ } elseif ( $char === '-' ) {
+ $state = 'comment end dash';
+ } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ $this->token['data'] .= $char;
+ } elseif ($char === false) {
+ $this->stream->unget();
+ $state = 'data';
+ } else {
+ $this->token['data'] .= $char;
+ $state = 'comment';
+ }
+ break;
+
+ // case 'cdataSection':
+
+ }
+ }
+ }
+
+}
+
+
+
+/*
+
+Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+*/
+
+class HTML5_InputStream { // Byte-wise read cursor over a string
+ /**
+ * The input string being read, as normalized by the constructor.
+ */
+ private $data;
+
+ /**
+ * The current integer byte position we are at in $data (the next byte char() returns).
+ */
+ private $char;
+
+ /**
+ * Length of $data in bytes; once $char reaches $EOF, we are at the end-of-file.
+ */
+ private $EOF;
+
+ /**
+ * Parse errors. No method of this class writes to this array.
+ */
+ public $errors = array();
+
+ /**
+ * @param $data Data to parse; a normalized copy is stored and the position starts at 0
+ */
+ public function __construct( $data ) {
+
+ /* One leading U+FEFF BYTE ORDER MARK character must be
+ ignored if any are present. */
+ if ( substr( $data, 0, 3 ) === "\xEF\xBB\xBF" ) {
+ $data = substr( $data, 3 );
+ }
+
+ /* All U+0000 NULL characters in the input must be replaced
+ by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
+ characters is a parse error.
+
+ U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED
+ (LF) characters are treated specially. Any CR characters
+ that are followed by LF characters must be removed, and any
+ CR characters not followed by LF characters must be converted
+ to LF characters. Thus, newlines in HTML DOMs are represented
+ by LF characters, and there are never any CR characters in the
+ input to the tokenization stage. */
+ $data = str_replace(
+ array(
+ "\0",
+ "\r\n", // listed before the lone "\r" so that a CR LF pair becomes a single LF
+ "\r"
+ ),
+ array(
+ "\xEF\xBF\xBD", // UTF-8 encoding of U+FFFD
+ "\n",
+ "\n"
+ ),
+ $data
+ );
+
+ $this->data = $data;
+ $this->char = 0;
+ $this->EOF = strlen( $data );
+ }
+
+
+ /**
+ * Consume the next byte and return it, or return false when at the end-of-file.
+ * @note The position is incremented even at end-of-file, so it can pass the length
+ */
+ public function char() {
+ return ( $this->char++ < $this->EOF )
+ ? $this->data[$this->char - 1]
+ : false;
+ }
+
+ /**
+ * Consumes bytes up to (not including) the first byte that occurs in $bytes
+ * and returns the consumed substring, or false when already at end-of-file.
+ * @param $bytes Bytes to stop at. $max, when given, is passed to strcspn() as its length.
+ */
+ public function charsUntil( $bytes, $max = null ) {
+ if ( $this->char < $this->EOF ) {
+ if ( $max === 0 || $max ) { // an explicit limit was given (0 included)
+ $len = strcspn( $this->data, $bytes, $this->char, $max );
+ } else {
+ $len = strcspn( $this->data, $bytes, $this->char );
+ }
+ $string = (string) substr( $this->data, $this->char, $len );
+ $this->char += $len;
+ return $string;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * Consumes bytes for as long as they occur in $bytes
+ * and returns the consumed substring, or false when already at end-of-file.
+ * @param $bytes Bytes to match. $max, when given, is passed to strspn() as its length.
+ */
+ public function charsWhile( $bytes, $max = null ) {
+ if ( $this->char < $this->EOF ) {
+ if ( $max === 0 || $max ) { // an explicit limit was given (0 included)
+ $len = strspn( $this->data, $bytes, $this->char, $max );
+ } else {
+ $len = strspn( $this->data, $bytes, $this->char );
+ }
+ $string = (string) substr( $this->data, $this->char, $len );
+ $this->char += $len;
+ return $string;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * Unconsume one character; does nothing once the position has moved past the length.
+ */
+ public function unget() {
+ if ( $this->char <= $this->EOF ) {
+ $this->char--;
+ }
+ }
+
+ /**
+ * Get the current pointer (byte position in the data)
+ * @return int
+ */
+ public function getPos() {
+ return $this->char;
+ }
+
+ /**
+ * Set the current pointer; the value is stored as given, without bounds checking
+ * @param int $ndx pointer into the data
+ */
+ public function setPos( $ndx ) {
+ $this->char = $ndx;
+ }
+
+ /**
+ * Get the substring of the data from $start up to (not including) $end; throws if $end < $start.
+ * @param int $start
+ * @param int $end
+ * @return string
+ */
+ public function getSubstr( $start, $end ) {
+ if ( $end < $start ) {
+ throw new Exception( 'End was before start?' );
+ }
+ $length = $end - $start;
+ return substr( $this->data, $start, $length );
+ }
+
+}
diff --git a/languages/LanguageConverter.php b/languages/LanguageConverter.php
index eae77fb..0c7809f 100644
--- a/languages/LanguageConverter.php
+++ b/languages/LanguageConverter.php
@@ -353,55 +353,66 @@ class LanguageConverter {
return $text;
}
- /* we convert everything except:
+ /* Do the conversion. We convert everything except:
1. HTML markups (anything between < and >)
2. HTML entities
3. placeholders created by the parser
*/
+
+ // Get regex for parser placeholders
global $wgParser;
+ $marker = false;
if ( isset( $wgParser ) && $wgParser->UniqPrefix() != '' ) {
- $marker = '|' . $wgParser->UniqPrefix() . '[\-a-zA-Z0-9]+';
- } else {
- $marker = '';
+ $marker = '/' . $wgParser->UniqPrefix() . '[\-a-zA-Z0-9]+/s';
}
- // this one is needed when the text is inside an HTML markup
- $htmlfix = '|<[^>]+$|^[^<>]*>';
-
- // disable convert to variants between <code> tags
- $codefix = '<code>.+?<\/code>|';
- // disable conversion of <script> tags
- $scriptfix = '<script.*?>.*?<\/script>|';
- // disable conversion of <pre> tags
- $prefix = '<pre.*?>.*?<\/pre>|';
+ // Guard against delimiter nulls in the input
+ $text = str_replace( "\000", '', $text );
- $reg = '/' . $codefix . $scriptfix . $prefix .
- '<[^>]+>|&[a-zA-Z#][a-z0-9]+;' . $marker . $htmlfix . '/s';
$startPos = 0;
$sourceBlob = '';
$literalBlob = '';
-
- // Guard against delimiter nulls in the input
- $text = str_replace( "\000", '', $text );
-
$markupMatches = null;
$elementMatches = null;
- while ( $startPos < strlen( $text ) ) {
- if ( preg_match( $reg, $text, $markupMatches, PREG_OFFSET_CAPTURE, $startPos ) ) {
- $elementPos = $markupMatches[0][1];
- $element = $markupMatches[0][0];
+
+ $htmlTokenizer = new Html5Tokenizer( $text, array( 'code', 'script', 'pre', 'style' ) );
+ // Match /^[^<>]*>/ in case the $text we're working on starts with the end of
+ // a tag. We set this to false after the first match.
+ $htmlTokenizer->setFlagCloseOnly( true );
+ // Match /<[^>]+$/ in case the $text stops part way through a tag
+ $htmlTokenizer->setFlagOpenOnly( true );
+
+ do {
+ $sourceEnd = strlen( $text );
+ $hasHtml = $htmlTokenizer->parse();
+
+ if ( $hasHtml ) {
+ $elementPos = $htmlTokenizer->elementStart;
+ $element = $htmlTokenizer->element;
+ $htmlTokenizer->setFlagCloseOnly( false );
} else {
$elementPos = strlen( $text );
$element = '';
}
- // Queue the part before the markup for translation in a batch
- $sourceBlob .= substr( $text, $startPos, $elementPos - $startPos ) . "\000";
-
+ // Check for parser markers in the non-html chunk
+ list( $sources, $literals ) = $this->tokenizeParserMarkers(
+ substr( $text, $startPos, $elementPos - $startPos ),
+ $marker
+ );
+ foreach ( $sources as $s ) {
+ $sourceBlob .= "$s\000";
+ }
+ foreach ( $literals as $l ) {
+ $literalBlob .= "$l\000";
+ }
// Advance to the next position
$startPos = $elementPos + strlen( $element );
// Translate any alt or title attributes inside the matched element
+ // TODO: $htmlTokenizer already has the attributes parsed, so it would be
+ // more efficient to pull them from $htmlTokenizer->token. But let's do that
+ // in a public patch.
if ( $element !== ''
&& preg_match( '/^(<[^>\s]*)\s([^>]*)(.*)$/', $element, $elementMatches )
) {
@@ -430,7 +441,7 @@ class LanguageConverter {
}
}
$literalBlob .= $element . "\000";
- }
+ } while ( $hasHtml );
// Do the main translation batch
$translatedBlob = $this->translate( $sourceBlob, $toVariant );
@@ -451,6 +462,30 @@ class LanguageConverter {
}
/**
+ * Split $text around each match of $regex; a falsy $regex leaves $text in one piece.
+ * @return array exactly two arrays: the text chunks to convert, then the matches to leave as-is.
+ */
+ private function tokenizeParserMarkers( $text, $regex ) {
+ if ( !$regex ) {
+ // No marker pattern: the whole text is a source chunk and there are no literals
+ return array( array( $text ), array() );
+ }
+ $sourceBlobs = array();
+ $literalBlobs = array();
+ $lastEnd = 0; // byte offset in $text just past the previous match
+ if ( preg_match_all( $regex, $text, $markupMatches, PREG_OFFSET_CAPTURE ) ) {
+ foreach ( $markupMatches[0] as $match ) {
+ list( $string, $offset ) = $match; // matched text and its byte offset in $text
+ $sourceBlobs[] = substr( $text, $lastEnd, $offset - $lastEnd ); // text before this match
+ $literalBlobs[] = $string;
+ $lastEnd = $offset + strlen( $string );
+ }
+ }
+ $sourceBlobs[] = substr( $text, $lastEnd ); // whatever follows the last match
+ return array( $sourceBlobs, $literalBlobs );
+ }
+
+ /**
* Translate a string to a variant.
* Doesn't parse rules or do any of that other stuff, for that use
* convert() or convertTo().
diff --git a/tests/phpunit/includes/Html5TokenizerTest.php b/tests/phpunit/includes/Html5TokenizerTest.php
new file mode 100644
index 0000000..7aea2df
--- /dev/null
+++ b/tests/phpunit/includes/Html5TokenizerTest.php
@@ -0,0 +1,132 @@
+<?php
+
+/**
+ * @group medium
+ */
+class Html5TokenizerTest extends MediaWikiTestCase {
+ // Feeds HTML fragments to Html5Tokenizer and checks which elements parse() reports.
+ /**
+ * @dataProvider getHtmlFragments
+ */
+ public function testTransform( $input, $expectedFinds, $msg ) {
+ $result = array();
+ $ht = new Html5Tokenizer( $input, array( 'pre', 'code', 'style', 'script' ) );
+ $ht->setFlagCloseOnly( true );
+ $ht->setFlagOpenOnly( true );
+ do { // collect $ht->element after every successful parse() until it reports no match
+ $r = $ht->parse();
+ if ( $r ) {
+ $result[] = $ht->element;
+ }
+ } while ( $r );
+
+ $this->assertArrayEquals( $expectedFinds, $result, $msg ); // NOTE(review): confirm the 3rd argument of assertArrayEquals() is a message
+ }
+ // Data provider: each case is array( input, expected elements, assertion message )
+ public function getHtmlFragments() {
+ return array(
+ array(
+ "<div bar=asdf>asdf</div>",
+ array( '<div bar=asdf>', '</div>' ),
+ 'Basic unquoted attrs'
+ ),
+ array(
+ "<div bar='asdf' baz=\"123\">asdf</div>",
+ array( "<div bar='asdf' baz=\"123\">", '</div>' ),
+ 'Basic quoted attrs'
+ ),
+ array(
+ "<div b'ar='asdf' b\"az=\"123\">asdf</div>",
+ array( "<div b'ar='asdf' b\"az=\"123\">", '</div>' ),
+ 'Quoted attrs with quote in attr name'
+ ),
+ array(
+ "<div bar='as>df' baz=\"123\">asdf</div>",
+ array( "<div bar='as>df' baz=\"123\">", '</div>' ),
+ 'Quoted attr containing >'
+ ),
+ array(
+ "<!--div bar='as>df' baz=\"123\">asdf</div -->",
+ array( "<!--div bar='as>df' baz=\"123\">asdf</div -->" ),
+ 'Commented elements'
+ ),
+ array(
+ "<div bar='as>>></>>>>df\' b'az='1<23'>asdf</div>",
+ array( "<div bar='as>>></>>>>df\' b'az='1<23'>", '</div>' ),
+ 'Quoted attr containing <'
+ ),
+ array(
+ "<div bar=foo>1</></div>",
+ array( '<div bar=foo>', '</>', '</div>' ),
+ 'Immediately closed end tag'
+ ),
+ array(
+ '<div "=foo>1</div>',
+ array( '<div "=foo>', '</div>' ),
+ 'Attr name is double quote'
+ ),
+ array(
+ "start<div \"=foo",
+ array( '<div "=foo' ),
+ 'Unclosed element'
+ ),
+ array(
+ 'a div "=foo>end',
+ array( 'a div "=foo>' ),
+ 'Unopened element close'
+ ),
+ array(
+ '<pre>a div "=foo></pre>end',
+ array( '<pre>a div "=foo></pre>' ),
+ 'Match entire element'
+ ),
+ array(
+ '<pre id=123 id="asdf">a div "=foo></pre>end',
+ array( '<pre id=123 id="asdf">a div "=foo></pre>' ),
+ 'Match entire element, with attributes'
+ ),
+ array(
+ '<pre id=123 id="asdf"></prea div>< "=foo></pre>end',
+ array( '<pre id=123 id="asdf"></prea div>< "=foo></pre>' ),
+ 'Check entire element matching close logic'
+ ),
+ array(
+ '<pre> <? bogus comment></prea div>< "=foo></pre>end',
+ array( '<pre> <? bogus comment></prea div>< "=foo></pre>' ),
+ 'Check entire element matching, don\'t break on bogus comment'
+ ),
+ array(
+ '<pre> something <!-- <bar "=foo></pre> --!> asdf</pre>end',
+ array( '<pre> something <!-- <bar "=foo></pre> --!> asdf</pre>' ),
+ 'Check entire element matching, don\'t break on end tag inside comment'
+ ),
+ array(
+ '<pre> something <!-- <bar "=foo></pre> asdf</pre>end',
+ array( '<pre> something <!-- <bar "=foo></pre> asdf</pre>end' ),
+ 'Check entire element matching, don\'t break on end tag inside comment that doesn\'t end'
+ ),
+ array(
+ 'start<pre> something <br/> asdf</pre>end',
+ array( '<pre> something <br/> asdf</pre>' ),
+ 'Check entire element matching, don\'t break on self-closing tags'
+ ),
+ array(
+ 'start<pre/> something <br/> asdf</pre>end',
+ array( '<pre/>', '<br/>', '</pre>' ),
+ 'Check entire element matching, return if tag is self closing'
+ ),
+ array(
+ 'start<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Stric>t//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">end',
+ array( '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Stric>', 't//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' ),
+ 'Check DOCTYPE matching'
+ ),
+ array(
+ 'start<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"',
+ array( '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"' ),
+ 'Check unclosed DOCTYPE matching'
+ ),
+
+ );
+ }
+
+}
--
1.8.4.5

File Metadata

Mime Type
text/x-diff
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
14200
Default Alt Text
bug71394-tokenizer.patch (53 KB)

Event Timeline