bug71394-tokenizer.patch
acl*security
Actions

Authored By

	• bzimport
	Nov 22 2014, 3:46 AM

Size

53 KB

Referenced Files

None

Subscribers

None

bug71394-tokenizer.patch
View Options

	From b2bf756c1483c0dc65110bdd1b63710e9cdd6edc Mon Sep 17 00:00:00 2001
	From: csteipp <csteipp@wikimedia.org>
	Date: Wed, 5 Nov 2014 15:42:20 -0800
	Subject: [PATCH] SECURITY: Properly remove html from conversion text

	When converting a text to a variant, html should not be converted. This
	patch parses the text as html5, and protects any html from translation.

	Change-Id: I268fdb9be3c9f7f020aab3a0200db6b7a0beddaa
	---
	includes/AutoLoader.php \| 1 +
	includes/Html5Tokenizer.php \| 1364 +++++++++++++++++++++++++
	languages/LanguageConverter.php \| 89 +-
	tests/phpunit/includes/Html5TokenizerTest.php \| 132 +++
	4 files changed, 1559 insertions(+), 27 deletions(-)
	create mode 100644 includes/Html5Tokenizer.php
	create mode 100644 tests/phpunit/includes/Html5TokenizerTest.php

	diff --git a/includes/AutoLoader.php b/includes/AutoLoader.php
	index 172bd49..460730a 100644
	--- a/includes/AutoLoader.php
	+++ b/includes/AutoLoader.php
	@@ -82,6 +82,7 @@ $wgAutoloadLocalClasses = array(
	'HistoryBlobStub' => 'includes/HistoryBlob.php',
	'Hooks' => 'includes/Hooks.php',
	'Html' => 'includes/Html.php',
	+ 'Html5Tokenizer' => 'includes/Html5Tokenizer.php',
	'HtmlFormatter' => 'includes/HtmlFormatter.php',
	'HTMLApiField' => 'includes/htmlform/HTMLApiField.php',
	'HTMLAutoCompleteSelectField' => 'includes/htmlform/HTMLAutoCompleteSelectField.php',
	diff --git a/includes/Html5Tokenizer.php b/includes/Html5Tokenizer.php
	new file mode 100644
	index 0000000..e551c44
	--- /dev/null
	+++ b/includes/Html5Tokenizer.php
	@@ -0,0 +1,1364 @@
	+<?php
	+/**
	+ * HTML Tokenizer for %MediaWiki. Parses a string according to the html 5
	+ * tokenizing spec [http://dev.w3.org/html5/spec-preview/tokenization.html],
	+ * except for when we could optimize for this specific MediaWiki task.
	+ *
	+ * This is based heavily on the html5lib-php project
	+ * (https://github.com/html5lib/html5lib-php), licensed as:
	+ *
	+ * Copyright 2007 Jeroen van der Meer <http://jero.net/>
	+ * Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/>
	+ * Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
	+ *
	+ * Permission is hereby granted, free of charge, to any person obtaining a
	+ * copy of this software and associated documentation files (the
	+ * "Software"), to deal in the Software without restriction, including
	+ * without limitation the rights to use, copy, modify, merge, publish,
	+ * distribute, sublicense, and/or sell copies of the Software, and to
	+ * permit persons to whom the Software is furnished to do so, subject to
	+ * the following conditions:
	+ *
	+ * The above copyright notice and this permission notice shall be included
	+ * in all copies or substantial portions of the Software.
	+ *
	+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
	+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
	+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
	+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
	+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
	+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
	+ *
	+ * @file
	+ */
	+
	+// In general:
	+// /* */ indicates verbatim text from the HTML 5 specification
	+// // indicates regular comments
	+
	+class Html5Tokenizer {
	+
	+ /**
	+ * HTML5_InputStream the stream we parse to find each chunk of html
	+ */
	+ private $stream;
	+
	+ /**
	+ * index into the data where the next html chunk starts
	+ */
	+ public $elementStart;
	+
	+ /**
	+ * The next html chunk
	+ */
	+ public $element;
	+
	+ /**
	+ * Array of elements where we return the entire
	+ * content. Usually array( 'pre', 'code', 'style', 'script' );
	+ */
	+ private $entireElements;
	+
	+ /**
	+ * Do we return /^[^<]*>/ as an html token?
	+ */
	+ private $flagCloseOnly;
	+
	+ /**
	+ * Do we return /<[^>]*$/ as an html token?
	+ */
	+ private $flagOpenOnly;
	+
	+ /**
	+ * Current token that is being built, but not yet emitted. Also
	+ * is the last token emitted, if applicable.
	+ */
	+ protected $token;
	+
	+ // These are constants describing tokens
	+ const DOCTYPE = 0;
	+ const STARTTAG = 1;
	+ const ENDTAG = 2;
	+ const COMMENT = 3;
	+ const CHARACTER = 4;
	+ const SPACECHARACTER = 5;
	+ const EOF = 6;
	+ const PARSEERROR = 7;
	+
	+ // These are constants representing bunches of characters.
	+ const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
	+ const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
	+ const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
	+ const DIGIT = '0123456789';
	+ const HEX = '0123456789ABCDEFabcdef';
	+ const WHITESPACE = "\t\n\x0c ";
	+
	+ /**
	+ * Set up a tokenizer over $data. Both partial-tag flags start out
	+ * disabled; use setFlagCloseOnly() / setFlagOpenOnly() to enable them.
	+ *
	+ * @param string $data the html string to tokenize
	+ * @param array $entireElements names of elements whose entire contents
	+ * should be returned as a single chunk
	+ */
	+ public function __construct( $data, array $entireElements = array() ) {
	+ $this->flagOpenOnly = false;
	+ $this->flagCloseOnly = false;
	+ $this->entireElements = $entireElements;
	+ $this->stream = new HTML5_InputStream( $data );
	+ }
	+
	+ /**
	+ * Enable or disable returning a leading /^[^<]*>/ fragment (text up to
	+ * a '>' with no preceding '<') as an html token.
	+ *
	+ * @param bool $flag whether to match ^[^<]*>
	+ */
	+ public function setFlagCloseOnly( $flag ) {
	+ $this->flagCloseOnly = $flag;
	+ }
	+
	+ /**
	+ * Enable or disable returning a trailing /<[^>]*$/ fragment (a '<' that
	+ * is never closed before the end of the string) as an html token.
	+ *
	+ * @param bool $flag whether to match <[^>]*$
	+ */
	+ public function setFlagOpenOnly( $flag ) {
	+ $this->flagOpenOnly = $flag;
	+ }
	+
	+ /**
	+ * Get the token currently being built, which is also the last token
	+ * emitted, if applicable.
	+ *
	+ * @return array|null the token, or null if no token has been started yet
	+ */
	+ public function getLastToken() {
	+ return $this->token;
	+ }
	+
	+
	+ /**
	+ * Decide whether the token just completed ends the current html chunk.
	+ *
	+ * When no entire-element match is in progress, any completed token ends
	+ * the chunk. Otherwise only an end tag, or a self-closing tag, whose name
	+ * equals the element being matched does.
	+ *
	+ * @param string|bool $matchingEntireElement name of the element whose
	+ * whole body is being collected, or false when there is none
	+ * @return bool true if the chunk is complete
	+ */
	+ public function checkEntireElementMatching( $matchingEntireElement ) {
	+ if ( !$matchingEntireElement ) {
	+ return true;
	+ }
	+
	+ $isEndTag = $this->token['type'] === self::ENDTAG;
	+ if ( !$isEndTag && empty( $this->token['self-closing'] ) ) {
	+ return false;
	+ }
	+
	+ return $this->token['name'] === $matchingEntireElement;
	+ }
	+
	+ /**
	+ * Performs the actual parsing of the document. Each call will return
	+ * the next chunk of html in the string. We only handle PCDATA content model.
	+ *
	+ * Access the html chunk and its offset in the string via the public $element
	+ * and $elementStart members of this class.
	+ *
	+ * @return bool true if we identified a chunk of html in the remaining string
	+ */
	+ public function parse() {
	+ // Current state
	+ $state = 'data';
	+
	+ // This is used to avoid having to have look-behind in the data state.
	+ $lastFourChars = '';
	+
	+ /**
	+ * Escape flag as specified by the HTML5 specification: "used to
	+ * control the behavior of the tokeniser. It is either true or
	+ * false, and initially must be set to the false state."
	+ */
	+ $escape = false;
	+
	+ // Have we started marking an html tag to return?
	+ $haveElement = false;
	+
	+ // Are we matching the entire body of a specific element?
	+ $matchingEntireElement = false;
	+
	+ // In case we need to handle flagCloseOnly
	+ $this->elementStart = $this->stream->getPos();
	+
	+ $this->element = null;
	+
	+ while( $state !== null ) {
	+
	+ switch($state) {
	+
	+ case 'data':
	+ /* Consume the next input character */
	+ $char = $this->stream->char();
	+ $lastFourChars .= $char;
	+ if ( strlen( $lastFourChars ) > 4 ) {
	+ $lastFourChars = substr( $lastFourChars, -4 );
	+ }
	+
	+ /* U+003C LESS-THAN SIGN (<) */
	+ if ( $char === '<' ) {
	+ /* When the content model flag is set to the PCDATA state: switch
	+ to the tag open state. */
	+ $state = 'tag open';
	+ if ( !$matchingEntireElement ) {
	+ $this->elementStart = $this->stream->getPos() - 1;
	+ $haveElement = true;
	+ }
	+
	+ } elseif ( $char === '>'
	+ && !$haveElement
	+ && $this->flagCloseOnly
	+ ) {
	+ // For MediaWiki, an unopened closing tag could mean
	+ // the beginning of this fragment is in an element context
	+ if ( !$matchingEntireElement ) {
	+ $this->element = $this->stream->getSubstr(
	+ $this->elementStart,
	+ $this->stream->getPos()
	+ );
	+ return true;
	+ }
	+
	+ } elseif ( $char === false ) {
	+ /* EOF : Emit an end-of-file token. */
	+ $state = null;
	+ if ( $haveElement && $this->flagOpenOnly
	+ \|\| $matchingEntireElement
	+ ) {
	+ $this->element = $this->stream->getSubstr(
	+ $this->elementStart,
	+ $this->stream->getPos()
	+ );
	+ return true;
	+ }
	+
	+ return false;
	+
	+ } elseif ( $char === "\t" \|\| $char === "\n" \|\| $char === "\x0c" \|\| $char === ' ' ) {
	+ // Directly after emitting a token you switch back to the "data
	+ // state". At that point spaceCharacters are important so they are
	+ // emitted separately.
	+ $chars = $this->stream->charsWhile( self::WHITESPACE );
	+ $lastFourChars .= $chars;
	+ if ( strlen( $lastFourChars ) > 4 ) {
	+ $lastFourChars = substr( $lastFourChars, -4 );
	+ }
	+
	+ } else {
	+ /* Anything else
	+ THIS IS AN OPTIMIZATION: Get as many character that
	+ otherwise would also be treated as a character token and emit it
	+ as a single character token. Stay in the data state. */
	+ $chars = $this->stream->charsUntil( '<>' );
	+ $lastFourChars .= $chars;
	+ if ( strlen( $lastFourChars ) > 4 ) {
	+ $lastFourChars = substr( $lastFourChars, -4 );
	+ }
	+ $state = 'data';
	+ }
	+ break;
	+
	+ case 'tag open':
	+ $char = $this->stream->char();
	+
	+ if ( $char === '!' ) {
	+ /* U+0021 EXCLAMATION MARK (!)
	+ Switch to the markup declaration open state. */
	+ $state = 'markup declaration open';
	+
	+ } elseif ( $char === '/' ) {
	+ /* U+002F SOLIDUS (/)
	+ Switch to the close tag open state. */
	+ $state = 'close tag open';
	+
	+ } elseif( 'A' <= $char && $char <= 'Z' ) {
	+ /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
	+ Create a new start tag token, set its tag name to the lowercase
	+ version of the input character (add 0x0020 to the character's code
	+ point), then switch to the tag name state. (Don't emit the token
	+ yet; further details will be filled in before it is emitted.) */
	+ $this->token = array(
	+ 'name' => strtolower( $char ),
	+ 'type' => self::STARTTAG,
	+ 'attr' => array()
	+ );
	+
	+ $state = 'tag name';
	+
	+ } elseif( 'a' <= $char && $char <= 'z' ) {
	+ /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
	+ Create a new start tag token, set its tag name to the input
	+ character, then switch to the tag name state. (Don't emit
	+ the token yet; further details will be filled in before it
	+ is emitted.) */
	+ $this->token = array(
	+ 'name' => $char,
	+ 'type' => self::STARTTAG,
	+ 'attr' => array()
	+ );
	+
	+ $state = 'tag name';
	+
	+ } elseif ( $char === '>' ) {
	+ /* U+003E GREATER-THAN SIGN (>)
	+ Parse error. Emit a U+003C LESS-THAN SIGN character token and a
	+ U+003E GREATER-THAN SIGN character token. Switch to the data state. */
	+ // For MediaWiki, we don't care about returning "<>"
	+ $state = 'data';
	+
	+ } elseif ( $char === '?' ) {
	+ /* U+003F QUESTION MARK (?)
	+ Parse error. Switch to the bogus comment state. */
	+ $this->token = array(
	+ 'data' => '?',
	+ 'type' => self::COMMENT
	+ );
	+ $state = 'bogus comment';
	+
	+ } else {
	+ /* Anything else
	+ Parse error. Emit a U+003C LESS-THAN SIGN character token and
	+ reconsume the current input character in the data state. */
	+ $state = 'data';
	+ $this->stream->unget();
	+ }
	+ break;
	+
	+ case 'close tag open':
	+ $char = $this->stream->char();
	+
	+ if ( 'A' <= $char && $char <= 'Z' ) {
	+ /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
	+ Create a new end tag token, set its tag name to the lowercase version
	+ of the input character (add 0x0020 to the character's code point), then
	+ switch to the tag name state. (Don't emit the token yet; further details
	+ will be filled in before it is emitted.) */
	+ $this->token = array(
	+ 'name' => strtolower( $char ),
	+ 'type' => self::ENDTAG
	+ );
	+ $state = 'tag name';
	+
	+ } elseif ( 'a' <= $char && $char <= 'z' ) {
	+ /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
	+ Create a new end tag token, set its tag name to the
	+ input character, then switch to the tag name state.
	+ (Don't emit the token yet; further details will be
	+ filled in before it is emitted.) */
	+ $this->token = array(
	+ 'name' => $char,
	+ 'type' => self::ENDTAG
	+ );
	+ $state = 'tag name';
	+
	+ } elseif ( $char === '>' ) {
	+ /* U+003E GREATER-THAN SIGN (>)
	+ Parse error. Switch to the data state. */
	+ // e.g., </>. For MediaWiki, we want to return this
	+ $state = 'data';
	+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
	+ $this->element = $this->stream->getSubstr(
	+ $this->elementStart,
	+ $this->stream->getPos()
	+ );
	+ return true;
	+ }
	+
	+ } elseif ( $char === false ) {
	+ $this->stream->unget();
	+ $state = 'data';
	+
	+ } else {
	+ /* Parse error. Switch to the bogus comment state. */
	+ $this->token = array(
	+ 'data' => $char,
	+ 'type' => self::COMMENT
	+ );
	+ $state = 'bogus comment';
	+ }
	+ break;
	+
	+ case 'tag name':
	+ /* Consume the next input character: */
	+ $char = $this->stream->char();
	+
	+ if ( $char === "\t" \|\| $char === "\n" \|\| $char === "\x0c" \|\| $char === ' ') {
	+ /* U+0009 CHARACTER TABULATION
	+ U+000A LINE FEED (LF)
	+ U+000C FORM FEED (FF)
	+ U+0020 SPACE
	+ Switch to the before attribute name state. */
	+ $state = 'before attribute name';
	+
	+ } elseif ( $char === '/' ) {
	+ /* U+002F SOLIDUS (/)
	+ Switch to the self-closing start tag state. */
	+ $state = 'self-closing start tag';
	+
	+ } elseif ( $char === '>' ) {
	+ /* U+003E GREATER-THAN SIGN (>)
	+ Emit the current tag token. Switch to the data state. */
	+ $state = 'data';
	+ if ( $this->checkEntireElementMatching( $matchingEntireElement) ) {
	+ $this->element = $this->stream->getSubstr(
	+ $this->elementStart,
	+ $this->stream->getPos()
	+ );
	+ return true;
	+ }
	+
	+ } elseif ( 'A' <= $char && $char <= 'Z' ) {
	+ /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
	+ Append the lowercase version of the current input
	+ character (add 0x0020 to the character's code point) to
	+ the current tag token's tag name. Stay in the tag name state. */
	+ $chars = $this->stream->charsWhile( self::UPPER_ALPHA );
	+ $this->token['name'] .= strtolower($char . $chars);
	+ $state = 'tag name';
	+
	+ } elseif ( $char === false ) {
	+ /* EOF
	+ Parse error. Reconsume the EOF character in the data state. */
	+ $this->stream->unget();
	+ $state = 'data';
	+
	+ } else {
	+ /* Anything else
	+ Append the current input character to the current tag token's tag name.
	+ Stay in the tag name state. */
	+ $chars = $this->stream->charsUntil( "\t\n\x0C />" . self::UPPER_ALPHA );
	+ $this->token['name'] .= $char . $chars;
	+ $state = 'tag name';
	+ }
	+
	+ if ( $this->token['name'] && in_array( $this->token['name'], $this->entireElements ) ) {
	+ $matchingEntireElement = $this->token['name'];
	+ }
	+ break;
	+
	+ case 'before attribute name':
	+ /* Consume the next input character: */
	+ $char = $this->stream->char();
	+
	+ if( $char === "\t" \|\| $char === "\n" \|\| $char === "\x0c" \|\| $char === ' ' ) {
	+ /* U+0009 CHARACTER TABULATION
	+ U+000A LINE FEED (LF)
	+ U+000C FORM FEED (FF)
	+ U+0020 SPACE
	+ Stay in the before attribute name state. */
	+ $state = 'before attribute name';
	+
	+ } elseif ( $char === '/' ) {
	+ /* U+002F SOLIDUS (/)
	+ Switch to the self-closing start tag state. */
	+ $state = 'self-closing start tag';
	+
	+ } elseif ( $char === '>' ) {
	+ /* U+003E GREATER-THAN SIGN (>)
	+ Emit the current tag token. Switch to the data state. */
	+ $state = 'data';
	+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
	+ $this->element = $this->stream->getSubstr(
	+ $this->elementStart,
	+ $this->stream->getPos()
	+ );
	+ return true;
	+ }
	+
	+ } elseif ('A' <= $char && $char <= 'Z' ) {
	+ /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
	+ Start a new attribute in the current tag token. Set that
	+ attribute's name to the lowercase version of the current
	+ input character (add 0x0020 to the character's code
	+ point), and its value to the empty string. Switch to the
	+ attribute name state.*/
	+ $this->token['attr'][] = array(
	+ 'name' => strtolower( $char ),
	+ 'value' => ''
	+ );
	+
	+ $state = 'attribute name';
	+
	+ } elseif ( $char === false ) {
	+ /* EOF
	+ Parse error. Reconsume the EOF character in the data state. */
	+ $this->stream->unget();
	+ $state = 'data';
	+
	+ } else {
	+ /* U+0022 QUOTATION MARK (")
	+ U+0027 APOSTROPHE (')
	+ U+003C LESS-THAN SIGN (<)
	+ U+003D EQUALS SIGN (=)
	+ Parse error. Treat it as per the "anything else" entry
	+ below.
	+
	+ Anything else
	+ Start a new attribute in the current tag token. Set that attribute's
	+ name to the current input character, and its value to the empty string.
	+ Switch to the attribute name state. */
	+ $this->token['attr'][] = array(
	+ 'name' => $char,
	+ 'value' => ''
	+ );
	+
	+ $state = 'attribute name';
	+ }
	+ break;
	+
	+ case 'attribute name':
	+ /* Consume the next input character: */
	+ $char = $this->stream->char();
	+
	+ // this conditional is optimized, check bottom
	+ if ( $char === "\t" \|\| $char === "\n" \|\| $char === "\x0c" \|\| $char === ' ' ) {
	+ /* U+0009 CHARACTER TABULATION
	+ U+000A LINE FEED (LF)
	+ U+000C FORM FEED (FF)
	+ U+0020 SPACE
	+ Switch to the after attribute name state. */
	+ $state = 'after attribute name';
	+
	+ } elseif ( $char === '/' ) {
	+ /* U+002F SOLIDUS (/)
	+ Switch to the self-closing start tag state. */
	+ $state = 'self-closing start tag';
	+
	+ } elseif ( $char === '=' ) {
	+ /* U+003D EQUALS SIGN (=)
	+ Switch to the before attribute value state. */
	+ $state = 'before attribute value';
	+
	+ } elseif ( $char === '>' ) {
	+ /* U+003E GREATER-THAN SIGN (>)
	+ Emit the current tag token. Switch to the data state. */
	+ $state = 'data';
	+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
	+ $this->element = $this->stream->getSubstr(
	+ $this->elementStart,
	+ $this->stream->getPos()
	+ );
	+ return true;
	+ }
	+
	+ } elseif ( 'A' <= $char && $char <= 'Z' ) {
	+ /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
	+ Append the lowercase version of the current input
	+ character (add 0x0020 to the character's code point) to
	+ the current attribute's name. Stay in the attribute name
	+ state. */
	+ $chars = $this->stream->charsWhile( self::UPPER_ALPHA );
	+ $last = count( $this->token['attr'] ) - 1;
	+ $this->token['attr'][$last]['name'] .= strtolower( $char . $chars );
	+
	+ $state = 'attribute name';
	+
	+ } elseif ( $char === false ) {
	+ /* EOF
	+ Parse error. Reconsume the EOF character in the data state. */
	+ $this->stream->unget();
	+ $state = 'data';
	+
	+ } else {
	+ /* U+0022 QUOTATION MARK (")
	+ U+0027 APOSTROPHE (')
	+ U+003C LESS-THAN SIGN (<)
	+ Parse error. Treat it as per the "anything else"
	+ entry below.
	+
	+ Anything else
	+ Append the current input character to the current attribute's name.
	+ Stay in the attribute name state. */
	+ $chars = $this->stream->charsUntil( "\t\n\x0C /=>\"'" . self::UPPER_ALPHA );
	+ $last = count( $this->token['attr'] ) - 1;
	+ $this->token['attr'][$last]['name'] .= $char . $chars;
	+
	+ $state = 'attribute name';
	+ }
	+
	+ break;
	+
	+ case 'after attribute name':
	+ // Consume the next input character:
	+ $char = $this->stream->char();
	+
	+ // this is an optimized conditional, check the bottom
	+ if ( $char === "\t" \|\| $char === "\n" \|\| $char === "\x0c" \|\| $char === ' ' ) {
	+ /* U+0009 CHARACTER TABULATION
	+ U+000A LINE FEED (LF)
	+ U+000C FORM FEED (FF)
	+ U+0020 SPACE
	+ Stay in the after attribute name state. */
	+ $state = 'after attribute name';
	+
	+ } elseif ( $char === '/' ) {
	+ /* U+002F SOLIDUS (/)
	+ Switch to the self-closing start tag state. */
	+ $state = 'self-closing start tag';
	+
	+ } elseif ( $char === '=' ) {
	+ /* U+003D EQUALS SIGN (=)
	+ Switch to the before attribute value state. */
	+ $state = 'before attribute value';
	+
	+ } elseif ( $char === '>' ) {
	+ /* U+003E GREATER-THAN SIGN (>)
	+ Emit the current tag token. Switch to the data state. */
	+ $state = 'data';
	+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
	+ $this->element = $this->stream->getSubstr(
	+ $this->elementStart,
	+ $this->stream->getPos()
	+ );
	+ return true;
	+ }
	+
	+ } elseif ( 'A' <= $char && $char <= 'Z' ) {
	+ /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
	+ Start a new attribute in the current tag token. Set that
	+ attribute's name to the lowercase version of the current
	+ input character (add 0x0020 to the character's code
	+ point), and its value to the empty string. Switch to the
	+ attribute name state. */
	+ $this->token['attr'][] = array(
	+ 'name' => strtolower( $char ),
	+ 'value' => ''
	+ );
	+ $state = 'attribute name';
	+
	+ } elseif ( $char === false ) {
	+ /* EOF
	+ Parse error. Reconsume the EOF character in the data state. */
	+ $this->stream->unget();
	+ $state = 'data';
	+
	+ } else {
	+ /* U+0022 QUOTATION MARK (")
	+ U+0027 APOSTROPHE (')
	+ U+003C LESS-THAN SIGN(<)
	+ Parse error. Treat it as per the "anything else"
	+ entry below.
	+
	+ Anything else
	+ Start a new attribute in the current tag token. Set that attribute's
	+ name to the current input character, and its value to the empty string.
	+ Switch to the attribute name state. */
	+ $this->token['attr'][] = array(
	+ 'name' => $char,
	+ 'value' => ''
	+ );
	+
	+ $state = 'attribute name';
	+ }
	+ break;
	+
	+ case 'before attribute value':
	+ // Consume the next input character:
	+ $char = $this->stream->char();
	+
	+ // this is an optimized conditional
	+ if( $char === "\t" \|\| $char === "\n" \|\| $char === "\x0c" \|\| $char === ' ' ) {
	+ /* U+0009 CHARACTER TABULATION
	+ U+000A LINE FEED (LF)
	+ U+000C FORM FEED (FF)
	+ U+0020 SPACE
	+ Stay in the before attribute value state. */
	+ $state = 'before attribute value';
	+
	+ } elseif ( $char === '"' ) {
	+ /* U+0022 QUOTATION MARK (")
	+ Switch to the attribute value (double-quoted) state. */
	+ $state = 'attribute value (double-quoted)';
	+
	+ } elseif ( $char === '&' ) {
	+ /* U+0026 AMPERSAND (&)
	+ Switch to the attribute value (unquoted) state and reconsume
	+ this input character. */
	+ $this->stream->unget();
	+ $state = 'attribute value (unquoted)';
	+
	+ } elseif ($char === '\'' ) {
	+ /* U+0027 APOSTROPHE (')
	+ Switch to the attribute value (single-quoted) state. */
	+ $state = 'attribute value (single-quoted)';
	+
	+ } elseif ( $char === '>' ) {
	+ /* U+003E GREATER-THAN SIGN (>)
	+ Parse error. Emit the current tag token. Switch to the data state. */
	+ $state = 'data';
	+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
	+ $this->element = $this->stream->getSubstr(
	+ $this->elementStart,
	+ $this->stream->getPos()
	+ );
	+ return true;
	+ }
	+
	+ } elseif ( $char === false ) {
	+ /* EOF
	+ Parse error. Reconsume the EOF character in the data state. */
	+ $this->stream->unget();
	+ $state = 'data';
	+
	+ } else {
	+ /* U+003D EQUALS SIGN (=)
	+ * U+003C LESS-THAN SIGN (<)
	+ Parse error. Treat it as per the "anything else" entry below.
	+
	+ Anything else
	+ Append the current input character to the current attribute's value.
	+ Switch to the attribute value (unquoted) state. */
	+ $last = count( $this->token['attr'] ) - 1;
	+ $this->token['attr'][$last]['value'] .= $char;
	+ $state = 'attribute value (unquoted)';
	+ }
	+ break;
	+
	+ case 'attribute value (double-quoted)':
	+ // Consume the next input character:
	+ $char = $this->stream->char();
	+
	+ if( $char === '"' ) {
	+ /* U+0022 QUOTATION MARK (")
	+ Switch to the after attribute value (quoted) state. */
	+ $state = 'after attribute value (quoted)';
	+
	+ } elseif ( $char === false ) {
	+ /* EOF
	+ Parse error. Reconsume the EOF character in the data state. */
	+ $this->stream->unget();
	+ $state = 'data';
	+
	+ } else {
	+ /* Anything else
	+ Append the current input character to the current attribute's value.
	+ Stay in the attribute value (double-quoted) state. */
	+ $chars = $this->stream->charsUntil( '"' );
	+ $last = count( $this->token['attr'] ) - 1;
	+ $this->token['attr'][$last]['value'] .= $char . $chars;
	+ $state = 'attribute value (double-quoted)';
	+ }
	+ break;
	+
	+ case 'attribute value (single-quoted)':
	+ // Consume the next input character:
	+ $char = $this->stream->char();
	+
	+ if( $char === "'" ) {
	+ /* U+0027 APOSTROPHE (')
	+ Switch to the after attribute value state. */
	+ $state = 'after attribute value (quoted)';
	+
	+ } elseif ( $char === false ) {
	+ /* EOF
	+ Parse error. Reconsume the EOF character in the data state. */
	+ $this->stream->unget();
	+ $state = 'data';
	+
	+ } else {
	+ /* Anything else
	+ Append the current input character to the current attribute's value.
	+ Stay in the attribute value (single-quoted) state. */
	+ $chars = $this->stream->charsUntil( "'" );
	+ $last = count( $this->token['attr'] ) - 1;
	+ $this->token['attr'][$last]['value'] .= $char . $chars;
	+ $state = 'attribute value (single-quoted)';
	+ }
	+ break;
	+
	+ case 'attribute value (unquoted)':
	+ // Consume the next input character:
	+ $char = $this->stream->char();
	+
	+ if( $char === "\t" \|\| $char === "\n" \|\| $char === "\x0c" \|\| $char === ' ' ) {
	+ /* U+0009 CHARACTER TABULATION
	+ U+000A LINE FEED (LF)
	+ U+000C FORM FEED (FF)
	+ U+0020 SPACE
	+ Switch to the before attribute name state. */
	+ $state = 'before attribute name';
	+
	+ } elseif ( $char === '>' ) {
	+ /* U+003E GREATER-THAN SIGN (>)
	+ Emit the current tag token. Switch to the data state. */
	+ $state = 'data';
	+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
	+ $this->element = $this->stream->getSubstr(
	+ $this->elementStart,
	+ $this->stream->getPos()
	+ );
	+ return true;
	+ }
	+
	+ } elseif ($char === false) {
	+ /* EOF
	+ Parse error. Reconsume the EOF character in the data state. */
	+ $this->stream->unget();
	+ $state = 'data';
	+ } else {
	+ /* U+0022 QUOTATION MARK (")
	+ U+0027 APOSTROPHE (')
	+ U+003C LESS-THAN SIGN (<)
	+ U+003D EQUALS SIGN (=)
	+ Parse error. Treat it as per the "anything else"
	+ entry below.
	+
	+ Anything else
	+ Append the current input character to the current attribute's value.
	+ Stay in the attribute value (unquoted) state. */
	+ $chars = $this->stream->charsUntil( "\t\n\x0c &>\"'=" );
	+
	+ $last = count( $this->token['attr'] ) - 1;
	+ $this->token['attr'][$last]['value'] .= $char . $chars;
	+
	+ $state = 'attribute value (unquoted)';
	+ }
	+ break;
	+
	+ case 'after attribute value (quoted)':
	+ /* Consume the next input character: */
	+ $char = $this->stream->char();
	+
	+ if ( $char === "\t" \|\| $char === "\n" \|\| $char === "\x0c" \|\| $char === ' ' ) {
	+ /* U+0009 CHARACTER TABULATION
	+ U+000A LINE FEED (LF)
	+ U+000C FORM FEED (FF)
	+ U+0020 SPACE
	+ Switch to the before attribute name state. */
	+ $state = 'before attribute name';
	+
	+ } elseif ( $char === '/' ) {
	+ /* U+002F SOLIDUS (/)
	+ Switch to the self-closing start tag state. */
	+ $state = 'self-closing start tag';
	+
	+ } elseif ( $char === '>' ) {
	+ /* U+003E GREATER-THAN SIGN (>)
	+ Emit the current tag token. Switch to the data state. */
	+ $state = 'data';
	+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
	+ $this->element = $this->stream->getSubstr(
	+ $this->elementStart,
	+ $this->stream->getPos()
	+ );
	+ return true;
	+ }
	+
	+ } elseif ( $char === false ) {
	+ /* EOF
	+ Parse error. Reconsume the EOF character in the data state. */
	+ $this->stream->unget();
	+ $state = 'data';
	+
	+ } else {
	+ /* Anything else
	+ Parse error. Reconsume the character in the before attribute
	+ name state. */
	+ $this->stream->unget();
	+ $state = 'before attribute name';
	+ }
	+ break;
	+
	+ case 'self-closing start tag':
	+ /* Consume the next input character: */
	+ $char = $this->stream->char();
	+
	+ if ( $char === '>' ) {
	+ /* U+003E GREATER-THAN SIGN (>)
	+ Set the self-closing flag of the current tag token.
	+ Emit the current tag token. Switch to the data state. */
	+ // not sure if this is the name we want
	+ $this->token['self-closing'] = true;
	+ $state = 'data';
	+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
	+ $this->element = $this->stream->getSubstr(
	+ $this->elementStart,
	+ $this->stream->getPos()
	+ );
	+ return true;
	+ }
	+
	+ } elseif ( $char === false ) {
	+ /* EOF
	+ Parse error. Reconsume the EOF character in the data state. */
	+ $this->stream->unget();
	+ $state = 'data';
	+
	+ } else {
	+ /* Anything else
	+ Parse error. Reconsume the character in the before attribute name state. */
	+ $this->stream->unget();
	+ $state = 'before attribute name';
	+ }
	+ break;
	+
	+ case 'bogus comment':
	+ /* Consume every character up to the first U+003E GREATER-THAN SIGN
	+ character (>) or the end of the file (EOF), whichever comes first. Emit
	+ a comment token whose data is the concatenation of all the characters
	+ starting from and including the character that caused the state machine
	+ to switch into the bogus comment state, up to and including the last
	+ consumed character before the U+003E character, if any, or up to the
	+ end of the file otherwise. (If the comment was started by the end of
	+ the file (EOF), the token is empty.) */
	+ $this->token['data'] .= (string) $this->stream->charsUntil( '>' );
	+ $this->stream->char();
	+
	+ /* Switch to the data state. */
	+ $state = 'data';
	+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
	+ $this->element = $this->stream->getSubstr(
	+ $this->elementStart,
	+ $this->stream->getPos()
	+ );
	+ return true;
	+ }
	+ break;
	+
	+ case 'markup declaration open':
	+ // Consume for below
	+ $hyphens = $this->stream->charsWhile( '-', 2 );
	+ if ( $hyphens === '-' ) {
	+ $this->stream->unget();
	+ }
	+ if ( $hyphens !== '--' ) {
	+ $alpha = $this->stream->charsWhile( self::ALPHA, 7 );
	+ }
	+
	+ /* If the next two characters are both U+002D HYPHEN-MINUS (-)
	+ characters, consume those two characters, create a comment token whose
	+ data is the empty string, and switch to the comment state. */
	+ if ( $hyphens === '--' ) {
	+ $state = 'comment start';
	+ $this->token = array(
	+ 'data' => '',
	+ 'type' => self::COMMENT
	+ );
	+
	+ /* Otherwise if the next seven characters are a case-insensitive match
	+ for the word "DOCTYPE", then consume those characters and switch to the
	+ DOCTYPE state. */
	+ } elseif ( strtoupper( $alpha ) === 'DOCTYPE' ) {
	+ # $state = 'DOCTYPE';
	+ // For MediaWiki, we're simplifying and saying DOCTYPE
	+ // is just another self-closing tag
	+ $state = 'attribute name';
	+ $this->token = array(
	+ 'name' => '!DOCTYPE',
	+ 'data' => '',
	+ 'type' => self::STARTTAG,
	+ 'self-closing' => true,
	+ );
	+
	+ // XXX not implemented
	+ /* Otherwise, if the insertion mode is "in foreign content"
	+ and the current node is not an element in the HTML namespace
	+ and the next seven characters are an ASCII case-sensitive
	+ match for the string "[CDATA[" (the five uppercase letters
	+ "CDATA" with a U+005B LEFT SQUARE BRACKET character before
	+ and after), then consume those characters and switch to the
	+ CDATA section state (which is unrelated to the content model
	+ flag's CDATA state).
	+
	+ Otherwise, it is a parse error. Switch to the bogus comment state.
	+ The next character that is consumed, if any, is the first character
	+ that will be in the comment. */
	+ } else {
	+ $this->token = array(
	+ 'data' => (string) $alpha,
	+ 'type' => self::COMMENT
	+ );
	+ $state = 'bogus comment';
	+ }
	+ break;
	+
	+ case 'comment start':
	+ /* Consume the next input character: */
	+ $char = $this->stream->char();
	+
	+ if ( $char === '-' ) {
	+ /* U+002D HYPHEN-MINUS (-)
	+ Switch to the comment start dash state. */
	+ $state = 'comment start dash';
	+ } elseif ( $char === '>' ) {
	+ /* U+003E GREATER-THAN SIGN (>)
	+ Parse error. Emit the comment token. Switch to the
	+ data state. */
	+ // E.g., <!-->. For MediaWiki we should return this
	+ $state = 'data';
	+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
	+ $this->element = $this->stream->getSubstr(
	+ $this->elementStart,
	+ $this->stream->getPos()
	+ );
	+ return true;
	+ }
	+
	+ } elseif ( $char === false ) {
	+ /* EOF
	+ Parse error. Emit the comment token. Reconsume the
	+ EOF character in the data state. */
	+ $this->stream->unget();
	+ $state = 'data';
	+ } else {
	+ /* Anything else
	+ Append the input character to the comment token's
	+ data. Switch to the comment state. */
	+ $this->token['data'] .= $char;
	+ $state = 'comment';
	+ }
	+ break;
	+
	+ case 'comment start dash':
	+ /* Consume the next input character: */
	+ $char = $this->stream->char();
	+ if ( $char === '-' ) {
	+ /* U+002D HYPHEN-MINUS (-)
	+ Switch to the comment end state */
	+ $state = 'comment end';
	+ } elseif ( $char === '>' ) {
	+ /* U+003E GREATER-THAN SIGN (>)
	+ Parse error. Emit the comment token. Switch to the
	+ data state. */
	+ // E.g., <!--->. For MediaWiki, we return this
	+ $state = 'data';
	+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
	+ $this->element = $this->stream->getSubstr(
	+ $this->elementStart,
	+ $this->stream->getPos()
	+ );
	+ return true;
	+ }
	+
	+ } elseif ( $char === false ) {
	+ /* Parse error. Emit the comment token. Reconsume the
	+ EOF character in the data state. */
	+ $this->stream->unget();
	+ $state = 'data';
	+
	+ } else {
	+ $this->token['data'] .= '-' . $char;
	+ $state = 'comment';
	+ }
	+ break;
	+
	+ case 'comment':
	+ /* Consume the next input character: */
	+ $char = $this->stream->char();
	+
	+ if( $char === '-' ) {
	+ /* U+002D HYPHEN-MINUS (-)
	+ Switch to the comment end dash state */
	+ $state = 'comment end dash';
	+
	+ } elseif ( $char === false ) {
	+ /* EOF
	+ Parse error. Emit the comment token. Reconsume the EOF character
	+ in the data state. */
	+ $this->stream->unget();
	+ $state = 'data';
	+
	+ } else {
	+ /* Anything else
	+ Append the input character to the comment token's data. Stay in
	+ the comment state. */
	+ $chars = $this->stream->charsUntil( '-' );
	+ $this->token['data'] .= $char . $chars;
	+ }
	+ break;
	+
	+ case 'comment end dash':
	+ /* Consume the next input character: */
	+ $char = $this->stream->char();
	+
	+ if( $char === '-' ) {
	+ /* U+002D HYPHEN-MINUS (-)
	+ Switch to the comment end state */
	+ $state = 'comment end';
	+
	+ } elseif( $char === false ) {
	+ /* EOF
	+ Parse error. Emit the comment token. Reconsume the EOF character
	+ in the data state. */
	+ $this->stream->unget();
	+ $state = 'data';
	+
	+ } else {
	+ /* Anything else
	+ Append a U+002D HYPHEN-MINUS (-) character and the input
	+ character to the comment token's data. Switch to the comment state. */
	+ $this->token['data'] .= '-' . $char;
	+ $state = 'comment';
	+ }
	+ break;
	+
	+ case 'comment end':
	+ /* Consume the next input character: */
	+ $char = $this->stream->char();
	+
	+ if( $char === '>' ) {
	+ /* U+003E GREATER-THAN SIGN (>)
	+ Emit the comment token. Switch to the data state. */
	+ $state = 'data';
	+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
	+ $this->element = $this->stream->getSubstr(
	+ $this->elementStart,
	+ $this->stream->getPos()
	+ );
	+ return true;
	+ }
	+
	+ } elseif ( $char === '-' ) {
	+ /* U+002D HYPHEN-MINUS (-)
	+ Parse error. Append a U+002D HYPHEN-MINUS (-) character
	+ to the comment token's data. Stay in the comment end
	+ state. */
	+ $this->token['data'] .= '-';
	+
	+ } elseif ( $char === "\t" \|\| $char === "\n" \|\| $char === "\x0a" \|\| $char === ' ' ) {
	+ $this->token['data'] .= '--' . $char;
	+ $state = 'comment end space';
	+
	+ } elseif ( $char === '!' ) {
	+
	+ $state = 'comment end bang';
	+
	+ } elseif ( $char === false ) {
	+ /* EOF
	+ Parse error. Emit the comment token. Reconsume the
	+ EOF character in the data state. */
	+ $this->stream->unget();
	+ $state = 'data';
	+
	+ } else {
	+ /* Anything else
	+ Parse error. Append two U+002D HYPHEN-MINUS (-)
	+ characters and the input character to the comment token's
	+ data. Switch to the comment state. */
	+
	+ $this->token['data'] .= '--' . $char;
	+ $state = 'comment';
	+ }
	+ break;
	+
	+ case 'comment end bang':
	+ $char = $this->stream->char();
	+ if ( $char === '>' ) {
	+ $state = 'data';
	+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
	+ $this->element = $this->stream->getSubstr(
	+ $this->elementStart,
	+ $this->stream->getPos()
	+ );
	+ return true;
	+ }
	+ } elseif ( $char === "-" ) {
	+ $this->token['data'] .= '--!';
	+ $state = 'comment end dash';
	+ } elseif ( $char === false ) {
	+ $this->stream->unget();
	+ $state = 'data';
	+ } else {
	+ $this->token['data'] .= '--!' . $char;
	+ $state = 'comment';
	+ }
	+ break;
	+
	+ case 'comment end space':
	+ $char = $this->stream->char();
	+ if ( $char === '>' ) {
	+ $state = 'data';
	+ if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
	+ $this->element = $this->stream->getSubstr(
	+ $this->elementStart,
	+ $this->stream->getPos()
	+ );
	+ return true;
	+ }
	+ } elseif ( $char === '-' ) {
	+ $state = 'comment end dash';
	+ } elseif ($char === "\t" \|\| $char === "\n" \|\| $char === "\x0c" \|\| $char === ' ') {
	+ $this->token['data'] .= $char;
	+ } elseif ($char === false) {
	+ $this->stream->unget();
	+ $state = 'data';
	+ } else {
	+ $this->token['data'] .= $char;
	+ $state = 'comment';
	+ }
	+ break;
	+
	+ // case 'cdataSection':
	+
	+ }
	+ }
	+ }
	+
	+}
	+
	+
	+
	+/*
	+
	+Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
	+
	+Permission is hereby granted, free of charge, to any person obtaining a
	+copy of this software and associated documentation files (the
	+"Software"), to deal in the Software without restriction, including
	+without limitation the rights to use, copy, modify, merge, publish,
	+distribute, sublicense, and/or sell copies of the Software, and to
	+permit persons to whom the Software is furnished to do so, subject to
	+the following conditions:
	+
	+The above copyright notice and this permission notice shall be included
	+in all copies or substantial portions of the Software.
	+
	+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
	+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
	+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
	+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
	+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
	+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
	+
	+*/
	+
	+class HTML5_InputStream {
	+ /**
	+ * The string data we're parsing.
	+ */
	+ private $data;
	+
	+ /**
	+ * The current integer byte position in $data (index of the next byte to consume)
	+ */
	+ private $char;
	+
	+ /**
	+ * Length of $data; when $char >= $EOF, we are at the end-of-file.
	+ */
	+ private $EOF;
	+
	+ /**
	+ * Parse errors.
	+ */
	+ public $errors = array();
	+
	+ /**
	+ * @param string $data Data to parse; a leading UTF-8 BOM is dropped and NUL/CR bytes are normalized
	+ */
	+ public function __construct( $data ) {
	+
	+ /* One leading U+FEFF BYTE ORDER MARK character must be
	+ ignored if any are present. */
	+ if ( substr( $data, 0, 3 ) === "\xEF\xBB\xBF" ) {
	+ $data = substr( $data, 3 );
	+ }
	+
	+ /* All U+0000 NULL characters in the input must be replaced
	+ by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
	+ characters is a parse error.
	+
	+ U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED
	+ (LF) characters are treated specially. Any CR characters
	+ that are followed by LF characters must be removed, and any
	+ CR characters not followed by LF characters must be converted
	+ to LF characters. Thus, newlines in HTML DOMs are represented
	+ by LF characters, and there are never any CR characters in the
	+ input to the tokenization stage. */
	+ $data = str_replace(
	+ array(
	+ "\0",
	+ "\r\n",
	+ "\r"
	+ ),
	+ array(
	+ "\xEF\xBF\xBD",
	+ "\n",
	+ "\n"
	+ ),
	+ $data
	+ );
	+
	+ $this->data = $data;
	+ $this->char = 0;
	+ $this->EOF = strlen( $data );
	+ }
	+
	+
	+ /**
	+ * Consume and return the next byte, or false once end-of-file is reached.
	+ * @note Bounds-checked; the pointer still advances when false is returned
	+ */
	+ public function char() {
	+ return ( $this->char++ < $this->EOF )
	+ ? $this->data[$this->char - 1]
	+ : false;
	+ }
	+
	+ /**
	+ * Consume bytes until one of $bytes is reached (at most $max bytes when
	+ * $max is given) and return the consumed substring, or false at end-of-file.
	+ * @param $bytes Bytes to stop at.
	+ */
	+ public function charsUntil( $bytes, $max = null ) {
	+ if ( $this->char < $this->EOF ) {
	+ if ( $max === 0 \|\| $max ) { // a length limit was given (0 included)
	+ $len = strcspn( $this->data, $bytes, $this->char, $max );
	+ } else {
	+ $len = strcspn( $this->data, $bytes, $this->char );
	+ }
	+ $string = (string) substr( $this->data, $this->char, $len );
	+ $this->char += $len;
	+ return $string;
	+ } else {
	+ return false;
	+ }
	+ }
	+
	+ /**
	+ * Consume bytes for as long as they are in $bytes (at most $max bytes when
	+ * $max is given) and return the consumed substring, or false at end-of-file.
	+ * @param $bytes Bytes to match.
	+ */
	+ public function charsWhile( $bytes, $max = null ) {
	+ if ( $this->char < $this->EOF ) {
	+ if ( $max === 0 \|\| $max ) { // a length limit was given (0 included)
	+ $len = strspn( $this->data, $bytes, $this->char, $max );
	+ } else {
	+ $len = strspn( $this->data, $bytes, $this->char );
	+ }
	+ $string = (string) substr( $this->data, $this->char, $len );
	+ $this->char += $len;
	+ return $string;
	+ } else {
	+ return false;
	+ }
	+ }
	+
	+ /**
	+ * Unconsume one character (no-op once the pointer has moved past end-of-file).
	+ */
	+ public function unget() {
	+ if ( $this->char <= $this->EOF ) {
	+ $this->char--;
	+ }
	+ }
	+
	+ /**
	+ * Get the current pointer
	+ * @return int
	+ */
	+ public function getPos() {
	+ return $this->char;
	+ }
	+
	+ /**
	+ * Set the current pointer (not bounds-checked)
	+ * @param int $ndx pointer into the data
	+ */
	+ public function setPos( $ndx ) {
	+ $this->char = $ndx;
	+ }
	+
	+ /**
	+ * Get the bytes of the data in [$start, $end); throws Exception if $end < $start.
	+ * @param int $start byte offset of the first byte to return
	+ * @param int $end byte offset one past the last byte to return
	+ * @return string
	+ */
	+ public function getSubstr( $start, $end ) {
	+ if ( $end < $start ) {
	+ throw new Exception( 'End was before start?' );
	+ }
	+ $length = $end - $start;
	+ return substr( $this->data, $start, $length );
	+ }
	+
	+}
	diff --git a/languages/LanguageConverter.php b/languages/LanguageConverter.php
	index eae77fb..0c7809f 100644
	--- a/languages/LanguageConverter.php
	+++ b/languages/LanguageConverter.php
	@@ -353,55 +353,66 @@ class LanguageConverter {
	return $text;
	}

	- /* we convert everything except:
	+ /* Do the conversion. We convert everything except:
	1. HTML markups (anything between < and >)
	2. HTML entities
	3. placeholders created by the parser
	*/
	+
	+ // Get regex for parser placeholders
	global $wgParser;
	+ $marker = false;
	if ( isset( $wgParser ) && $wgParser->UniqPrefix() != '' ) {
	- $marker = '\|' . $wgParser->UniqPrefix() . '[\-a-zA-Z0-9]+';
	- } else {
	- $marker = '';
	+ $marker = '/' . $wgParser->UniqPrefix() . '[\-a-zA-Z0-9]+/s';
	}

	- // this one is needed when the text is inside an HTML markup
	- $htmlfix = '\|<[^>]+$\|^[^<>]*>';
	-
	- // disable convert to variants between <code> tags
	- $codefix = '<code>.+?<\/code>\|';
	- // disable conversion of <script> tags
	- $scriptfix = '<script.?>.?<\/script>\|';
	- // disable conversion of <pre> tags
	- $prefix = '<pre.?>.?<\/pre>\|';
	+ // Guard against delimiter nulls in the input
	+ $text = str_replace( "\000", '', $text );

	- $reg = '/' . $codefix . $scriptfix . $prefix .
	- '<[^>]+>\|&[a-zA-Z#][a-z0-9]+;' . $marker . $htmlfix . '/s';
	$startPos = 0;
	$sourceBlob = '';
	$literalBlob = '';
	-
	- // Guard against delimiter nulls in the input
	- $text = str_replace( "\000", '', $text );
	-
	$markupMatches = null;
	$elementMatches = null;
	- while ( $startPos < strlen( $text ) ) {
	- if ( preg_match( $reg, $text, $markupMatches, PREG_OFFSET_CAPTURE, $startPos ) ) {
	- $elementPos = $markupMatches[0][1];
	- $element = $markupMatches[0][0];
	+
	+ $htmlTokenizer = new Html5Tokenizer( $text, array( 'code', 'script', 'pre', 'style' ) );
	+ // Match /^[^<>]*>/ in case the $text we're working on starts with the end of
	+ // a tag. We set this to false after the first match.
	+ $htmlTokenizer->setFlagCloseOnly( true );
	+ // Match /<[^>]+$/ in case the $text stops part way through a tag
	+ $htmlTokenizer->setFlagOpenOnly( true );
	+
	+ do {
	+ $sourceEnd = strlen( $text );
	+ $hasHtml = $htmlTokenizer->parse();
	+
	+ if ( $hasHtml ) {
	+ $elementPos = $htmlTokenizer->elementStart;
	+ $element = $htmlTokenizer->element;
	+ $htmlTokenizer->setFlagCloseOnly( false );
	} else {
	$elementPos = strlen( $text );
	$element = '';
	}

	- // Queue the part before the markup for translation in a batch
	- $sourceBlob .= substr( $text, $startPos, $elementPos - $startPos ) . "\000";
	-
	+ // Check for parser markers in the non-html chunk
	+ list( $sources, $literals ) = $this->tokenizeParserMarkers(
	+ substr( $text, $startPos, $elementPos - $startPos ),
	+ $marker
	+ );
	+ foreach ( $sources as $s ) {
	+ $sourceBlob .= "$s\000";
	+ }
	+ foreach ( $literals as $l ) {
	+ $literalBlob .= "$l\000";
	+ }
	// Advance to the next position
	$startPos = $elementPos + strlen( $element );

	// Translate any alt or title attributes inside the matched element
	+ // TODO: $htmlTokenizer already has the attributes parsed, so it would be
	+ // more efficient to pull them from $htmlTokenizer->token. But let's do that
	+ // in a public patch.
	if ( $element !== ''
	&& preg_match( '/^(<[^>\s])\s([^>])(.*)$/', $element, $elementMatches )
	) {
	@@ -430,7 +441,7 @@ class LanguageConverter {
	}
	}
	$literalBlob .= $element . "\000";
	- }
	+ } while ( $hasHtml );

	// Do the main translation batch
	$translatedBlob = $this->translate( $sourceBlob, $toVariant );
	@@ -451,6 +462,30 @@ class LanguageConverter {
	}

	/**
	+ * @return array exactly two arrays, the first of sources that should have
	+ * language conversion applied, the second should not.
	+ */
	+ private function tokenizeParserMarkers( $text, $regex ) {
	+ if ( !$regex ) {
	+ // No marker regex was supplied: the whole text is convertible source
	+ return array( array( $text ), array() );
	+ }
	+ $sourceBlobs = array();
	+ $literalBlobs = array();
	+ $lastEnd = 0; // byte offset just past the previous marker match
	+ if ( preg_match_all( $regex, $text, $markupMatches, PREG_OFFSET_CAPTURE ) ) {
	+ foreach ( $markupMatches[0] as $match ) {
	+ list( $string, $offset ) = $match;
	+ $sourceBlobs[] = substr( $text, $lastEnd, $offset - $lastEnd ); // text before this marker
	+ $literalBlobs[] = $string; // the marker itself goes to the non-converted list
	+ $lastEnd = $offset + strlen( $string );
	+ }
	+ }
	+ $sourceBlobs[] = substr( $text, $lastEnd ); // trailing text, so sources has one more entry than literals
	+ return array( $sourceBlobs, $literalBlobs );
	+ }
	+
	+ /**
	* Translate a string to a variant.
	* Doesn't parse rules or do any of that other stuff, for that use
	* convert() or convertTo().
	diff --git a/tests/phpunit/includes/Html5TokenizerTest.php b/tests/phpunit/includes/Html5TokenizerTest.php
	new file mode 100644
	index 0000000..7aea2df
	--- /dev/null
	+++ b/tests/phpunit/includes/Html5TokenizerTest.php
	@@ -0,0 +1,132 @@
	+<?php
	+
	+/**
	+ * @group medium
	+ */
	+class Html5TokenizerTest extends MediaWikiTestCase {
	+
	+ /**
	+ * @dataProvider getHtmlFragments
	+ */
	+ public function testTransform( $input, $expectedFinds, $msg ) {
	+ $result = array();
	+ $ht = new Html5Tokenizer( $input, array( 'pre', 'code', 'style', 'script' ) );
	+ $ht->setFlagCloseOnly( true ); // also report a leading fragment that only closes a tag, e.g. 'a div "=foo>'
	+ $ht->setFlagOpenOnly( true ); // also report a trailing tag that never closes, e.g. '<div "=foo'
	+ do { // collect every element the tokenizer reports until parse() returns a falsy value
	+ $r = $ht->parse();
	+ if ( $r ) {
	+ $result[] = $ht->element;
	+ }
	+ } while ( $r );
	+
	+ $this->assertArrayEquals( $expectedFinds, $result, $msg );
	+ }
	+
	+ public function getHtmlFragments() { // each case: input, expected elements, assertion message
	+ return array(
	+ array(
	+ "<div bar=asdf>asdf</div>",
	+ array( '<div bar=asdf>', '</div>' ),
	+ 'Basic unquoted attrs'
	+ ),
	+ array(
	+ "<div bar='asdf' baz=\"123\">asdf</div>",
	+ array( "<div bar='asdf' baz=\"123\">", '</div>' ),
	+ 'Basic quoted attrs'
	+ ),
	+ array(
	+ "<div b'ar='asdf' b\"az=\"123\">asdf</div>",
	+ array( "<div b'ar='asdf' b\"az=\"123\">", '</div>' ),
	+ 'Quoted attrs with quote in attr name'
	+ ),
	+ array(
	+ "<div bar='as>df' baz=\"123\">asdf</div>",
	+ array( "<div bar='as>df' baz=\"123\">", '</div>' ),
	+ 'Quoted attr containing >'
	+ ),
	+ array(
	+ "<!--div bar='as>df' baz=\"123\">asdf</div -->",
	+ array( "<!--div bar='as>df' baz=\"123\">asdf</div -->" ),
	+ 'Commented elements'
	+ ),
	+ array(
	+ "<div bar='as>>></>>>>df\' b'az='1<23'>asdf</div>",
	+ array( "<div bar='as>>></>>>>df\' b'az='1<23'>", '</div>' ),
	+ 'Quoted attr containing <'
	+ ),
	+ array(
	+ "<div bar=foo>1</></div>",
	+ array( '<div bar=foo>', '</>', '</div>' ),
	+ 'Immediately closed end tag'
	+ ),
	+ array(
	+ '<div "=foo>1</div>',
	+ array( '<div "=foo>', '</div>' ),
	+ 'Attr name is double quote'
	+ ),
	+ array(
	+ "start<div \"=foo",
	+ array( '<div "=foo' ),
	+ 'Unclosed element'
	+ ),
	+ array(
	+ 'a div "=foo>end',
	+ array( 'a div "=foo>' ),
	+ 'Unopened element close'
	+ ),
	+ array(
	+ '<pre>a div "=foo></pre>end',
	+ array( '<pre>a div "=foo></pre>' ),
	+ 'Match entire element'
	+ ),
	+ array(
	+ '<pre id=123 id="asdf">a div "=foo></pre>end',
	+ array( '<pre id=123 id="asdf">a div "=foo></pre>' ),
	+ 'Match entire element, with attributes'
	+ ),
	+ array(
	+ '<pre id=123 id="asdf"></prea div>< "=foo></pre>end',
	+ array( '<pre id=123 id="asdf"></prea div>< "=foo></pre>' ),
	+ 'Check entire element matching close logic'
	+ ),
	+ array(
	+ '<pre> <? bogus comment></prea div>< "=foo></pre>end',
	+ array( '<pre> <? bogus comment></prea div>< "=foo></pre>' ),
	+ 'Check entire element matching, don\'t break on bogus comment'
	+ ),
	+ array(
	+ '<pre> something <!-- <bar "=foo></pre> --!> asdf</pre>end',
	+ array( '<pre> something <!-- <bar "=foo></pre> --!> asdf</pre>' ),
	+ 'Check entire element matching, don\'t break on end tag inside comment'
	+ ),
	+ array(
	+ '<pre> something <!-- <bar "=foo></pre> asdf</pre>end',
	+ array( '<pre> something <!-- <bar "=foo></pre> asdf</pre>end' ),
	+ 'Check entire element matching, don\'t break on end tag inside comment that doesn\'t end'
	+ ),
	+ array(
	+ 'start<pre> something <br/> asdf</pre>end',
	+ array( '<pre> something <br/> asdf</pre>' ),
	+ 'Check entire element matching, don\'t break on self-closing tags'
	+ ),
	+ array(
	+ 'start<pre/> something <br/> asdf</pre>end',
	+ array( '<pre/>', '<br/>', '</pre>' ),
	+ 'Check entire element matching, return if tag is self closing'
	+ ),
	+ array(
	+ 'start<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Stric>t//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">end',
	+ array( '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Stric>', 't//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' ),
	+ 'Check DOCTYPE matching'
	+ ),
	+ array(
	+ 'start<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"',
	+ array( '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"' ),
	+ 'Check unclosed DOCTYPE matching'
	+ ),
	+
	+ );
	+ }
	+
	+}
	--
	1.8.4.5

File Metadata

Mime Type: text/x-diff
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 14200
Default Alt Text: bug71394-tokenizer.patch (53 KB)

bug71394-tokenizer.patchacl*securityActions

bug71394-tokenizer.patchView Options

File Metadata

Event Timeline

bug71394-tokenizer.patch
acl*security
Actions

bug71394-tokenizer.patch
View Options