Page MenuHomePhabricator

5019d.patch

Authored By
bzimport
Nov 21 2014, 9:08 PM
Size
13 KB
Referenced Files
None
Subscribers
None

5019d.patch

Index: includes/EditPage.php
===================================================================
--- includes/EditPage.php (revision 15665)
+++ includes/EditPage.php (working copy)
@@ -1549,18 +1549,7 @@
*/
function sectionAnchor( $text ) {
$headline = Sanitizer::decodeCharReferences( $text );
- # strip out HTML
- $headline = preg_replace( '/<.*?' . '>/', '', $headline );
- $headline = trim( $headline );
- $sectionanchor = '#' . urlencode( str_replace( ' ', '_', $headline ) );
- $replacearray = array(
- '%3A' => ':',
- '%' => '.'
- );
- return str_replace(
- array_keys( $replacearray ),
- array_values( $replacearray ),
- $sectionanchor );
+ return '#'.wfCreateAnchor( $headline ); // use the entity-decoded text, as the parser does
}
/**
Index: includes/GlobalFunctions.php
===================================================================
--- includes/GlobalFunctions.php (revision 15665)
+++ includes/GlobalFunctions.php (working copy)
@@ -2047,4 +2047,144 @@
return false;
}
+/**
+ * Create an anchor for a section heading (the part after the # in a URL that
+ * lets you jump to a specific place on the page).
+ *
+ * @param string $current text of the heading as entered by editors. Links MAY
+ * but NEED NOT be converted from wiki to HTML; they should be treated
+ * identically. If they aren't, that's a bug, which is hopefully fixable.
+ * @param array $previous all previous anchors created for this page
+ * @return string anchor, without the leading '#'; unique with respect to $previous
+ */
+function wfCreateAnchor($current, $previous = array()) {
+ $punctuation = '-_:\. \''; // Punctuation allowed in id, plus " '"
+ $printable = 'A-Za-z0-9'.$punctuation; // All chars allowed in id, plus " '"
+ $accented_conversion = array(
+ "\xC3\x80" => 'A', // capital A with grave
+ "\xC3\x81" => 'A', // capital A with acute
+ "\xC3\x82" => 'A', // capital A with circumflex
+ "\xC3\x83" => 'A', // capital A with tilde
+ "\xC3\x84" => 'A', // capital A with diaeresis
+ "\xC3\x85" => 'A', // capital A with ring above
+ "\xC3\x86" => 'AE', // capital AE
+ "\xC3\x87" => 'C', // capital C with cedilla
+ "\xC3\x88" => 'E', // capital E with grave
+ "\xC3\x89" => 'E', // capital E with acute
+ "\xC3\x8A" => 'E', // capital E with circumflex
+ "\xC3\x8B" => 'E', // capital E with diaeresis
+ "\xC3\x8C" => 'I', // capital I with grave
+ "\xC3\x8D" => 'I', // capital I with acute
+ "\xC3\x8E" => 'I', // capital I with circumflex
+ "\xC3\x8F" => 'I', // capital I with diaeresis
+ "\xC3\x90" => 'D', // capital eth
+ "\xC3\x91" => 'N', // capital N with tilde
+ "\xC3\x92" => 'O', // capital O with grave
+ "\xC3\x93" => 'O', // capital O with acute
+ "\xC3\x94" => 'O', // capital O with circumflex
+ "\xC3\x95" => 'O', // capital O with tilde
+ "\xC3\x96" => 'O', // capital O with diaeresis
+ "\xC3\x98" => 'O', // capital O with stroke
+ "\xC3\x99" => 'U', // capital U with grave
+ "\xC3\x9A" => 'U', // capital U with acute
+ "\xC3\x9B" => 'U', // capital U with circumflex
+ "\xC3\x9C" => 'U', // capital U with diaeresis
+ "\xC3\x9D" => 'Y', // capital Y with acute
+ "\xC3\x9E" => 'TH', // capital thorn
+ "\xC3\x9F" => 's', // small sharp letter S
+ "\xC3\xA0" => 'a', // small a with grave
+ "\xC3\xA1" => 'a', // small a with acute
+ "\xC3\xA2" => 'a', // small a with circumflex
+ "\xC3\xA3" => 'a', // small a with tilde
+ "\xC3\xA4" => 'a', // small a with diaeresis
+ "\xC3\xA5" => 'a', // small a with ring above
+ "\xC3\xA6" => 'ae', // small ae
+ "\xC3\xA7" => 'c', // small c with cedilla
+ "\xC3\xA8" => 'e', // small e with grave
+ "\xC3\xA9" => 'e', // small e with acute
+ "\xC3\xAA" => 'e', // small e with circumflex
+ "\xC3\xAB" => 'e', // small e with diaeresis
+ "\xC3\xAC" => 'i', // small i with grave
+ "\xC3\xAD" => 'i', // small i with acute
+ "\xC3\xAE" => 'i', // small i with circumflex
+ "\xC3\xAF" => 'i', // small i with diaeresis
+ "\xC3\xB0" => 'd', // small eth
+ "\xC3\xB1" => 'n', // small n with tilde
+ "\xC3\xB2" => 'o', // small o with grave
+ "\xC3\xB3" => 'o', // small o with acute
+ "\xC3\xB4" => 'o', // small o with circumflex
+ "\xC3\xB5" => 'o', // small o with tilde
+ "\xC3\xB6" => 'o', // small o with diaeresis
+ "\xC3\xB8" => 'o', // small o with stroke
+ "\xC3\xB9" => 'u', // small u with grave
+ "\xC3\xBA" => 'u', // small u with acute
+ "\xC3\xBB" => 'u', // small u with circumflex
+ "\xC3\xBC" => 'u', // small u with diaeresis
+ "\xC3\xBD" => 'y', // small y with acute
+ "\xC3\xBE" => 'th', // small thorn
+ "\xC3\xBF" => 'y' // small y with diaeresis
+ );
+ $modified = $current;
+
+ // Replace the accented Latin letters listed above (keyed by their UTF-8
+ // byte sequences) with their approximate ASCII equivalents
+ $modified = strtr($modified,$accented_conversion);
+ $modified = preg_replace_callback(
+ // Remove linebreaks
+ '/\n|'.
+ // Delete image/category inclusions:
+ '\[\[ *(?:Image|Category):[^\]]+\]\]|'.
+ // Remove SGML tags:
+ '<[^>]+>|'.
+ // Convert internal links:
+ '\[\[(?:[^\|\]]+\|)?([^|\]]+)\]\]|'.
+ // Convert unaliased external links:
+ '\[([a-z]+:\/\/[^ \]]+) *\]|'.
+ // Convert aliased external links:
+ '\[[a-z]+:\/\/[^ \]]+ ([^\]]+)\]|'.
+ // Replace template parameters with defaults:
+ '\{\{\{[^|}]+\|([^}]*)\}\}\}|'.
+ // Drop unprintable runs at start/end or next to punctuation (keeping the punctuation)
+ "^[^$printable]+|[^$printable]+$|([$punctuation])[^$printable]+".
+ "|[^$printable]+([$punctuation])/i",
+ create_function( // callback: first non-empty capture group, else ""
+ '$matches',
+ 'foreach($matches as $match) {
+ if ($match != "" && $match != $matches[0])
+ return $match;
+ }; return "";'
+ ), $modified
+ );
+ // Turn all other unprintable sequences to underscores
+ $modified = preg_replace("/[^$printable]+/",'_',$modified);
+ // Strip apostrophes, spaces to underscores
+ $modified = str_replace(array("'",' '),array('','_'),$modified);
+
+ // Check for null/underscore-only string, e.g. if it's all unprintable
+ // (UTF-8 gibberish is better than numbered underscores)
+ if ( preg_match('/^_*$/',$modified) )
+ $modified = preg_replace("/[^$printable]+/",'.',
+ urlencode(str_replace(' ','_',$current)));
+ // Underscores crop up at the beginning/end of ids, sometimes created
+ // from leading/trailing spaces and sometimes from other chars; strip them
+ // unless they're still the only chars (which means input was all spaces/
+ // underscores)
+ if ( !preg_match('/^_*$/',$modified) )
+ $modified = trim($modified,'_');
+ // Merge consecutive underscores
+ $modified = preg_replace('/_{2,}/','_',$modified);
+ // ids must start with a letter
+ if (!preg_match('/[A-Za-z]/',substr($modified,0,1)))
+ $modified = 'x'.$modified;
+ // Check for conflict: append _2, _3, ... until the id is not in $previous
+ if (in_array($modified,$previous)) {
+ for($i = 2; true; ++$i) {
+ if (!in_array($modified.'_'.$i,$previous)) {
+ $modified .= '_'.$i;
+ break;
+ }
+ }
+ }
+ return $modified;
+}
?>
Index: includes/Linker.php
===================================================================
--- includes/Linker.php (revision 15665)
+++ includes/Linker.php (working copy)
@@ -898,13 +898,7 @@
$section = $auto;
# Generate a valid anchor name from the section title.
- # Hackish, but should generally work - we strip wiki
- # syntax, including the magic [[: that is used to
- # "link rather than show" in case of images and
- # interlanguage links.
- $section = str_replace( '[[:', '', $section );
- $section = str_replace( '[[', '', $section );
- $section = str_replace( ']]', '', $section );
+ $section = wfCreateAnchor($section,array());
$sectionTitle = wfClone( $title );
$sectionTitle->mFragment = $section;
$link = $this->makeKnownLinkObj( $sectionTitle, wfMsg( 'sectionlink' ) );
Index: includes/Parser.php
===================================================================
--- includes/Parser.php (revision 15665)
+++ includes/Parser.php (working copy)
@@ -934,7 +934,9 @@
}
/**
- * Parse headers and return html
+ * Parse headers and return html. A Base64-encoded copy of the raw heading
+ * text is put in an HTML comment to hide it from the rest of the parser, so
+ * it doesn't get eaten before formatHeadings can create a proper anchor
*
* @private
*/
@@ -943,8 +945,13 @@
wfProfileIn( $fname );
for ( $i = 6; $i >= 1; --$i ) {
$h = str_repeat( '=', $i );
- $text = preg_replace( "/^{$h}(.+){$h}\\s*$/m",
- "<h{$i}>\\1</h{$i}>\\2", $text );
+ $text = preg_replace_callback( "/^{$h}(.+){$h}\\s*$/m",
+ create_function(
+ '$matches',
+ 'return "<h'.$i.'><!--".base64_encode("{$matches[1]}")."-->{$matches[1]}</h'.$i.'>{$matches[2]}";'
+ ),
+ $text
+ );
}
wfProfileOut( $fname );
return $text;
@@ -3198,12 +3205,15 @@
$prevlevel = 0;
$toclevel = 0;
$prevtoclevel = 0;
+
+ $oldheadlines = array();
foreach( $matches[3] as $headline ) {
$istemplate = 0;
$templatetitle = '';
$templatesection = 0;
$numbering = '';
+ $simple_headline = FALSE;
if (preg_match("/<!--MWTEMPLATESECTION=([^&]+)&([^_]+)-->/", $headline, $mat)) {
$istemplate = 1;
@@ -3274,45 +3284,73 @@
}
}
}
- # The canonized header is a version of the header text safe to use for links
+ // Pull out the unmolested heading text for use in the anchor, and
+ // the mutilated heading text for use in other places
+ $anchor = base64_decode( preg_replace("/^<!--(.*?)-->.*$/",
+ '$1', $headline));
+ $displayed_headline = $headline = preg_replace("/^<!--.*?-->(.*)$/",
+ '$1', $headline);
+ // We don't want to waste effort, do we?
+ if ($anchor == $displayed_headline)
+ $simple_headline = TRUE;
# Avoid insertion of weird stuff like <math> by expanding the relevant sections
- $canonized_headline = $this->unstrip( $headline, $this->mStripState );
- $canonized_headline = $this->unstripNoWiki( $canonized_headline, $this->mStripState );
-
+ $anchor = $this->unstrip($anchor,$this->mStripState);
+ $anchor = $this->unstripNoWiki($anchor,$this->mStripState);
# Remove link placeholders by the link text.
# <!--LINK number-->
# turns into
# link text with suffix
- $canonized_headline = preg_replace( '/<!--LINK ([0-9]*)-->/e',
+ $anchor = preg_replace( '/<!--LINK ([0-9]*)-->/e',
"\$this->mLinkHolders['texts'][\$1]",
- $canonized_headline );
+ $anchor );
- $canonized_headline = preg_replace( '/<!--IWLINK ([0-9]*)-->/e',
+ $anchor = preg_replace( '/<!--IWLINK ([0-9]*)-->/e',
"\$this->mInterwikiLinkHolders['texts'][\$1]",
- $canonized_headline );
+ $anchor );
+ if ($simple_headline === TRUE)
+ $displayed_headline = $anchor;
+ else {
+ // These modifications are done to both $anchor and
+ // $displayed_headline but must be repeated if they weren't
+ // identical
+ $displayed_headline = $this->unstrip($displayed_headline,$this->mStripState);
+ $displayed_headline = $this->unstripNoWiki($displayed_headline,$this->mStripState);
+ $displayed_headline = preg_replace( '/<!--LINK ([0-9]*)-->/e',
+ "\$this->mLinkHolders['texts'][\$1]",
+ $displayed_headline );
+ $displayed_headline = preg_replace( '/<!--IWLINK ([0-9]*)-->/e',
+ "\$this->mInterwikiLinkHolders['texts'][\$1]",
+ $displayed_headline );
+ }
+ // $anchor is what's safe to use for ids/links (also $legacy_anchor,
+ // but that's deprecated)
+ $anchor = wfCreateAnchor(html_entity_decode($anchor), $oldheadlines);
+ // $oldheadlines contains a list of previous ids, to avoid repetition
+ $oldheadlines[] = $anchor;
+
+ /* The whole $legacy_anchor business is for backward compatibility.
+ We may or may not want to drop it in the future; for the time being,
+ headers get a new anchor plus a legacy anchor when the two differ. */
+
# strip out HTML
- $canonized_headline = preg_replace( '/<.*?' . '>/','',$canonized_headline );
+ $displayed_headline = preg_replace( '/<.*?' . '>/','',$displayed_headline );
- $tocline = trim( $canonized_headline );
+ $tocline = trim( $displayed_headline );
# Save headline for section edit hint before it's escaped
- $headline_hint = trim( $canonized_headline );
+ $headline_hint = trim( $displayed_headline );
- $canonized_headline = Sanitizer::escapeId( $tocline );
+ $legacy_anchor = Sanitizer::escapeId( $tocline );
- $refers[$headlineCount] = $canonized_headline;
+ $refers[$headlineCount] = $legacy_anchor;
-
# count how many in assoc. array so we can track dupes in anchors
- @$refers[$canonized_headline]++;
+ @$refers[$legacy_anchor]++;
- $refcount[$headlineCount]=$refers[$canonized_headline];
+ $refcount[$headlineCount]=$refers[$legacy_anchor];
# Don't number the heading if it is the only one (looks silly)
if( $doNumberHeadings && count( $matches[3] ) > 1) {
# the two are different if the line contains a link
$headline=$numbering . ' ' . $headline;
}
-
- # Create the anchor for linking from the TOC to the section
- $anchor = $canonized_headline;
if($refcount[$headlineCount] > 1 ) {
- $anchor .= '_' . $refcount[$headlineCount];
+ $legacy_anchor .= '_' . $refcount[$headlineCount];
}
if( $enoughToc && ( !isset($wgMaxTocLevel) || $toclevel<$wgMaxTocLevel ) ) {
$toc .= $sk->tocLine($anchor, $tocline, $numbering, $toclevel);
@@ -3327,9 +3365,12 @@
$head[$headlineCount] .= $sk->editSectionLink($this->mTitle, $sectionCount+1, $headline_hint);
}
- # give headline the correct <h#> tag
+ # give headline the correct <h#> tag, work out anchors
- @$head[$headlineCount] .= "<a name=\"$anchor\"></a><h".$level.$matches[2][$headlineCount] .$headline.'</h'.$level.'>';
+ if ($anchor != $legacy_anchor)
+ @$head[$headlineCount] .= "<a name=\"$anchor\"></a><a name=\"$legacy_anchor\"></a><h".$level.$matches[2][$headlineCount] .$headline.'</h'.$level.'>';
+ else @$head[$headlineCount] .= "<a name=\"$anchor\"></a><h".$level.$matches[2][$headlineCount] .$headline.'</h'.$level.'>';
+
$headlineCount++;
if( !$istemplate )
$sectionCount++;

File Metadata

Mime Type
text/x-diff
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2247
Default Alt Text
5019d.patch (13 KB)

Event Timeline