If T222770 is fixed, no_punctuation_char won't be used anywhere except in url and autourl rules.
no_punctuation_char is equivalent to !unispace ![\t\n\r] [^.:,'"&%[\]<>{\x00-\x20\x7F\u180E].
So the url rule can be written as:
url "url"
= proto:url_protocol
addr:(urladdr / "")
path:( ( !inline_breaks
c:(!unispace ![\t\n\r] [^.:,'"&%[\]<>{\x00-\x20\x7F\u180E])
{ return c; }
)
/ s:[.:,'] { return s; }
/ comment
/ tplarg_or_template
/ ! ( "&" ( [lL][tT] / [gG][tT] ) ";" )
r:(
& "&" he:htmlentity { return he; }
/ [&%{]
) { return r; }
)*
// Must be at least one character after the protocol
& { return addr.length > 0 || path.length > 0; }
{
return tu.flattenString([proto, addr].concat(path));
}
Now, !inline_breaks doesn't care about [.,'], so the path sub-rule can be written as:
path:( ( !inline_breaks
c:(!unispace ![\t\n\r] [^"&%[\]<>{\x00-\x20\x7F\u180E])
{ return c; }
)
/ s:[:] { return s; }
/ comment
/ tplarg_or_template
/ ! ( "&" ( [lL][tT] / [gG][tT] ) ";" )
r:(
& "&" he:htmlentity { return he; }
/ [&%{]
) { return r; }
)*
Now, we don't need to exclude % and then accept it in [&%{] later, so:
path:( ( !inline_breaks
c:(!unispace ![\t\n\r] [^"&[\]<>{\x00-\x20\x7F\u180E])
{ return c; }
)
/ s:[:] { return s; }
/ comment
/ tplarg_or_template
/ ! ( "&" ( [lL][tT] / [gG][tT] ) ";" )
r:(
& "&" he:htmlentity { return he; }
/ [&{]
) { return r; }
)*
or:
path:( ( !inline_breaks
c:(!unispace ![\t\n\r] [^"&[\]<>{\x00-\x20\x7F\u180E])
{ return c; }
)
/ comment
/ tplarg_or_template
/ s:[:{] { return s; }
/ ! ( "&" ( [lL][tT] / [gG][tT] ) ";" )
r:$(
& "&" he:htmlentity { return he; }
/ "&"
) { return r; }
)*
I tested '\u180E' and the PHP code doesn't exclude it, so it should be removed from the first test (the character class matched as c). Also, PHP now excludes '\uFFFD', so perhaps it should be added to that class:
// c = !unispace [^&[\]{<>"\x00-\x20\x7F\uFFFD]
// = PHP's EXT_LINK_URL_CLASS, further excluding "&[]{"
// s = ":" or "{"
// r = HTML entity or "&"
path:( ( !inline_breaks
c:[^&[\]{<>"\x00-\x20\x7F\uFFFD \u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]
{ return c; }
)
/ comment
/ tplarg_or_template
/ s:[:{] { return s; }
/ ! ( "&" ( [lL][tT] / [gG][tT] ) ";" )
r:$(
& "&" he:htmlentity { return he; }
/ "&"
) { return r; }
)*
Btw, url is a starting rule, and I don't see any test code for this rule.