Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Paste
P1843
HTML5Tidy.java
Active
Public
Actions
Authored by
•
tstarling
on Aug 6 2015, 11:27 AM.
Edit Paste
Archive Paste
View Raw File
Subscribe
Mute Notifications
Award Token
Flag For Later
Tags
None
Referenced Files
F608982: HTML5Tidy.java
Aug 6 2015, 11:27 AM
2015-08-06 11:27:22 (UTC+0)
Subscribers
Ricordisamoa
package
org.wikimedia.html5tidy
;
import
javax.servlet.*
;
import
java.io.IOException
;
import
java.io.ByteArrayInputStream
;
import
java.io.ByteArrayOutputStream
;
import
java.util.Enumeration
;
import
java.nio.charset.Charset
;
import
org.xml.sax.ContentHandler
;
import
org.xml.sax.InputSource
;
import
org.xml.sax.SAXException
;
import
nu.validator.encoding.Encoding
;
import
nu.validator.htmlparser.common.XmlViolationPolicy
;
import
nu.validator.htmlparser.sax.HtmlParser
;
import
nu.validator.htmlparser.sax.HtmlSerializer
;
public
class
HTML5Tidy
extends
GenericServlet
{
public
void
service
(
ServletRequest
req
,
ServletResponse
res
)
throws
ServletException
,
IOException
{
req
.
setCharacterEncoding
(
"UTF-8"
);
String
text
=
req
.
getParameter
(
"text"
);
if
(
text
==
null
)
{
throw
new
ServletException
(
"The text parameter must be given"
);
}
Charset
utf8
;
try
{
utf8
=
Charset
.
forName
(
"UTF-8"
);
}
catch
(
IllegalArgumentException
e
)
{
throw
new
ServletException
(
"No UTF-8"
,
e
);
}
byte
[]
input
=
req
.
getParameter
(
"text"
).
getBytes
(
utf8
);
ByteArrayOutputStream
sink
=
new
ByteArrayOutputStream
();
ContentHandler
serializer
=
new
HtmlSerializer
(
sink
);
HtmlParser
parser
=
new
HtmlParser
(
XmlViolationPolicy
.
ALLOW
);
parser
.
setContentHandler
(
serializer
);
try
{
parser
.
setProperty
(
"http://xml.org/sax/properties/lexical-handler"
,
serializer
);
InputSource
source
=
new
InputSource
(
new
ByteArrayInputStream
(
input
));
source
.
setEncoding
(
"UTF-8"
);
parser
.
parse
(
source
);
}
catch
(
SAXException
e
)
{
throw
new
ServletException
(
"Error parsing HTML"
,
e
);
}
// HtmlSerializer writes UTF-8 by default
res
.
setContentType
(
"text/html;charset=UTF-8"
);
res
.
getOutputStream
().
write
(
sink
.
toByteArray
());
}
};
Event Timeline
•
tstarling
edited the content of this paste.
(Show Details)
Aug 6 2015, 11:27 AM
2015-08-06 11:27:22 (UTC+0)
•
tstarling
changed the title of this paste from untitled to
HTML5Tidy.java
.
•
tstarling
updated the paste's language from
autodetect
to
autodetect
.
•
tstarling
mentioned this in
T89331: Replace HTML4 Tidy in MW parser with an equivalent HTML5 based tool
.
Ricordisamoa
subscribed.
Aug 8 2015, 10:55 PM
2015-08-08 22:55:58 (UTC+0)
Fralambert
mentioned this in
T151186: Add monolingual language code fr-CA
.
Nov 21 2016, 1:52 AM
2016-11-21 01:52:09 (UTC+0)
VIGNERON
mentioned this in
T173666: Add lang code gcr
.
Aug 20 2017, 3:48 PM
2017-08-20 15:48:49 (UTC+0)
•
AKhatun_WMF
mentioned this in
T282129: Test triple-analysis functions over a large dataset with Spark
.
May 19 2021, 11:15 AM
2021-05-19 11:15:45 (UTC+0)
Log In to Comment