Page MenuHomePhabricator

HTML5Tidy.java

Authored By
tstarling
Aug 6 2015, 11:27 AM
Size
1 KB
Referenced Files
None
Subscribers
None

HTML5Tidy.java

package org.wikimedia.html5tidy;
import javax.servlet.*;
import java.io.IOException;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.util.Enumeration;
import java.nio.charset.Charset;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import nu.validator.encoding.Encoding;
import nu.validator.htmlparser.common.XmlViolationPolicy;
import nu.validator.htmlparser.sax.HtmlParser;
import nu.validator.htmlparser.sax.HtmlSerializer;
public class HTML5Tidy extends GenericServlet {
public void service(ServletRequest req, ServletResponse res)
throws ServletException, IOException
{
req.setCharacterEncoding("UTF-8");
String text = req.getParameter("text");
if (text == null) {
throw new ServletException("The text parameter must be given");
}
Charset utf8;
try {
utf8 = Charset.forName("UTF-8");
} catch (IllegalArgumentException e) {
throw new ServletException("No UTF-8", e);
}
byte[] input = req.getParameter("text").getBytes(utf8);
ByteArrayOutputStream sink = new ByteArrayOutputStream();
ContentHandler serializer = new HtmlSerializer(sink);
HtmlParser parser = new HtmlParser(XmlViolationPolicy.ALLOW);
parser.setContentHandler(serializer);
try {
parser.setProperty("http://xml.org/sax/properties/lexical-handler",
serializer);
InputSource source = new InputSource(new ByteArrayInputStream(input));
source.setEncoding("UTF-8");
parser.parse(source);
} catch (SAXException e) {
throw new ServletException("Error parsing HTML", e);
}
// HtmlSerializer writes UTF-8 by default
res.setContentType("text/html;charset=UTF-8");
res.getOutputStream().write(sink.toByteArray());
}
};

File Metadata

Mime Type
text/plain; charset=utf-8
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
584829
Default Alt Text
HTML5Tidy.java (1 KB)

Event Timeline