Page MenuHomePhabricator
Authored By
ssastry
Mar 14 2019, 3:30 PM
Size
8 KB
Referenced Files
None
Subscribers
None
#!/usr/bin/env hhvm
<?php
// This script is a command-line tool: stop immediately under any other SAPI.
if ( php_sapi_name() !== 'cli' ) {
	exit();
}
require __DIR__ . '/../vendor/autoload.php';
use RemexHtml\DOM;
use RemexHtml\Tokenizer;
use RemexHtml\TreeBuilder;
use RemexHtml\Serializer;
/**
 * A Tokenizer\TokenHandler whose callbacks are all empty.
 *
 * Every token event delivered to this handler is discarded, so it can be
 * attached to a tokenizer when only the tokenizer's own work is of interest.
 */
class NullHandler implements Tokenizer\TokenHandler {
	/** Ignore the start-of-document event. */
	public function startDocument( Tokenizer\Tokenizer $t, $fns, $fn ) {
	}

	/** Ignore the end-of-document event. */
	public function endDocument( $pos ) {
	}

	/** Ignore a reported error. */
	public function error( $text, $pos ) {
	}

	/** Ignore a run of character data. */
	public function characters( $text, $start, $length, $sourceStart, $sourceLength ) {
	}

	/** Ignore a start tag. */
	public function startTag(
		$name, Tokenizer\Attributes $attrs, $selfClose, $sourceStart, $sourceLength
	) {
	}

	/** Ignore an end tag. */
	public function endTag( $name, $sourceStart, $sourceLength ) {
	}

	/** Ignore a doctype token. */
	public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
	}

	/** Ignore a comment token. */
	public function comment( $text, $sourceStart, $sourceLength ) {
	}
}
/**
 * A TreeBuilder\TreeHandler whose callbacks are all empty.
 *
 * benchmarkTreeBuilder() plugs this in as the tree builder's handler so that
 * no work is done with the tree mutation events it receives.
 */
class NullTreeHandler implements TreeBuilder\TreeHandler {
	/** Ignore the start-of-document event. */
	public function startDocument( $fns, $fn ) {
	}

	/** Ignore the end-of-document event. */
	public function endDocument( $pos ) {
	}

	/** Ignore inserted character data. */
	public function characters(
		$parent, $refNode, $text, $start, $length, $sourceStart, $sourceLength
	) {
	}

	/** Ignore an inserted element. */
	public function insertElement(
		$parent, $refNode, TreeBuilder\Element $element, $void, $sourceStart, $sourceLength
	) {
	}

	/** Ignore the end of an element. */
	public function endTag( TreeBuilder\Element $element, $sourceStart, $sourceLength ) {
	}

	/** Ignore a doctype. */
	public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
	}

	/** Ignore an inserted comment. */
	public function comment( $parent, $refNode, $text, $sourceStart, $sourceLength ) {
	}

	/** Ignore a reported error. */
	public function error( $text, $pos ) {
	}

	/** Ignore a request to merge attributes into an element. */
	public function mergeAttributes(
		TreeBuilder\Element $element, Tokenizer\Attributes $attrs, $sourceStart
	) {
	}

	/** Ignore a node removal. */
	public function removeNode( TreeBuilder\Element $element, $sourceStart ) {
	}

	/** Ignore a request to move children to a new parent. */
	public function reparentChildren(
		TreeBuilder\Element $element, TreeBuilder\Element $newParent, $sourceStart
	) {
	}
}
/**
 * Tokenize $text into a TokenSerializer and print what it produced.
 *
 * The serializer output is printed first, followed by one "Error at ..."
 * line per entry returned by the serializer's getErrors().
 *
 * @param string $text Input to tokenize
 */
function reserialize( $text ) {
	$serializer = new Tokenizer\TokenSerializer;
	$lexer = new Tokenizer\Tokenizer( $serializer, $text, [] );
	$lexer->execute( $GLOBALS['executeOptions'] );

	echo $serializer->getOutput(), "\n";
	foreach ( $serializer->getErrors() as $entry ) {
		// Index 1 is printed as the location, index 0 as the message.
		printf( "Error at %s: %s\n", $entry[1], $entry[0] );
	}
}
/**
 * Tokenize $text starting in a given tokenizer state and print the result.
 *
 * Works like reserialize(), except that the execute options are built from
 * the arguments instead of being read from $GLOBALS['executeOptions'].
 *
 * @param string $text Input to tokenize
 * @param int $state Value passed as the 'state' execute option
 * @param string $endTag Value passed as the 'appropriateEndTag' execute option
 */
function reserializeState( $text, $state, $endTag ) {
	$serializer = new Tokenizer\TokenSerializer;
	$lexer = new Tokenizer\Tokenizer( $serializer, $text, [] );
	$options = [
		'state' => $state,
		'appropriateEndTag' => $endTag,
	];
	$lexer->execute( $options );

	echo $serializer->getOutput(), "\n";
	foreach ( $serializer->getErrors() as $entry ) {
		// Index 1 is printed as the location, index 0 as the message.
		printf( "Error at %s: %s\n", $entry[1], $entry[0] );
	}
}
/**
 * Reserialize $text as the content of a <script> element.
 *
 * Delegates to reserializeState() with the script-data tokenizer state and
 * 'script' as the appropriate end tag.
 *
 * @param string $text Input to tokenize
 */
function reserializeScript( $text ) {
	$state = Tokenizer\Tokenizer::STATE_SCRIPT_DATA;
	$endTag = 'script';
	reserializeState( $text, $state, $endTag );
}
/**
 * Reserialize $text as the content of an <xmp> element.
 *
 * The HTML Standard parses <xmp> with the generic raw text element parsing
 * algorithm, i.e. in the RAWTEXT tokenizer state. RCDATA (used for <title>
 * and <textarea>) would additionally expand character references, which
 * must stay literal inside <xmp>.
 *
 * @param string $text Input to tokenize
 */
function reserializeXmp( $text ) {
	reserializeState( $text, Tokenizer\Tokenizer::STATE_RAWTEXT, 'xmp' );
}
/**
 * Parse $text with tracing enabled at both the dispatch and tree-mutation
 * layers, then print the serialized HTML.
 *
 * Each trace message is printed on its own line as it is produced; the
 * serializer result is printed after tokenization has finished.
 *
 * @param string $text Input HTML
 */
function trace( $text ) {
	$emit = function ( $msg ) {
		echo $msg, "\n";
	};

	$serializer = new Serializer\Serializer( new Serializer\HtmlFormatter );
	$mutationTracer = new TreeBuilder\TreeMutationTracer( $serializer, $emit );
	$builder = new TreeBuilder\TreeBuilder( $mutationTracer, [] );
	$dispatchTracer = new TreeBuilder\DispatchTracer(
		$text,
		new TreeBuilder\Dispatcher( $builder ),
		$emit
	);

	$lexer = new Tokenizer\Tokenizer( $dispatchTracer, $text, [] );
	$lexer->execute( $GLOBALS['executeOptions'] );

	echo $serializer->getResult(), "\n";
}
/**
 * Parse $text with a DestructTracer as the innermost tree handler, printing
 * every trace message on its own line.
 *
 * Unlike trace(), no serializer is involved and nothing is printed after
 * tokenization apart from what the tracers emit through the callback.
 *
 * @param string $text Input HTML
 */
function traceDestruct( $text ) {
// Shared sink for all three tracers: one message per output line.
$traceCallback = function ( $msg ) {
print "$msg\n";
};
// NOTE(review): presumably reports when tree nodes are destroyed, so the
// lifetime of the locals below may matter -- confirm before restructuring.
$destructTracer = new TreeBuilder\DestructTracer( $traceCallback );
$treeTracer = new TreeBuilder\TreeMutationTracer( $destructTracer, $traceCallback );
$treeBuilder = new TreeBuilder\TreeBuilder( $treeTracer, [] );
$dispatcher = new TreeBuilder\Dispatcher( $treeBuilder );
$dispatchTracer = new TreeBuilder\DispatchTracer( $text, $dispatcher, $traceCallback );
$tokenizer = new Tokenizer\Tokenizer( $dispatchTracer, $text, [] );
// Execute options come from the script-level $executeOptions array.
$tokenizer->execute( $GLOBALS['executeOptions'] );
}
/**
 * Run $text through tokenizer -> dispatcher -> tree builder -> HTML
 * serializer without going through a DOM.
 *
 * The error callback does nothing and the serialized result is built but
 * not printed, so this function produces no output of its own.
 *
 * @param string $text Input HTML
 */
function tidy( $text ) {
	$discardError = function ( $msg, $pos ) {
		// Errors are not reported.
	};

	$serializer = new Serializer\Serializer( new Serializer\HtmlFormatter, $discardError );
	$builder = new TreeBuilder\TreeBuilder( $serializer, [] );
	$lexer = new Tokenizer\Tokenizer(
		new TreeBuilder\Dispatcher( $builder ),
		$text,
		$GLOBALS['tokenizerOptions']
	);
	$lexer->execute( $GLOBALS['executeOptions'] );
	// $serializer->getResult() holds the output; it is not printed here.
}
/**
 * Parse $text and print the tree as rendered by Serializer\TestFormatter.
 *
 * Errors passed to the serializer's error callback are printed as
 * " * [position] message" lines; the serializer result is printed once
 * tokenization has finished.
 *
 * @param string $text Input HTML
 */
function test( $text ) {
	$reportError = function ( $msg, $pos ) {
		echo " * [$pos] $msg\n";
	};

	$serializer = new Serializer\Serializer( new Serializer\TestFormatter, $reportError );
	$builder = new TreeBuilder\TreeBuilder( $serializer, [] );
	$lexer = new Tokenizer\Tokenizer(
		new TreeBuilder\Dispatcher( $builder ),
		$text,
		$GLOBALS['tokenizerOptions']
	);
	$lexer->execute( $GLOBALS['executeOptions'] );

	echo $serializer->getResult(), "\n";
}
/**
 * Run $text through tokenizer -> dispatcher -> tree builder -> DOM builder,
 * wrapped in a DOMSerializer with the HTML formatter.
 *
 * The tokenizer is configured from $GLOBALS['tokenizerOptions'], exactly as
 * in tidy(), so that the "DOM + serialize" and "SAX + serialize" benchmarks
 * time the same tokenizer work and differ only in the tree handler.
 *
 * The error callback does nothing and the serialized result is not printed,
 * so this function produces no output of its own.
 *
 * @param string $text Input HTML
 */
function tidyViaDOM( $text ) {
	$error = function ( $msg, $pos ) {
		// Errors are not reported.
	};
	$formatter = new Serializer\HtmlFormatter;
	$domBuilder = new DOM\DOMBuilder( $error );
	$serializer = new DOM\DOMSerializer( $domBuilder, $formatter );
	$treeBuilder = new TreeBuilder\TreeBuilder( $serializer, [] );
	$dispatcher = new TreeBuilder\Dispatcher( $treeBuilder );
	// Same tokenizer options as tidy(), for a like-for-like comparison.
	$tokenizer = new Tokenizer\Tokenizer( $dispatcher, $text, $GLOBALS['tokenizerOptions'] );
	$tokenizer->execute( $GLOBALS['executeOptions'] );
	// $serializer->getResult() holds the output; it is not printed here.
}
/**
 * Parse $text into a DOM wrapped in a DOMSerializer that uses
 * Serializer\TestFormatter.
 *
 * Errors passed to the DOM builder's error callback are printed as
 * " * [position] message" lines. The serialized result itself is not
 * printed.
 *
 * @param string $text Input HTML
 */
function testViaDOM( $text ) {
	$reportError = function ( $msg, $pos ) {
		echo " * [$pos] $msg\n";
	};

	$serializer = new DOM\DOMSerializer(
		new DOM\DOMBuilder( $reportError ),
		new Serializer\TestFormatter
	);
	$builder = new TreeBuilder\TreeBuilder( $serializer, [] );
	$lexer = new Tokenizer\Tokenizer( new TreeBuilder\Dispatcher( $builder ), $text, [] );
	$lexer->execute( $GLOBALS['executeOptions'] );
	// $serializer->getResult() holds the output; it is not printed here.
}
/**
 * Time one tokenizer + tree builder pass over $text with a NullTreeHandler
 * attached, and print the elapsed wall-clock seconds on a line.
 *
 * Object construction is included in the measured interval.
 *
 * @param string $text Input HTML
 */
function benchmarkTreeBuilder( $text ) {
	$startedAt = microtime( true );

	$builder = new TreeBuilder\TreeBuilder( new NullTreeHandler, [] );
	$lexer = new Tokenizer\Tokenizer(
		new TreeBuilder\Dispatcher( $builder ),
		$text,
		$GLOBALS['tokenizerOptions']
	);
	$lexer->execute( $GLOBALS['executeOptions'] );

	$elapsed = microtime( true ) - $startedAt;
	echo "$elapsed\n";
}
/**
 * Time one tokenizer + tree builder pass over $text that builds a DOM, and
 * print the elapsed wall-clock seconds on a line.
 *
 * The tree builder is created with 'ignoreErrors' => true. Object
 * construction is included in the measured interval.
 *
 * @param string $text Input HTML
 */
function benchmarkDOM( $text ) {
	$startedAt = microtime( true );

	$builder = new TreeBuilder\TreeBuilder( new DOM\DOMBuilder, [ 'ignoreErrors' => true ] );
	$lexer = new Tokenizer\Tokenizer(
		new TreeBuilder\Dispatcher( $builder ),
		$text,
		$GLOBALS['tokenizerOptions']
	);
	$lexer->execute( $GLOBALS['executeOptions'] );

	$elapsed = microtime( true ) - $startedAt;
	echo "$elapsed\n";
}
/**
 * Dump every token produced by Tokenizer\TokenGenerator for $text.
 *
 * Tokens are printed with print_r(). For tokens of type 'text', the 'text'
 * field is first cut down to the substring described by 'start' and
 * 'length', and those two fields are removed from the dump.
 *
 * @param string $text Input HTML
 */
function generate( $text ) {
	$tokens = Tokenizer\TokenGenerator::generate( $text, $GLOBALS['tokenizerOptions'] );
	foreach ( $tokens as $token ) {
		if ( $token['type'] !== 'text' ) {
			print_r( $token );
			continue;
		}

		$token['text'] = substr( $token['text'], $token['start'], $token['length'] );
		unset( $token['start'], $token['length'] );
		print_r( $token );
	}
}
/**
 * Time a full iteration over the tokens that Tokenizer\TokenGenerator
 * yields for $text, and print the elapsed wall-clock seconds on a line.
 *
 * @param string $text Input HTML
 */
function benchmarkGenerate( $text ) {
	$startedAt = microtime( true );

	$tokens = Tokenizer\TokenGenerator::generate( $text, $GLOBALS['tokenizerOptions'] );
	foreach ( $tokens as $unused ) {
		// Drain the iterator; the tokens themselves are not needed.
	}

	$elapsed = microtime( true ) - $startedAt;
	echo "$elapsed\n";
}
// ---------------------------------------------------------------------------
// Benchmark driver.
//
// Usage: <this script> <html-file>
//
// Reads the file named by the first command-line argument and runs each
// benchmark ten times over its contents, printing one timing per iteration.
// ---------------------------------------------------------------------------

// Fail early with a usage message instead of benchmarking a missing input.
if ( !isset( $argv[1] ) ) {
	fwrite( STDERR, 'Usage: ' . basename( __FILE__ ) . " <html-file>\n" );
	exit( 1 );
}

$text = file_get_contents( $argv[1] );
if ( $text === false ) {
	// file_get_contents() returns false on failure; without this check every
	// benchmark below would run on a non-string input.
	fwrite( STDERR, "Unable to read input file: {$argv[1]}\n" );
	exit( 1 );
}

// Read through $GLOBALS['tokenizerOptions'] by the functions above and
// passed to the Tokenizer / TokenGenerator they construct.
$tokenizerOptions = [
	'ignoreNulls' => true,
	'ignoreCharRefs' => true,
	'ignoreErrors' => true,
	'skipPreprocess' => true,
];

// Read through $GLOBALS['executeOptions'] and passed to Tokenizer::execute().
$executeOptions = [
	// 'fragmentNamespace' => \RemexHtml\HTMLData::NS_HTML,
	// 'fragmentName' => 'div'
];

// The first three benchmarks print their own elapsed time.
print "---- Tree builder ----\n";
for ( $i = 0; $i < 10; $i++ ) {
	print "Iteration $i:";
	benchmarkTreeBuilder( $text );
}

print "---- DOM ----\n";
for ( $i = 0; $i < 10; $i++ ) {
	print "Iteration $i:";
	benchmarkDOM( $text );
}

print "---- Generate? ----\n";
for ( $i = 0; $i < 10; $i++ ) {
	print "Iteration $i:";
	benchmarkGenerate( $text );
}

// tidyViaDOM() and tidy() print nothing themselves, so they are timed here.
print "---- DOM + serialize ----\n";
for ( $i = 0; $i < 10; $i++ ) {
	print "Iteration $i:";
	$time = -microtime( true );
	tidyViaDOM( $text );
	$time += microtime( true );
	print "$time\n";
}

print "---- SAX + serialize ----\n";
for ( $i = 0; $i < 10; $i++ ) {
	print "Iteration $i:";
	$time = -microtime( true );
	tidy( $text );
	$time += microtime( true );
	print "$time\n";
}

File Metadata

Mime Type
text/x-c++
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
7181748
Default Alt Text
bm.php (8 KB)

Event Timeline