Page MenuHomePhabricator
Paste P8249

(An Untitled Masterwork)
ActivePublic

Authored by tstarling on Mar 21 2019, 4:10 AM.
Tags
None
Referenced Files
F28432881: raw.txt
Mar 21 2019, 4:10 AM
Subscribers
None
commit abe51c977df423f44513a0e5a860ec5d3b539917 (benchmark-tokenizer)
Author: Tim Starling <tstarling@wikimedia.org>
Date: Fri Dec 18 08:51:48 2015 +1100
Benchmark tokenizer
Change-Id: Id4c64dc61602e924e211e5c1d348d907aa976a74
diff --git a/lib/wt2html/tokenizer.js b/lib/wt2html/tokenizer.js
index cb385f29..144a89bd 100644
--- a/lib/wt2html/tokenizer.js
+++ b/lib/wt2html/tokenizer.js
@@ -295,13 +295,13 @@ PegTokenizer.prototype.tokenizeSync = function(text, args) {
pipelineOffset: this.offsets.startOffset || 0,
startRule: 'start',
sol: true,
- }, args, {
+ }, {
// Some rules use callbacks: start, tlb, toplevelblock.
// All other rules return tokens directly.
cb: function(r) { toks = JSUtils.pushArray(toks, r); },
pegTokenizer: this,
pegIncludes: pegIncludes,
- });
+ }, args);
let start;
if (this.traceTime) {
start = JSUtils.startTime();
diff --git a/tests/benchmark/benchmark-tokenizer.js b/tests/benchmark/benchmark-tokenizer.js
new file mode 100644
index 00000000..82e58bb4
--- /dev/null
+++ b/tests/benchmark/benchmark-tokenizer.js
@@ -0,0 +1,87 @@
+'use strict';
+
+var fs = require('fs');
+var PegTokenizer = require('../../lib/wt2html/tokenizer.js');
+var crypto = require('crypto');
+
+function parse(input) { // Tokenize `input` once against a stub env, then print the cb call count, elapsed seconds and an md5 digest of what was emitted.
+ function nop() {}; // no-op used for env hooks this benchmark ignores
+ function returnFalse() { // shared stub that always answers false
+ return false;
+ }
+ var env = { // hand-built stand-in for the environment object passed to PegTokenizer
+ log: nop,
+ conf: {
+ wiki: {
+ extConfig: {
+ tags: new Map([ // tag name -> true
+ ['pre', true],
+ ['nowiki', true],
+ ['gallery', true],
+ ['indicator', true],
+ ['timeline', true],
+ ['hiero', true],
+ ['charinsert', true],
+ ['ref', true],
+ ['references', true],
+ ['inputbox', true],
+ ['imagemap', true],
+ ['source', true],
+ ['syntaxhighlight', true],
+ ['poem', true],
+ ['section', true],
+ ['score', true],
+ ['templatedata', true],
+ ['math', true],
+ ['ce', true],
+ ['chem', true],
+ ['graph', true],
+ ['maplink', true],
+ ['categorytree', true],
+ ]),
+ },
+ getMagicWordMatcher: function () {return {test: returnFalse};}, // the returned matcher's test() is always false
+ isMagicWord: returnFalse,
+ isExtensionTag: returnFalse, // NOTE(review): always false even though extConfig.tags is populated above - confirm this is intended
+ hasValidProtocol: function (prot) { // true when prot begins with "http"
+ return /^http/.test(prot);
+ },
+ },
+ parsoid: {
+ traceFlags: new Map(), // empty: no trace flags set
+ maxDepth: 40,
+ },
+ },
+ immutable: false,
+ langConverterEnabled: function () { return true; },
+ bumpParserResourceUse: nop,
+ newAboutId: returnFalse,
+ };
+ var hash = crypto.createHash('md5'); // running digest over every value handed to cb
+ var tokenizer = new PegTokenizer.PegTokenizer(env);
+ tokenizer.initTokenizer();
+ var tokenCount = 0; // incremented once per cb invocation
+ var args = { // NOTE(review): these keys only override tokenizeSync's own cb/pegTokenizer/pegIncludes if args is merged last, as the tokenizer.js hunk in this commit appears to arrange - confirm
+ cb: function (token) { // NOTE(review): the default cb in tokenizer.js pushArray()s its argument, so this may receive a chunk of tokens rather than a single token - confirm
+ tokenCount++;
+ hash.update(JSON.stringify(token)); // feed the JSON serialization of the emitted value into the digest
+ },
+ pegTokenizer: tokenizer,
+ pipelineOffset: 0,
+ env: env,
+ pegIncludes: PegTokenizer.pegIncludes,
+ startRule: "start"
+ };
+ var t = process.hrtime(); // high-resolution start timestamp
+ tokenizer.tokenizeSync(input, args); // presumably cb runs inside this call, so its JSON/hash work is part of the timed region - TODO confirm
+ t = process.hrtime(t); // elapsed time as [seconds, nanoseconds]
+ console.log("Emitted", tokenCount, "tokens in", t[0] + t[1]/1e9, "s"); // t[0] + t[1]/1e9 folds the pair into seconds
+ console.log("Hash:", hash.digest('hex'));
+}
+
+var input = fs.readFileSync(process.argv[2], 'utf8'); // first CLI argument is the path of the text to tokenize; read whole file as UTF-8
+//try {
+ parse(input); // the surrounding try/catch is commented out, so any exception from parse() propagates uncaught
+//} catch (e) {
+// console.log(e);
+//}