Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F5655
HeWiktionary_2_CulmusDic.pl
Public
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Authored By
•
bzimport
Nov 21 2014, 10:35 PM
2014-11-21 22:35:55 (UTC+0)
Size
4 KB
Referenced Files
None
Subscribers
None
HeWiktionary_2_CulmusDic.pl
View Options
#
!
/
usr
/
bin
/
perl
-
C63
#
This
program
has
been
created
by
Maxim
Iorsh
(
iorsh@users
.
sourceforge
.
net
).
#
It
is
a
PUBLIC
DOMAIN
-
you
may
do
with
it
anything
you
wish
.
#
22
-
Aug
-
08
|
iorsh@users
.
sourceforge
.
net
|
Created
#
27
-
Apr
-
08
|
iorsh@users
.
sourceforge
.
net
|
Fixed
for
empty
entries
use
strict
;
use
integer
;
use
XML
::
LibXML
;
use
FileHandle
;
binmode
(
STDOUT
,
":utf8"
);
my
$
anglit
=
"
\x
{
05
d0
}
\x
{
05
e0
}
\x
{
05
d2
}
\x
{
05
dc
}
\x
{
05
d9
}
\x
{
05
ea
}
";
my $diacritics = "
\x
{
05
b0
}
\x
{
05
b1
}
\x
{
05
b2
}
\x
{
05
b3
}
\x
{
05
b4
}
\x
{
05
b5
}
\x
{
05
b6
}
\x
{
05
b7
}
\x
{
05
b8
}
\x
{
05
b9
}
\x
{
05
ba
}
\x
{
05
bb
}
\x
{
05
bc
}
\x
{
05
c1
}
\x
{
05
c2
}
";
my $hafnaya = "
\x
{
05
d4
}
\x
{
05
e4
}
\x
{
05
e0
}
\x
{
05
d9
}
\x
{
05
d4
}
";
my $tav = "
\x
{
05
ea
}
";
my $file = shift;
die "
Can
't find file \"$file\""
unless -f $file;
my $parser = XML::LibXML->new();
my $doc = $parser->parse_file($file);
my $total = 0;
my $xml_out = XML::LibXML::Document->new();
my $root_out = $xml_out->createElement('culmus');
$xml_out->setDocumentElement($root_out);
foreach my $page ($doc->getElementsByTagName('page'))
{
my ($title, $title_re, $rev, $block, $text, $key, $meaning, $trans, $expl, $redir, $var);
$title = $page->getElementsByTagName('title')->item(0)
->getFirstChild->textContent;
# Skip pages belonging to special namespaces
next if $title =~ /:/;
my @chars = ($title =~ /./g); # convert $title to a list of chars
$title_re = $chars[0];
# For the purpose of matching, space can be interchanged with makaf.
for (my $i = 1; $i < @chars; $i ++)
{
if ($chars[$i] =~ /[\s\x{05be}]/)
{
# $chars[$i] = "\[\\s\x{05be}\]";
}
}
for (my $i = 1; $i < @chars; $i ++)
{
$title_re = $title_re . "[" . $diacritics . "]*" . $chars[$i];
}
$rev = $page->getElementsByTagName('revision')->item(0);
$block = $rev->getElementsByTagName('text')->item(0)->getFirstChild;
# Skip empty entries
next if (!$block);
$text = $block->textContent;
my $key_out = $xml_out->createElement('key');
my $keyword_out = $xml_out->createElement('keyword');
my $keyword_text_out = XML::LibXML::Text->new($title);
$keyword_out->appendChild($keyword_text_out);
$key_out->appendChild($keyword_out);
$root_out->appendChild($key_out);
if ($text =~ /^#$hafnaya\s*\[\[(.*?)\]\]/s)
{
$redir = $1;
}
elsif ($text =~ /^#REDIRECT\s*\[\[(.*?)\]\]/s)
{
$redir = $1;
}
if ($redir)
{
if ($redir =~ /^(.*?)\#(.*?)$/s)
{
$redir = $1;
$var = $2;
}
my $ref_out = $xml_out->createElement('reference');
my $ref_text_out = XML::LibXML::Text->new($redir);
$ref_out->appendChild($ref_text_out);
$key_out->appendChild($ref_out);
if ($var)
{
my $var_out = $xml_out->createElement('variant');
my $var_text_out = XML::LibXML::Text->new($var);
$var_out->appendChild($var_text_out);
$key_out->appendChild($var_out);
}
next;
}
my @chunks = split (/(?<!=)==(?!=)/, $text);
my %meanings;
my $n_meanings = @chunks / 2;
my $i;
next if @chunks < 2; # Seems to be invalid page
for ($i = 1; $i <=$n_meanings; $i ++)
{
$key = $chunks[$i*2 - 1];
$chunks[$i*2-1] =~ /^\s*($title_re)/s;
$meaning = $1;
# skip if there is no translation.
next if !($chunks[$i*2] =~ /\{\{$tav\|$anglit\|(.*?)\}\}/s);
$trans = $1;
# Clean the translation a bit
$trans =~ s/\|/, /sg;
$chunks[$i*2] =~ /\n\#\s*(.*?)\./s;
$expl = $1;
# Remove wiki markup from explanation.
$expl =~ s/\[\[.*?\|(.*?)\]\]/$1/sg;
$expl =~ s/[\[\]]//sg;
# Further clean explanation.
$expl =~ s/\{\{/\[/sg; # remove curled braces
$expl =~ s/\}\}/\]/sg;
$expl =~ s/<small>/\[/sg;
$expl =~ s/<\/small>/\]/sg;
my $var_out = $xml_out->createElement('variant');
my $word_out = $xml_out->createElement('word');
my $word_text_out = XML::LibXML::Text->new($meaning);
$word_out->appendChild($word_text_out);
if (!$meaning)
{
# Filter out likely Wiktionary errors:
# ktiv male, multiple words
if (!($title =~ /[\s\x{05d5}\x{05d9}]/s))
{ print STDERR "Bad word in heading: " . $title . "\n"; }
}
my $def_out = $xml_out->createElement('definiton');
my $def_text_out = XML::LibXML::Text->new($expl);
$def_out->appendChild($def_text_out);
my $trans_out = $xml_out->createElement('translation');
my $en_out = $xml_out->createElement('en
'
);
my
$
en_text_out
=
XML
::
LibXML
::
Text
->
new
(
$
trans
);
$
en_out->
appendChild
(
$
en_text_out
);
$
trans_out->
appendChild
(
$
en_out
);
$
var_out->
appendChild
(
$
word_out
);
$
var_out->
appendChild
(
$
def_out
);
$
var_out->
appendChild
(
$
trans_out
);
$
key_out->
appendChild
(
$
var_out
);
$
total++
;
}
}
print
STDERR
$
total
.
"\n"
;
print
$
xml_out->toString
;
File Metadata
Details
Attached
Mime Type
text/x-perl
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
5205
Default Alt Text
HeWiktionary_2_CulmusDic.pl (4 KB)
Attached To
Mode
T20651: hewiktionary pages-articles.xml dump corrupted
Attached
Detach File
Event Timeline
Log In to Comment