Add the Czech WordNet snapshot to dicts
This commit is contained in:
parent
973d1d27ea
commit
8b9c5e0460
|
@ -173,7 +173,7 @@ endforeach ()
|
||||||
add_custom_target (tools DEPENDS ${tools})
|
add_custom_target (tools DEPENDS ${tools})
|
||||||
|
|
||||||
# Example dictionaries
|
# Example dictionaries
|
||||||
file (GLOB dicts_scripts "${PROJECT_SOURCE_DIR}/dicts/*.sh")
|
file (GLOB dicts_scripts "${PROJECT_SOURCE_DIR}/dicts/*.*")
|
||||||
set (dicts_targets)
|
set (dicts_targets)
|
||||||
foreach (dict_script ${dicts_scripts})
|
foreach (dict_script ${dicts_scripts})
|
||||||
get_filename_component (dict_name "${dict_script}" NAME_WE)
|
get_filename_component (dict_name "${dict_script}" NAME_WE)
|
||||||
|
|
|
@ -0,0 +1,82 @@
|
||||||
|
#!/usr/bin/env perl
|
||||||
|
# Czech WordNet 1.9 PDT, CC BY-NC-SA 3.0, newer versions available commercially;
|
||||||
|
# this one's IDs cannot be linked with any release of the Princeton WordNet
|
||||||
|
use warnings;
|
||||||
|
use strict;
|
||||||
|
|
||||||
|
my $base = 'https://lindat.cz/repository/xmlui';
|
||||||
|
my $path = 'handle/11858/00-097C-0000-0001-4880-3';
|
||||||
|
open(my $doc, '-|',
|
||||||
|
"curl -Lo- '$base/bitstream/$path/Czech_WordNet_1.9_PDT.zip'"
|
||||||
|
. ' | zcat | iconv -f latin2 -t UTF-8') or die $!;
|
||||||
|
|
||||||
|
# https://nlp.fi.muni.cz/trac/deb2/wiki/WordNetFormat but not quite;
|
||||||
|
# for terminology see https://wordnet.princeton.edu/documentation/wngloss7wn
|
||||||
|
my %synsets;
|
||||||
|
while (<$doc>) {
|
||||||
|
my $id = m|<ID>(.+?)</ID>| && $1; next unless defined $id;
|
||||||
|
my $pos = m|<POS>(.+?)</POS>| && $1; next if $pos eq 'e';
|
||||||
|
$synsets{$id} = {
|
||||||
|
literals => [map {s| \^\d+||gr} m|<LITERAL>(.+?)<|g],
|
||||||
|
rels => {
|
||||||
|
anto => [m^<ILR>(.+?)<TYPE>near_antonym<^g],
|
||||||
|
hyper => [m^<ILR>(.+?)<TYPE>hypernym<^g],
|
||||||
|
hypo => [m^<ILR>(.+?)<TYPE>hyponym<^g,
|
||||||
|
m^<SUBEVENT>(.+?)</SUBEVENT>^g],
|
||||||
|
super => [m^<ILR>(.+?)<TYPE>holo_part<^g],
|
||||||
|
sub => [m^<ILR>(.+?)<TYPE>(?:holo_member|mero_part|partonym)<^g],
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
# Resolve all synset links to hash references, filtering out what can't be found
|
||||||
|
while (my ($id, $synset) = each %synsets) {
|
||||||
|
while (my ($name, $links) = each %{$synset->{rels}}) {
|
||||||
|
@$links = map {$synsets{$_} || ()} @$links;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Ensure symmetry in relationships, duplicates will be taken care of later
|
||||||
|
my %antitags = qw(anto anto hyper hypo hypo hyper super sub sub super);
|
||||||
|
while (my ($id, $synset) = each %synsets) {
|
||||||
|
while (my ($name, $links) = each %{$synset->{rels}}) {
|
||||||
|
push @{$_->{rels}->{$antitags{$name}}}, $synset for @$links;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Create an inverse index from literals/keywords to their synsets
|
||||||
|
my %literals;
|
||||||
|
while (my ($id, $synset) = each %synsets) {
|
||||||
|
push @{$literals{$_}}, $synset for @{$synset->{literals}};
|
||||||
|
}
|
||||||
|
|
||||||
|
# Output synsets exploded to individual words, with expanded relationships
|
||||||
|
close($doc) or die $?;
|
||||||
|
open(my $tabfile, '|-', 'tabfile', 'czech-wordnet',
|
||||||
|
'--book-name=Czech WordNet 1.9 PDT', "--website=$base/$path",
|
||||||
|
'--date=2011-01-24', '--collation=cs_CZ') or die $!;
|
||||||
|
|
||||||
|
sub expand {
|
||||||
|
my %seen;
|
||||||
|
return grep {!$seen{$_}++} (map {@{$_->{literals}}} @_);
|
||||||
|
}
|
||||||
|
|
||||||
|
for my $keyword (sort {lc $a cmp lc $b} keys %literals) {
|
||||||
|
my @lines;
|
||||||
|
for my $synset (@{$literals{$keyword}}) {
|
||||||
|
my $rels = $synset->{rels};
|
||||||
|
push @lines,
|
||||||
|
(grep {$_ ne $keyword} @{$synset->{literals}}),
|
||||||
|
(map {"$_ ↑"} expand(@{$rels->{hyper}})),
|
||||||
|
(map {"$_ ↓"} expand(@{$rels->{hypo}})),
|
||||||
|
(map {"$_ ⊃"} expand(@{$rels->{super}})),
|
||||||
|
(map {"$_ ⊂"} expand(@{$rels->{sub}})),
|
||||||
|
(map {"$_ ≠"} expand(@{$rels->{anto}}));
|
||||||
|
}
|
||||||
|
if (@lines) {
|
||||||
|
print $tabfile "$keyword\t" . join('\n',
|
||||||
|
map { s/</</gr =~ s/>/>/gr =~ s/&/&/gr
|
||||||
|
=~ s/\\/\\\\/gr =~ s/\n/\\n/gr =~ s/\t/\\t/gr} @lines) . "\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
close($tabfile) or die $?;
|
Loading…
Reference in New Issue