Add sample dictionary downloaders/builders

This commit is contained in:
Přemysl Eric Janouch 2021-10-07 03:37:12 +02:00
parent 3881725904
commit ed8b1bcdad
Signed by: p
GPG Key ID: A0420B94F92B9493
4 changed files with 47 additions and 3 deletions

View File

@ -172,6 +172,21 @@ endforeach ()
add_custom_target (tools DEPENDS ${tools}) add_custom_target (tools DEPENDS ${tools})
# Example dictionaries
#
# Every shell script found in dicts/ gets its own "dicts-<name>" target that
# runs it, and the "dicts" umbrella target depends on all of them.
# The scripts are started from the default working directory of custom targets
# (the current binary directory), which is where their output ends up.
file (GLOB dicts_scripts "${PROJECT_SOURCE_DIR}/dicts/*.sh")
set (dicts_targets)
foreach (dict_script ${dicts_scripts})
	get_filename_component (dict_name "${dict_script}" NAME_WE)
	list (APPEND dicts_targets "dicts-${dict_name}")

	# The scripts invoke tabfile by its bare name, so the directory holding
	# the built binary is prepended to PATH.  It is resolved through a generator
	# expression rather than assumed to be ".", so that this keeps working with
	# multi-config generators and custom runtime output directories.
	# Within the sh snippet, $0 is the script and $1 is the tabfile directory;
	# passing both as positional arguments keeps paths with spaces intact.
	add_custom_target (dicts-${dict_name}
		COMMAND sh -c "PATH=\"$1:$PATH\" \"$0\""
			"${dict_script}" "$<TARGET_FILE_DIR:tabfile>"
		DEPENDS tabfile
		COMMENT "Generating sample dictionary ${dict_name}"
		VERBATIM)
endforeach ()
add_custom_target (dicts DEPENDS ${dicts_targets})
# The files to be installed # The files to be installed
include (GNUInstallDirs) include (GNUInstallDirs)
install (TARGETS ${PROJECT_NAME} DESTINATION ${CMAKE_INSTALL_BINDIR}) install (TARGETS ${PROJECT_NAME} DESTINATION ${CMAKE_INSTALL_BINDIR})
@ -212,4 +227,3 @@ set (CPACK_SOURCE_IGNORE_FILES "/\\\\.git;/build;/CMakeLists.txt.user")
set (CPACK_SOURCE_PACKAGE_FILE_NAME "${PROJECT_NAME}-${PROJECT_VERSION}") set (CPACK_SOURCE_PACKAGE_FILE_NAME "${PROJECT_NAME}-${PROJECT_VERSION}")
include (CPack) include (CPack)

View File

@ -101,13 +101,14 @@ Dictionaries
Unfortunately this application only really works with specific dictionaries. Unfortunately this application only really works with specific dictionaries.
Word definitions have to be in plain text, separated by newlines. Word definitions have to be in plain text, separated by newlines.
The `make dicts` command will build some examples from freely available sources.
You may use the included transform tool to transform existing dictionaries that You may use the included transform tool to transform existing dictionaries that
are almost useful as they are, e.g. after stripping XML tags. You might want to are almost useful as they are, e.g. after stripping XML tags. You might want to
fix up the `sametypesequence` of the resulting '.ifo' file afterwards, and run fix up the `sametypesequence` of the resulting '.ifo' file afterwards, and run
dictzip on the resulting '.dict' file. dictzip on the resulting '.dict' file.
https://mega.co.nz/#!axtD0QRK!sbtBgizksyfkPqKvKEgr8GQ11rsWhtqyRgUUV0B7pwg[ https://mega.co.nz/#!axtD0QRK!sbtBgizksyfkPqKvKEgr8GQ11rsWhtqyRgUUV0B7pwg[CZ <--> EN/DE/PL/RU dictionaries]
CZ <--> { EN, DE, PL, RU } dictionaries]
Contributing and Support Contributing and Support
------------------------ ------------------------

21
dicts/gnu-fdl-en-cz.sh Executable file
View File

@ -0,0 +1,21 @@
#!/bin/sh -e
# GNU/FDL English-Czech dictionary, see https://www.svobodneslovniky.cz/
#
# Builds two dictionaries from a single download: gnu-fdl-en-cz and
# gnu-fdl-cz-en.  The pipeline below works in these stages:
#
#  1. curl fetches the gzipped source to standard output, following redirects.
#  2. zcat decompresses it, grep drops lines starting with "#",
#     and sed deletes every backslash.
#  3. perl splits each line on tabs into the fields: English, Czech, notes,
#     special, translator.  Only the first three are used, and lines with
#     an empty or missing Czech field are skipped.
#  4. Within the notes, "word:" prefixes are removed, and the first word
#     followed by a period (unless that is followed by "]") is wrapped in
#     parentheses.  Non-empty notes are appended to the translation.
#  5. Translations are collected per headword in both directions.  Once the
#     input ends, each direction is piped into tabfile as lines of
#     "headword<TAB>definitions", with backslashes, newlines and tabs escaped,
#     and with multiple definitions of a headword joined by newlines.
#
# tabfile is invoked by its bare name, so it needs to be reachable through PATH.
#
# NOTE(review): sh only reports the status of the last command in a pipeline,
# so -e presumably will not catch a failed download or decompression,
# and the result of closing the tabfile pipe is not checked either -- confirm
# whether that matters for callers.
curl -Lo- https://www.svobodneslovniky.cz/data/en-cs.txt.gz | \
zcat | grep -v ^# | sed 's/\\//g' | perl -CSD -F\\t -le '
sub e { shift =~ s/\\/\\\\/gr =~ s/\n/\\n/gr =~ s/\t/\\t/gr }
sub w {
open(my $f, "|-", "tabfile gnu-fdl-$_[0]") or die $!;
print $f e($k) . "\t" . e(join("\n", @$v))
while ($k, $v) = each %{$_[1]};
close($f);
}
my ($en, $cz, $notes, $special, $translator) = @F;
if ($cz) {
$notes =~ s/\w+:\s?//g; # remove word classes
$notes =~ s/(\w+\.)(?!])/($1)/; # quote "pl."
push(@{$encz{$en}}, $notes ? "$cz " . $notes : $cz);
push(@{$czen{$cz}}, $notes ? "$en " . $notes : $en);
} END {
w("en-cz", \%encz);
w("cz-en", \%czen);
}'

8
dicts/slovnik-cizich-slov.sh Executable file
View File

@ -0,0 +1,8 @@
#!/bin/sh -e
# Slovník cizích slov (a Czech dictionary of foreign words),
# see https://slovnik-cizich-slov.abz.cz/web.php/o-slovniku
#
# Builds the slovnik-cizich-slov dictionary.  The pipeline below:
#
#  1. has curl fetch the export to standard output, following redirects,
#  2. converts it from Latin-2 to UTF-8 with iconv,
#  3. has perl split each line on "|"; the first field is taken as a ", "
#     separated list of headwords, and for each of them one line of
#     "headword<TAB>definition" is printed, where the definition is the third
#     field with backslashes doubled and each "; " replaced by an escaped
#     newline (a literal backslash followed by "n"),
#  4. sorts the lines and drops exact duplicates with sort -u,
#  5. feeds the result to tabfile, which is invoked by its bare name
#     and so needs to be reachable through PATH.
#
# TODO: Skipping the optional pronunciation field, tabfile can't handle it yet,
# but could be made to accept a lowercase sametypesequence
# (presumably this is the second field, which the perl code never reads).
curl -Lo- https://slovnik-cizich-slov.abz.cz/export.php | \
iconv -f latin2 -t UTF-8 | perl -CSD -F\\\| -le '
print "$_\t" . $F[2] =~ s/\\/\\\\/gr =~ s/; /\\n/gr for split(", ", $F[0])
' | sort -u | tabfile slovnik-cizich-slov