Add a tool to transform dictionaries

This commit is contained in:
Přemysl Eric Janouch 2020-09-03 23:17:17 +02:00
parent dd2bd04a07
commit 8d19acd91a
Signed by: p
GPG Key ID: A0420B94F92B9493
6 changed files with 317 additions and 39 deletions

View File

@ -180,15 +180,14 @@ add_executable (${PROJECT_NAME}
target_link_libraries (${PROJECT_NAME} ${project_common_libraries})
# Tools
add_executable (query-tool EXCLUDE_FROM_ALL
src/query-tool.c ${project_common_sources})
target_link_libraries (query-tool ${project_common_libraries})
set (tools add-pronunciation query-tool transform)
foreach (tool ${tools})
add_executable (${tool} EXCLUDE_FROM_ALL
src/${tool}.c ${project_common_sources})
target_link_libraries (${tool} ${project_common_libraries})
endforeach (tool)
add_executable (add-pronunciation EXCLUDE_FROM_ALL
src/add-pronunciation.c ${project_common_sources})
target_link_libraries (add-pronunciation ${project_common_libraries})
add_custom_target (tools DEPENDS add-pronunciation query-tool)
add_custom_target (tools DEPENDS ${tools})
# The files to be installed
include (GNUInstallDirs)

View File

@ -100,6 +100,11 @@ Dictionaries
Unfortunately this application only really works with specific dictionaries.
Word definitions have to be in plain text, separated by newlines.
You may use the included transform tool to transform existing dictionaries that
are almost useful as they are, e.g. after stripping XML tags. You might want to
fix up the `sametypesequence` of the resulting '.ifo' file afterwards, and run
dictzip on the resulting '.dict' file.
https://mega.co.nz/#!axtD0QRK!sbtBgizksyfkPqKvKEgr8GQ11rsWhtqyRgUUV0B7pwg[
CZ <--> { EN, DE, PL, RU } dictionaries]

View File

@ -282,32 +282,6 @@ stardict_info_copy (StardictInfo *dest, const StardictInfo *src)
}
}
/// Write a list of data fields back to a dictionary.
///
/// @param generator  the generator receiving the data
/// @param fields     list of StardictEntryField pointers to write, in order
/// @param sts        whether the output uses a sametypesequence
/// @param error      set when any of the underlying writes fails
/// @return TRUE when all fields have been written, FALSE otherwise
static gboolean
write_fields (Generator *generator, GList *fields, gboolean sts, GError **error)
{
	for (GList *iter = fields; iter; iter = iter->next)
	{
		StardictEntryField *field = iter->data;

		// The type tag is only emitted when there is no sametypesequence
		if (!sts && !generator_write_type (generator, field->type, error))
			return FALSE;

		// With a sametypesequence, only the last field goes without the mark
		gboolean mark_end = !sts || iter->next != NULL;
		gboolean written = g_ascii_islower (field->type)
			? generator_write_string (generator,
				field->data, mark_end, error)
			: generator_write_raw (generator,
				field->data, field->data_size, mark_end, error);
		if (!written)
			return FALSE;
	}
	return TRUE;
}
int
main (int argc, char *argv[])
{
@ -516,8 +490,7 @@ G_GNUC_END_IGNORE_DEPRECATIONS
start_link.next = entry->fields;
start_link.data = &field;
if (!write_fields (generator, &start_link,
info->same_type_sequence != NULL, &error)
if (!generator_write_fields (generator, &start_link, &error)
|| !generator_finish_entry (generator,
stardict_iterator_get_word (iterator), &error))
{

View File

@ -1,7 +1,7 @@
/*
* generator.c: dictionary generator
*
* Copyright (c) 2013, Přemysl Eric Janouch <p@janouch.name>
* Copyright (c) 2013 - 2020, Přemysl Eric Janouch <p@janouch.name>
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted.
@ -170,6 +170,34 @@ generator_write_string (Generator *self,
return TRUE;
}
/// Write a list of data fields back to a dictionary.  The list has to be
/// acceptable for the generated dictionary's sametypesequence (or lack of).
///
/// @param self    the generator receiving the data
/// @param fields  list of StardictEntryField pointers to write, in order
/// @param error   set when any of the underlying writes fails
/// @return TRUE when all fields have been written, FALSE otherwise
gboolean
generator_write_fields (Generator *self, const GList *fields, GError **error)
{
	const gboolean has_sts = self->info->same_type_sequence != NULL;
	for (const GList *iter = fields; iter; iter = iter->next)
	{
		StardictEntryField *field = iter->data;

		// The type tag is only emitted when there is no sametypesequence
		if (!has_sts && !generator_write_type (self, field->type, error))
			return FALSE;

		// With a sametypesequence, only the last field goes without the mark
		gboolean mark_end = !has_sts || iter->next != NULL;
		gboolean written = g_ascii_islower (field->type)
			? generator_write_string (self, field->data, mark_end, error)
			: generator_write_raw (self,
				field->data, field->data_size, mark_end, error);
		if (!written)
			return FALSE;
	}
	return TRUE;
}
/// Finishes the current entry and writes it into the index.
gboolean
generator_finish_entry (Generator *self, const gchar *word, GError **error)

View File

@ -4,7 +4,7 @@
* Nothing fancy. Just something moved out of the `stardict' test to be
* conveniently reused by the included tools.
*
* Copyright (c) 2013, Přemysl Eric Janouch <p@janouch.name>
* Copyright (c) 2013 - 2020, Přemysl Eric Janouch <p@janouch.name>
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted.
@ -42,12 +42,15 @@ Generator *generator_new (const gchar *base, GError **error);
gboolean generator_finish (Generator *self, GError **error);
void generator_free (Generator *self);
void generator_begin_entry (Generator *self);
gboolean generator_write_type (Generator *self, gchar type, GError **error);
gboolean generator_write_raw (Generator *self,
gpointer data, gsize data_size, gboolean mark_end, GError **error);
gboolean generator_write_string (Generator *self,
const gchar *s, gboolean mark_end, GError **error);
void generator_begin_entry (Generator *self);
gboolean generator_write_fields (Generator *self,
const GList *fields, GError **error);
gboolean generator_finish_entry (Generator *self,
const gchar *word, GError **error);

270
src/transform.c Normal file
View File

@ -0,0 +1,270 @@
/*
* A tool to transform dictionaries by an external filter
*
* The external filter needs to process NUL-separated textual entries.
*
* Example: transform input.ifo output -- perl -p0e s/bullshit/soykaf/g
*
* Copyright (c) 2020, Přemysl Eric Janouch <p@janouch.name>
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <locale.h>
#include <glib.h>
#include <glib/gstdio.h>
#include <glib-unix.h>
#include <gio/gio.h>
#include "stardict.h"
#include "stardict-private.h"
#include "generator.h"
// Indices of the read and the write end within a pipe's descriptor pair
enum { PIPE_READ, PIPE_WRITE };
// --- Main --------------------------------------------------------------------
/// Refresh the progress indicator on standard output, but only when
/// the integer percentage differs from the one printed last time.
///
/// @param last_percent  in/out: the most recently printed percentage
/// @param iterator      iterator whose current offset marks the progress
/// @param total         total number of entries; nothing is printed for zero
static inline void
print_progress (gulong *last_percent, StardictIterator *iterator, gsize total)
{
	// Avoid dividing by zero should this get called for an empty dictionary
	if (!total)
		return;

	gulong percent =
		(gulong) stardict_iterator_get_offset (iterator) * 100 / total;
	if (percent == *last_percent)
		return;

	printf ("\r Writing entries... %3lu%%", percent);
	// The line isn't terminated by a newline, so a line-buffered stdout
	// would otherwise keep it hidden until the very end
	fflush (stdout);
	*last_percent = percent;
}
/// Feed the textual fields of all entries in the dictionary to the filter.
///
/// Only fields whose type is a lowercase ASCII letter are passed on, each of
/// them as data_size bytes of raw data (presumably including the terminating
/// NUL that the filter uses as a separator -- verify against the loader).
///
/// @param dict   the dictionary to read entries from
/// @param fd     file descriptor connected to the filter's input
/// @param error  set to the write() failure reason on error
/// @return TRUE when everything has been written, FALSE otherwise
static gboolean
write_to_filter (StardictDict *dict, gint fd, GError **error)
{
	StardictInfo *info = stardict_dict_get_info (dict);
	gsize n_words = stardict_info_get_word_count (info);

	StardictIterator *iterator = stardict_iterator_new (dict, 0);
	gulong last_percent = -1;
	gboolean success = TRUE;
	while (success && stardict_iterator_is_valid (iterator))
	{
		print_progress (&last_percent, iterator, n_words);

		StardictEntry *entry = stardict_iterator_get_entry (iterator);
		for (const GList *fields = stardict_entry_get_fields (entry);
			success && fields; fields = fields->next)
		{
			StardictEntryField *field = fields->data;
			if (!g_ascii_islower (field->type))
				continue;

			// write() may legitimately transfer less than requested without
			// setting errno, and may get interrupted by signals: keep going
			// until the whole field is out or a real error happens
			const gchar *p = field->data;
			gsize remaining = field->data_size;
			while (remaining)
			{
				ssize_t written = write (fd, p, remaining);
				if (written < 0)
				{
					if (errno == EINTR)
						continue;

					g_set_error (error, G_IO_ERROR, G_IO_ERROR_FAILED,
						"%s", strerror (errno));
					success = FALSE;
					break;
				}
				p += written;
				remaining -= written;
			}
		}

		// Release the entry on both the success and the failure path
		g_object_unref (entry);
		stardict_iterator_next (iterator);
	}

	// NOTE(review): assumes the iterator is a GObject just like the entries
	g_object_unref (iterator);

	// Terminate the progress line
	printf ("\n");
	return success;
}
/// Replace the textual fields of all entries with the filter's output
/// and write the resulting entries to the new dictionary.
///
/// The filtered data is consumed sequentially, one NUL-terminated string per
/// field whose type is a lowercase ASCII letter.  Other fields are written
/// out unchanged.
///
/// @param dict           the original dictionary to iterate over
/// @param generator      generator for the output dictionary
/// @param filtered_file  memory mapping of the filter's complete output
/// @param error          set when the filter output runs out or writing fails
/// @return TRUE when all entries have been written, FALSE otherwise
static gboolean
update_from_filter (StardictDict *dict, Generator *generator,
	GMappedFile *filtered_file, GError **error)
{
	// The contents are NULL for an empty file, which must neither be used
	// for pointer arithmetic nor be passed to memchr()
	gchar *filtered = g_mapped_file_get_contents (filtered_file);
	gchar *filtered_end = filtered
		? filtered + g_mapped_file_get_length (filtered_file) : NULL;

	StardictInfo *info = stardict_dict_get_info (dict);
	gsize n_words = stardict_info_get_word_count (info);

	StardictIterator *iterator = stardict_iterator_new (dict, 0);
	gulong last_percent = -1;
	gboolean success = TRUE;
	while (success && stardict_iterator_is_valid (iterator))
	{
		print_progress (&last_percent, iterator, n_words);

		StardictEntry *entry = stardict_iterator_get_entry (iterator);
		generator_begin_entry (generator);

		for (GList *fields = entry->fields; fields; fields = fields->next)
		{
			StardictEntryField *field = fields->data;
			if (!g_ascii_islower (field->type))
				continue;

			// Each filtered field must be terminated within the mapping
			gchar *end = filtered
				? memchr (filtered, 0, filtered_end - filtered) : NULL;
			if (!end)
			{
				g_set_error (error, G_IO_ERROR, G_IO_ERROR_FAILED,
					"filter seems to have ended too early");
				success = FALSE;
				break;
			}

			// Swap the field's data for a copy of the filtered string,
			// the size includes the terminating NUL
			g_free (field->data);
			field->data = g_strdup (filtered);
			field->data_size = end - filtered + 1;
			filtered = end + 1;
		}

		if (success
		 && (!generator_write_fields (generator, entry->fields, error)
			|| !generator_finish_entry (generator,
				stardict_iterator_get_word (iterator), error)))
			success = FALSE;

		// Release the entry on both the success and the failure path
		g_object_unref (entry);
		stardict_iterator_next (iterator);
	}

	// NOTE(review): assumes the iterator is a GObject just like the entries
	g_object_unref (iterator);

	// Terminate the progress line
	printf ("\n");
	return success;
}
// FIXME: copied from add-pronunciation.c, should merge it somewhere (utils?)
/// Make @a dest carry the same version and .ifo key values as @a src.
/// String values are duplicated, replacing and freeing whatever @a dest
/// held before; all other values are copied as gulong.  Ignores path.
static void
stardict_info_copy (StardictInfo *dest, const StardictInfo *src)
{
	dest->version = src->version;

	for (guint i = 0; i < _stardict_ifo_keys_length; i++)
	{
		const struct stardict_ifo_key *key = &_stardict_ifo_keys[i];
		if (key->type != IFO_STRING)
		{
			G_STRUCT_MEMBER (gulong, dest, key->offset) =
				G_STRUCT_MEMBER (gulong, src, key->offset);
			continue;
		}

		gchar **target = &G_STRUCT_MEMBER (gchar *, dest, key->offset);
		gchar *source = G_STRUCT_MEMBER (gchar *, src, key->offset);
		g_free (*target);
		// g_strdup() passes NULL through unchanged
		*target = g_strdup (source);
	}
}
int
main (int argc, char *argv[])
{
// The GLib help includes an ellipsis character, for some reason
(void) setlocale (LC_ALL, "");
GError *error = NULL;
GOptionContext *ctx = g_option_context_new
("input.ifo output-basename -- FILTER [ARG...]");
g_option_context_set_summary
(ctx, "Transform dictionaries using a filter program.");
g_option_context_set_description (ctx, "Test?");
if (!g_option_context_parse (ctx, &argc, &argv, &error))
{
g_printerr ("Error: option parsing failed: %s\n", error->message);
exit (EXIT_FAILURE);
}
if (argc < 3)
{
gchar *help = g_option_context_get_help (ctx, TRUE, FALSE);
g_printerr ("%s", help);
g_free (help);
exit (EXIT_FAILURE);
}
// GLib is bullshit, getopt_long() always correctly removes this
gint program_argv_start = 3;
if (!strcmp (argv[program_argv_start], "--"))
program_argv_start++;
g_option_context_free (ctx);
printf ("Loading the original dictionary...\n");
StardictDict *dict = stardict_dict_new (argv[1], &error);
if (!dict)
{
g_printerr ("Error: opening the dictionary failed: %s\n",
error->message);
exit (EXIT_FAILURE);
}
printf ("Filtering entries...\n");
gint child_in[2];
if (!g_unix_open_pipe (child_in, 0, &error))
g_error ("g_unix_open_pipe: %s", error->message);
FILE *child_out = tmpfile ();
if (!child_out)
g_error ("tmpfile: %s", strerror (errno));
GPid pid = -1;
if (!g_spawn_async_with_fds (NULL /* working_directory */,
argv + program_argv_start /* forward a part of ours */, NULL /* envp */,
G_SPAWN_SEARCH_PATH | G_SPAWN_DO_NOT_REAP_CHILD,
NULL /* child_setup */, NULL /* user_data */,
&pid, child_in[PIPE_READ], fileno (child_out), STDERR_FILENO, &error))
g_error ("g_spawn: %s", error->message);
if (!write_to_filter (dict, child_in[PIPE_WRITE], &error))
g_error ("write_to_filter: %s", error->message);
if (!g_close (child_in[PIPE_READ], &error)
|| !g_close (child_in[PIPE_WRITE], &error))
g_error ("g_close: %s", error->message);
printf ("Waiting for the filter to finish...\n");
int wstatus = errno = 0;
if (waitpid (pid, &wstatus, 0) < 1
|| !WIFEXITED (wstatus) || WEXITSTATUS (wstatus) > 0)
g_error ("Filter failed (%s, status %d)", strerror (errno), wstatus);
GMappedFile *filtered = g_mapped_file_new_from_fd (fileno (child_out),
FALSE /* writable */, &error);
if (!filtered)
g_error ("g_mapped_file_new_from_fd: %s", error->message);
printf ("Writing the new dictionary...\n");
Generator *generator = generator_new (argv[2], &error);
if (!generator)
{
g_printerr ("Error: failed to create the output dictionary: %s\n",
error->message);
exit (EXIT_FAILURE);
}
StardictInfo *info = generator->info;
stardict_info_copy (info, stardict_dict_get_info (dict));
// This gets incremented each time an entry is finished
info->word_count = 0;
if (!update_from_filter (dict, generator, filtered, &error)
|| !generator_finish (generator, &error))
{
g_printerr ("Error: failed to write the dictionary: %s\n",
error->message);
exit (EXIT_FAILURE);
}
g_mapped_file_unref (filtered);
fclose (child_out);
generator_free (generator);
g_object_unref (dict);
return 0;
}