From 04ae72158e8bb0bd11553eacece6901714c9d455 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C5=99emysl=20Janouch?=
Date: Wed, 8 May 2013 20:42:35 +0200
Subject: [PATCH] Add a WIP tool to add pronunciations to dicts

---
 Makefile            |   5 +-
 add-pronunciation.c | 262 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 266 insertions(+), 1 deletion(-)
 create mode 100644 add-pronunciation.c

diff --git a/Makefile b/Makefile
index 4ca0280..41e69c1 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ SHELL = /bin/sh
 
 pkgs = ncursesw glib-2.0 gio-2.0
 tests = test-stardict
-targets = sdcli $(tests)
+targets = sdcli add-pronunciation $(tests)
 
 CC = clang
 CFLAGS = -ggdb -std=gnu99 -Wall -Wextra -Wno-missing-field-initializers \
@@ -19,6 +19,9 @@ clean:
 sdcli: sdcli.o stardict.o
 	$(CC) $^ -o $@ $(LDFLAGS)
 
+add-pronunciation: add-pronunciation.o stardict.o
+	$(CC) $^ -o $@ $(LDFLAGS)
+
 test-stardict: test-stardict.o stardict.o
 	$(CC) $^ -o $@ $(LDFLAGS)
 
diff --git a/add-pronunciation.c b/add-pronunciation.c
new file mode 100644
index 0000000..45eae61
--- /dev/null
+++ b/add-pronunciation.c
@@ -0,0 +1,262 @@
+/*
+ * A tool to add eSpeak-generated pronunciation to dictionaries
+ *
+ * Here I use the `espeak' process rather than libespeak because of the GPL.
+ *
+ * Copyright (c) 2013, Přemysl Janouch
+ * All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <glib.h>
+#include <gio/gio.h>
+
+#include "stardict.h"
+
+
+// --- Pronunciation generator -------------------------------------------------
+
+typedef struct worker_data WorkerData;
+
+/** The state of one eSpeak process along with its reader and writer threads. */
+struct worker_data
+{
+	guint32 start_entry;                //!< The first entry to be processed
+	guint32 end_entry;                  //!< Past the last entry to be processed
+
+	/* Reader, writer */
+	GMutex *dict_mutex;                 //!< Locks the dictionary object
+
+	/* Reader */
+	GThread *main_thread;               //!< A handle to the reader thread
+	StardictDict *dict;                 //!< The dictionary object
+	/** Linked-list of pronunciation data; worker() stores a pointer to
+	 *  the next node in the first sizeof (gpointer) bytes of each node
+	 *  and the NUL-terminated output line right after it */
+	gpointer output;
+
+	GMutex *remaining_mutex;            //!< Locks the progress stats
+	GCond *remaining_cond;              //!< Signals a change in progress
+	guint32 remaining;                  //!< How many entries remain
+
+	/* Writer */
+	StardictIterator *iterator;         //!< Iterates over the dictionary
+	FILE *child_stdin;                  //!< Standard input of eSpeak
+};
+
+/** Writes to espeak's stdin.
+ *
+ * Sends the word of every entry from the iterator's current position up to
+ * (but not including) data->end_entry to eSpeak, one word per line.  Once
+ * finished, the iterator is released and the pipe is closed, which is what
+ * lets eSpeak see the end of its input.
+ *
+ * A failed write is reported as a fatal error through g_error().
+ *
+ * @param[in,out] data  The worker whose iterator and pipe are to be used
+ * @return The result of fclose() on the pipe, packed with GINT_TO_POINTER.
+ */
+static gpointer
+worker_writer (WorkerData *data)
+{
+	while (stardict_iterator_get_offset (data->iterator) != data->end_entry)
+	{
+		/* The dictionary object is shared among all the threads */
+		g_mutex_lock (data->dict_mutex);
+		const gchar *word = stardict_iterator_get_word (data->iterator);
+		g_mutex_unlock (data->dict_mutex);
+
+		stardict_iterator_next (data->iterator);
+		if (fprintf (data->child_stdin, "%s\n", word) < 0)
+			g_error ("write to eSpeak failed: %s", strerror (errno));
+	}
+
+	g_object_unref (data->iterator);
+	return GINT_TO_POINTER (fclose (data->child_stdin));
+}
+
+/** Reads from espeak's stdout.
+ *
+ * Spawns one eSpeak process, starts worker_writer() in another thread to feed
+ * it the words of the assigned range and collects one line of output for each
+ * of the data->remaining entries into the data->output list.
+ *
+ * Each list node is a single heap buffer: the first sizeof (gpointer) bytes
+ * hold the pointer to the next node (NULL in the last node), the rest is the
+ * NUL-terminated line as printed by eSpeak.
+ *
+ * Progress is published through data->remaining about once per 1024 entries,
+ * and always when the range has been finished.
+ *
+ * Failures to spawn eSpeak or to set up the pipes, as well as a premature end
+ * of eSpeak's output, are reported as fatal errors through g_error().
+ *
+ * @param[in,out] data  The range to process and the shared synchronisation
+ *                      objects; receives the output list
+ * @return The result of the writer thread, see worker_writer().
+ */
+static gpointer
+worker (WorkerData *data)
+{
+	/* Spawn eSpeak */
+	static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL };
+	gint child_in, child_out;
+
+	/* A GError pointer has to be NULL before it is passed to a function */
+	GError *error = NULL;
+	if (!g_spawn_async_with_pipes (NULL, cmdline, NULL,
+		G_SPAWN_SEARCH_PATH, NULL, NULL,
+		NULL, &child_in, &child_out, NULL, &error))
+		g_error ("g_spawn() failed: %s", error->message);
+
+	/* There is no point in going on without working pipes, and passing
+	 * a NULL stream to the stdio functions below would crash anyway */
+	data->child_stdin = fdopen (child_in, "wb");
+	if (!data->child_stdin)
+		g_error ("fdopen() failed: %s", strerror (errno));
+
+	FILE *child_stdout = fdopen (child_out, "rb");
+	if (!child_stdout)
+		g_error ("fdopen() failed: %s", strerror (errno));
+
+	/* Spawn a writer thread */
+	g_mutex_lock (data->dict_mutex);
+	data->iterator = stardict_iterator_new (data->dict, data->start_entry);
+	g_mutex_unlock (data->dict_mutex);
+
+	GThread *writer = g_thread_new ("write worker",
+		(GThreadFunc) worker_writer, data);
+
+	/* Read the output */
+	g_mutex_lock (data->remaining_mutex);
+	guint32 remaining = data->remaining;
+	g_mutex_unlock (data->remaining_mutex);
+
+	data->output = NULL;
+	gpointer *output_end = &data->output;
+	while (remaining)
+	{
+		/* A zeroed placeholder for the "next" pointer of the node */
+		static const gchar next[sizeof (gpointer)];
+		GString *s = g_string_new (NULL);
+		g_string_append_len (s, next, sizeof next);
+
+		gint c;
+		while ((c = fgetc (child_stdout)) != EOF && c != '\n')
+			g_string_append_c (s, c);
+		if (c == EOF)
+			g_error ("eSpeak process died too soon");
+
+		/* Link the node in; its head then becomes the new list tail */
+		gchar *translation = g_string_free (s, FALSE);
+		*output_end = translation;
+		output_end = (gpointer *) translation;
+
+		/* We limit progress reporting so that
+		 * the mutex doesn't spin like crazy */
+		if ((--remaining & 1023) != 0)
+			continue;
+
+		g_mutex_lock (data->remaining_mutex);
+		data->remaining = remaining;
+		g_cond_broadcast (data->remaining_cond);
+		g_mutex_unlock (data->remaining_mutex);
+	}
+
+	fclose (child_stdout);
+	return g_thread_join (writer);
+}
+
+// --- Main --------------------------------------------------------------------
+
+/** Program entry point.
+ *
+ * Usage: add-pronunciation [-N PROCESSES] input.ifo output.ifo
+ *
+ * Opens the input dictionary, splits its entries into one contiguous range
+ * per eSpeak process, runs a worker() thread for each of the ranges and shows
+ * how much work remains until all of them have finished.
+ *
+ * Writing the output dictionary has not been implemented yet.
+ *
+ * @return 0 on success; the program exits with EXIT_FAILURE on bad usage
+ *         or when the dictionary cannot be opened.
+ */
+int
+main (int argc, char *argv[])
+{
+	gint n_processes = 1;
+
+	GOptionEntry entries[] =
+	{
+		{ "processes", 'N', G_OPTION_FLAG_IN_MAIN,
+		  G_OPTION_ARG_INT, &n_processes,
+		  "the number of espeak processes run in parallel", "PROCESSES" },
+		{ NULL }
+	};
+
+	GError *error = NULL;
+	GOptionContext *ctx = g_option_context_new
+		("input.ifo output.ifo - add pronunciation to dictionaries");
+	g_option_context_add_main_entries (ctx, entries, NULL);
+	if (!g_option_context_parse (ctx, &argc, &argv, &error))
+	{
+		/* Errors go to the standard error stream like everywhere else */
+		g_printerr ("option parsing failed: %s\n", error->message);
+		exit (EXIT_FAILURE);
+	}
+
+	if (argc != 3)
+	{
+		gchar *help = g_option_context_get_help (ctx, TRUE, FALSE);
+		g_print ("%s", help);
+		g_free (help);
+		exit (EXIT_FAILURE);
+	}
+
+	StardictDict *dict = stardict_dict_new (argv[1], &error);
+	if (!dict)
+	{
+		g_printerr ("opening the dictionary failed: %s\n", error->message);
+		exit (EXIT_FAILURE);
+	}
+
+	gsize n_words = stardict_info_get_word_count
+		(stardict_dict_get_info (dict));
+
+	if (n_processes <= 0)
+	{
+		g_printerr ("Error: there must be at least one process\n");
+		exit (EXIT_FAILURE);
+	}
+
+	/* Every process should get at least 1024 entries to work on,
+	 * though there always has to be at least one process */
+	gsize max_processes = n_words / 1024;
+	if (!max_processes)
+		max_processes = 1;
+
+	if ((gsize) n_processes > max_processes)
+	{
+		n_processes = (gint) max_processes;
+		g_printerr ("Warning: too many processes, reducing to %d\n",
+			n_processes);
+	}
+
+	/* Spawn worker threads to generate pronunciations */
+	static GMutex dict_mutex;
+
+	static GMutex remaining_mutex;
+	static GCond remaining_cond;
+
+	WorkerData *data = g_new0 (WorkerData, n_processes);
+
+	gint i;
+	for (i = 0; i < n_processes; i++)
+	{
+		/* Split [0, n_words) into contiguous ranges; .end_entry is
+		 * exclusive, so the last range has to end at n_words in order
+		 * to also cover the last entry of the dictionary */
+		data[i].start_entry = (guint64) n_words * i / n_processes;
+		data[i].end_entry   = (guint64) n_words * (i + 1) / n_processes;
+
+		data[i].remaining = data[i].end_entry - data[i].start_entry;
+		data[i].remaining_mutex = &remaining_mutex;
+		data[i].remaining_cond = &remaining_cond;
+
+		data[i].dict = dict;
+		data[i].dict_mutex = &dict_mutex;
+
+		/* Each thread gets its own element and its own handle */
+		data[i].main_thread = g_thread_new ("worker",
+			(GThreadFunc) worker, &data[i]);
+	}
+
+	/* Loop while the threads still have some work to do and report status */
+	g_mutex_lock (&remaining_mutex);
+	for (;;)
+	{
+		gboolean all_finished = TRUE;
+		printf ("\rRetrieving pronunciation... ");
+		for (i = 0; i < n_processes; i++)
+		{
+			/* The share of entries that still remain to be processed;
+			 * an empty range has nothing left and must not be divided by */
+			guint32 total = data[i].end_entry - data[i].start_entry;
+			guint percent = 0;
+			if (total)
+				percent = (guint) ((guint64) data[i].remaining * 100 / total);
+
+			printf ("%3u%% ", percent);
+			if (data[i].remaining)
+				all_finished = FALSE;
+		}
+
+		/* The line isn't terminated, so it has to be flushed explicitly */
+		fflush (stdout);
+
+		if (all_finished)
+			break;
+		g_cond_wait (&remaining_cond, &remaining_mutex);
+	}
+	g_mutex_unlock (&remaining_mutex);
+
+	for (i = 0; i < n_processes; i++)
+		g_thread_join (data[i].main_thread);
+
+	// TODO after all processing is done, the program will go through the whole
+	// dictionary and put extended data entries into a new one.
+	StardictIterator *iterator = stardict_iterator_new (dict, 0);
+	while (stardict_iterator_is_valid (iterator))
+	{
+		// ...
+		stardict_iterator_next (iterator);
+	}
+	g_object_unref (iterator);
+
+	g_free (data);
+	g_option_context_free (ctx);
+	return 0;
+}