Add a WIP tool to add pronunciations to dicts

This commit is contained in:
Přemysl Eric Janouch 2013-05-08 20:42:35 +02:00
parent 818ee593aa
commit 04ae72158e
2 changed files with 266 additions and 1 deletions

View File

@ -2,7 +2,7 @@ SHELL = /bin/sh
pkgs = ncursesw glib-2.0 gio-2.0 pkgs = ncursesw glib-2.0 gio-2.0
tests = test-stardict tests = test-stardict
targets = sdcli $(tests) targets = sdcli add-pronunciation $(tests)
CC = clang CC = clang
CFLAGS = -ggdb -std=gnu99 -Wall -Wextra -Wno-missing-field-initializers \ CFLAGS = -ggdb -std=gnu99 -Wall -Wextra -Wno-missing-field-initializers \
@ -19,6 +19,9 @@ clean:
sdcli: sdcli.o stardict.o sdcli: sdcli.o stardict.o
$(CC) $^ -o $@ $(LDFLAGS) $(CC) $^ -o $@ $(LDFLAGS)
add-pronunciation: add-pronunciation.o stardict.o
$(CC) $^ -o $@ $(LDFLAGS)
test-stardict: test-stardict.o stardict.o test-stardict: test-stardict.o stardict.o
$(CC) $^ -o $@ $(LDFLAGS) $(CC) $^ -o $@ $(LDFLAGS)

262
add-pronunciation.c Normal file
View File

@ -0,0 +1,262 @@
/*
* A tool to add eSpeak-generated pronunciation to dictionaries
*
* Here I use the `espeak' process rather than libespeak because of the GPL.
*
* Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
* All rights reserved.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <glib.h>
#include <gio/gio.h>
#include "stardict.h"
// --- Pronunciation generator -------------------------------------------------
/**
 * Per-worker state.  One instance describes a contiguous slice of dictionary
 * entries together with the eSpeak process plumbing used to handle it.  It is
 * touched by the reader thread (worker), its writer thread (worker_writer)
 * and by main(), which polls the progress counter.
 */
typedef struct worker_data
{
	guint32 start_entry;            //! Index of the first entry in the slice
	guint32 end_entry;              //! Index one past the last entry in the slice

	/* Shared by the reader and the writer */

	GMutex *dict_mutex;             //! Guards access to the dictionary object

	/* Reader side */

	GThread *main_thread;           //! Handle to the reader thread
	StardictDict *dict;             //! Dictionary the words are taken from
	gpointer output;                //! Head of the linked list of pronunciations

	GMutex *remaining_mutex;        //! Guards the progress counter below
	GCond *remaining_cond;          //! Broadcast when the progress counter changes
	guint32 remaining;              //! Number of entries not yet read back

	/* Writer side */

	StardictIterator *iterator;     //! Walks the dictionary entries of the slice
	FILE *child_stdin;              //! Stream connected to eSpeak's standard input
} WorkerData;
/**
 * Writer thread body: feeds words to eSpeak's standard input.
 *
 * Starting from wherever data->iterator currently points, every word is
 * written on a line of its own until the iterator reaches data->end_entry.
 * Afterwards the iterator is released and the stream is closed, which lets
 * eSpeak see the end of its input.
 *
 * @param data  The worker state; uses iterator, end_entry, dict_mutex
 *              and child_stdin.
 * @return The fclose() status of child_stdin, packed with GINT_TO_POINTER.
 */
static gpointer
worker_writer (WorkerData *data)
{
	for (;;)
	{
		if (stardict_iterator_get_offset (data->iterator) == data->end_entry)
			break;

		/* Only the word lookup is done under the dictionary lock */
		g_mutex_lock (data->dict_mutex);
		const gchar *headword = stardict_iterator_get_word (data->iterator);
		g_mutex_unlock (data->dict_mutex);

		stardict_iterator_next (data->iterator);

		gint written = fprintf (data->child_stdin, "%s\n", headword);
		if (written < 0)
			g_error ("write to eSpeak failed: %s", strerror (errno));
	}

	g_object_unref (data->iterator);

	gint close_status = fclose (data->child_stdin);
	return GINT_TO_POINTER (close_status);
}
/**
 * Reader thread body: runs one eSpeak process over a slice of the dictionary
 * and collects what it prints.
 *
 * The function spawns `espeak --ipa -q' with pipes attached, creates an
 * iterator at data->start_entry, starts worker_writer() in a separate thread
 * to feed the words in, and then reads one line of output for each of the
 * data->remaining entries.
 *
 * The lines are stored in data->output as an intrusive singly-linked list:
 * every node is one heap buffer whose first sizeof (gpointer) bytes hold the
 * link to the next node, immediately followed by the NUL-terminated text.
 * The link of the final node is left as zero bytes.
 *
 * Any failure (spawning, fdopen(), premature EOF) is fatal through g_error().
 *
 * @param data  The worker state describing the slice to be processed.
 * @return Whatever the writer thread returned, i.e. the result of
 *         worker_writer().
 */
static gpointer
worker (WorkerData *data)
{
	/* Spawn eSpeak */
	static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL };
	gint child_in, child_out;

	/* GLib requires the GError pointer to be NULL before it is passed in */
	GError *error = NULL;
	if (!g_spawn_async_with_pipes (NULL, cmdline, NULL,
		G_SPAWN_SEARCH_PATH, NULL, NULL,
		NULL, &child_in, &child_out, NULL, &error))
		g_error ("g_spawn() failed: %s", error->message);

	/* Without both streams nothing below can work, so give up right away
	 * rather than dereference a NULL stream later on */
	data->child_stdin = fdopen (child_in, "wb");
	if (!data->child_stdin)
		g_error ("fdopen() failed: %s", strerror (errno));

	FILE *child_stdout = fdopen (child_out, "rb");
	if (!child_stdout)
		g_error ("fdopen() failed: %s", strerror (errno));

	/* Spawn a writer thread */
	g_mutex_lock (data->dict_mutex);
	data->iterator = stardict_iterator_new (data->dict, data->start_entry);
	g_mutex_unlock (data->dict_mutex);

	GThread *writer = g_thread_new ("write worker",
		(GThreadFunc) worker_writer, data);

	/* Read the output */
	g_mutex_lock (data->remaining_mutex);
	guint32 remaining = data->remaining;
	g_mutex_unlock (data->remaining_mutex);

	data->output = NULL;
	gpointer *output_end = &data->output;
	while (remaining)
	{
		/* Zero-filled placeholder that reserves room for the link
		 * to the next node at the very beginning of each buffer */
		static gchar next[sizeof (gpointer)];

		GString *s = g_string_new (NULL);
		g_string_append_len (s, next, sizeof next);

		gint c;
		while ((c = fgetc (child_stdout)) != EOF && c != '\n')
			g_string_append_c (s, c);
		if (c == EOF)
			g_error ("eSpeak process died too soon");

		/* Append the buffer to the list; its leading bytes become
		 * the slot the next buffer will be linked through */
		gchar *translation = g_string_free (s, FALSE);
		*output_end = translation;
		output_end = (gpointer *) translation;

		/* We limit progress reporting so that
		 * the mutex doesn't spin like crazy */
		if ((--remaining & 1023) != 0)
			continue;

		g_mutex_lock (data->remaining_mutex);
		data->remaining = remaining;
		g_cond_broadcast (data->remaining_cond);
		g_mutex_unlock (data->remaining_mutex);
	}

	fclose (child_stdout);
	return g_thread_join (writer);
}
// --- Main --------------------------------------------------------------------
/**
 * Program entry point.
 *
 * Usage: add-pronunciation [-N PROCESSES] input.ifo output.ifo
 *
 * Opens the input dictionary, splits its entries into PROCESSES contiguous
 * slices, runs one worker thread (and thus one eSpeak process) per slice,
 * and prints the progress until all of them are finished.  Writing the
 * output dictionary is not implemented yet.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE on bad options,
 *         a wrong number of arguments, or when the dictionary can't be opened.
 */
int
main (int argc, char *argv[])
{
	gint n_processes = 1;

	GOptionEntry entries[] =
	{
		{ "processes", 'N', G_OPTION_FLAG_IN_MAIN,
		  G_OPTION_ARG_INT, &n_processes,
		  "the number of espeak processes run in parallel", "PROCESSES" },
		{ NULL }
	};

	GError *error = NULL;
	GOptionContext *ctx = g_option_context_new
		("input.ifo output.ifo - add pronunciation to dictionaries");
	g_option_context_add_main_entries (ctx, entries, NULL);
	if (!g_option_context_parse (ctx, &argc, &argv, &error))
	{
		g_print ("option parsing failed: %s\n", error->message);
		exit (EXIT_FAILURE);
	}

	if (argc != 3)
	{
		gchar *help = g_option_context_get_help (ctx, TRUE, FALSE);
		g_print ("%s", help);
		g_free (help);
		exit (EXIT_FAILURE);
	}

	/* The context is only needed for parsing and for the help text */
	g_option_context_free (ctx);

	StardictDict *dict = stardict_dict_new (argv[1], &error);
	if (!dict)
	{
		g_printerr ("opening the dictionary failed: %s\n", error->message);
		exit (EXIT_FAILURE);
	}

	gsize n_words = stardict_info_get_word_count
		(stardict_dict_get_info (dict));

	if (n_processes <= 0)
	{
		g_printerr ("Error: there must be at least one process\n");
		exit (EXIT_FAILURE);
	}

	/* Give every process at least 1024 words to work on, though always
	 * keep one process around, even for tiny dictionaries */
	gsize max_processes = n_words / 1024;
	if (!max_processes)
		max_processes = 1;
	if ((gsize) n_processes > max_processes)
	{
		n_processes = (gint) max_processes;
		g_printerr ("Warning: too many processes, reducing to %d\n",
			n_processes);
	}

	/* Spawn worker threads to generate pronunciations */
	static GMutex dict_mutex;

	static GMutex remaining_mutex;
	static GCond remaining_cond;

	WorkerData *data = g_alloca (sizeof *data * n_processes);

	gint i;
	for (i = 0; i < n_processes; i++)
	{
		/* Slices are half-open [start_entry, end_entry) and together
		 * they have to cover all of the entries, including the last one */
		data[i].start_entry = n_words * i / n_processes;
		data[i].end_entry   = n_words * (i + 1) / n_processes;

		data[i].remaining = data[i].end_entry - data[i].start_entry;
		data[i].remaining_mutex = &remaining_mutex;
		data[i].remaining_cond = &remaining_cond;

		data[i].dict = dict;
		data[i].dict_mutex = &dict_mutex;

		/* Each thread gets its own slot rather than sharing the first one */
		data[i].main_thread = g_thread_new ("worker",
			(GThreadFunc) worker, &data[i]);
	}

	/* Loop while the threads still have some work to do and report status */
	g_mutex_lock (&remaining_mutex);
	for (;;)
	{
		gboolean all_finished = TRUE;
		printf ("\rRetrieving pronunciation... ");
		for (i = 0; i < n_processes; i++)
		{
			/* An empty slice has nothing remaining; avoid dividing by zero,
			 * and use 64 bits so that the multiplication can't overflow */
			guint32 total = data[i].end_entry - data[i].start_entry;
			guint percent = 0;
			if (total)
				percent = (guint) ((guint64) data[i].remaining * 100 / total);

			printf ("%3u%% ", percent);
			if (data[i].remaining)
				all_finished = FALSE;
		}

		/* The line isn't terminated, so it has to be flushed explicitly
		 * in order to show up on a line-buffered terminal */
		fflush (stdout);

		if (all_finished)
			break;

		g_cond_wait (&remaining_cond, &remaining_mutex);
	}
	g_mutex_unlock (&remaining_mutex);

	for (i = 0; i < n_processes; i++)
		g_thread_join (data[i].main_thread);

	// TODO after all processing is done, the program will go through the whole
	//      dictionary and put extended data entries into a new one.
	StardictIterator *iterator = stardict_iterator_new (dict, 0);
	while (stardict_iterator_is_valid (iterator))
	{
		// ...
		stardict_iterator_next (iterator);
	}
	g_object_unref (iterator);

	return 0;
}