Add a WIP tool to add pronunciations to dicts

2013-05-08 20:42:35 +02:00 · 2013-05-08 20:42:35 +02:00 · 04ae72158e
parent 818ee593aa
commit 04ae72158e
2 changed files with 266 additions and 1 deletions
--- a/5
+++ b/5
@ -2,7 +2,7 @@ SHELL = /bin/sh
 pkgs = ncursesw glib-2.0 gio-2.0
 tests = test-stardict
-targets = sdcli $(tests)
+targets = sdcli add-pronunciation $(tests)
 CC = clang
 CFLAGS = -ggdb -std=gnu99 -Wall -Wextra -Wno-missing-field-initializers \
@ -19,6 +19,9 @@ clean:
 sdcli: sdcli.o stardict.o
 	$(CC) $^ -o $@ $(LDFLAGS)
 add-pronunciation: add-pronunciation.o stardict.o
 	$(CC) $^ -o $@ $(LDFLAGS)
 test-stardict: test-stardict.o stardict.o
 	$(CC) $^ -o $@ $(LDFLAGS)
--- a/add-pronunciation.c
+++ b/add-pronunciation.c
@ -0,0 +1,262 @@
 /*
 * A tool to add eSpeak-generated pronunciation to dictionaries
 *
 * Here I use the `espeak' process rather than libespeak because of the GPL.
 *
 * Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
 * All rights reserved.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *
 */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
 #include <glib.h>
 #include <gio/gio.h>
 #include "stardict.h"
 // --- Pronunciation generator -------------------------------------------------
 /* State for one eSpeak worker: used by worker() (the reader) and by the
  * worker_writer() thread that worker() starts. */
 typedef struct worker_data WorkerData;
 struct worker_data
 {
 	guint32 start_entry;                //! The first entry to be processed
 	guint32 end_entry;                  //! Past the last entry to be processed
 	/* Reader, writer */
 	GMutex *dict_mutex;                 //! Locks the dictionary object
 	/* Reader */
 	GThread *main_thread;               //! A handle to the reader thread
 	StardictDict *dict;                 //! The dictionary object
 	/* worker() builds this list in-band: each node is a heap block whose
 	 * first sizeof (gpointer) bytes hold the pointer to the next node
 	 * (NULL for the last one), followed by the NUL-terminated line that
 	 * was read from eSpeak. */
 	gpointer output;                    //! Linked-list of pronunciation data
 	GMutex *remaining_mutex;            //! Locks the progress stats
 	GCond *remaining_cond;              //! Signals a change in progress
 	/* worker() publishes its local countdown here only when the count is
 	 * a multiple of 1024 (which includes zero, i.e. completion). */
 	guint32 remaining;                  //! How many entries remain
 	/* Writer */
 	StardictIterator *iterator;         //! Iterates over the dictionary
 	FILE *child_stdin;                  //! Standard input of eSpeak
 };
 /**
  * Writes to espeak's stdin.
  *
  * Feeds the child one headword per line, starting at wherever data->iterator
  * currently points and stopping once its offset equals data->end_entry.
  * A failed write aborts the program through g_error().
  *
  * Releases the iterator and closes the pipe when done; the fclose() status
  * is returned packed into a pointer.
  */
 static gpointer
 worker_writer (WorkerData *data)
 {
 	StardictIterator *it = data->iterator;
 	FILE *to_espeak = data->child_stdin;
 	for (;;)
 	{
 		if (stardict_iterator_get_offset (it) == data->end_entry)
 			break;
 		/* Only the word lookup is done under the dictionary lock */
 		g_mutex_lock (data->dict_mutex);
 		const gchar *headword = stardict_iterator_get_word (it);
 		g_mutex_unlock (data->dict_mutex);
 		stardict_iterator_next (it);
 		gint written = fprintf (to_espeak, "%s\n", headword);
 		if (written < 0)
 			g_error ("write to eSpeak failed: %s", strerror (errno));
 	}
 	g_object_unref (it);
 	gint close_status = fclose (to_espeak);
 	return GINT_TO_POINTER (close_status);
 }
 /**
  * Reads from espeak's stdout.
  *
  * Spawns an `espeak --ipa -q' child, starts a worker_writer() thread that
  * feeds it words beginning at data->start_entry, and reads one line of
  * output per entry counted in data->remaining.  The lines are chained into
  * the data->output list; progress is published through data->remaining and
  * data->remaining_cond.
  *
  * Any failure (spawn, fdopen, premature EOF) aborts through g_error().
  *
  * @return The writer thread's return value, i.e. its fclose() status
  *         packed into a pointer.
  */
 static gpointer
 worker (WorkerData *data)
 {
 	/* Spawn eSpeak */
 	static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL };
 	gint child_in, child_out;
 	/* GLib requires the GError location to hold NULL before the call */
 	GError *error = NULL;
 	if (!g_spawn_async_with_pipes (NULL, cmdline, NULL,
 		G_SPAWN_SEARCH_PATH, NULL, NULL,
 		NULL, &child_in, &child_out, NULL, &error))
 		g_error ("g_spawn() failed: %s", error->message);
 	/* Both streams are used unconditionally below, so a failed fdopen()
 	 * has to be fatal rather than merely reported */
 	data->child_stdin = fdopen (child_in, "wb");
 	if (!data->child_stdin)
 		g_error ("fdopen() failed: %s", strerror (errno));
 	FILE *child_stdout = fdopen (child_out, "rb");
 	if (!child_stdout)
 		g_error ("fdopen() failed: %s", strerror (errno));
 	/* Spawn a writer thread */
 	g_mutex_lock (data->dict_mutex);
 	data->iterator = stardict_iterator_new (data->dict, data->start_entry);
 	g_mutex_unlock (data->dict_mutex);
 	GThread *writer = g_thread_new ("write worker",
 		(GThreadFunc) worker_writer, data);
 	/* Read the output */
 	g_mutex_lock (data->remaining_mutex);
 	guint32 remaining = data->remaining;
 	g_mutex_unlock (data->remaining_mutex);
 	data->output = NULL;
 	gpointer *output_end = &data->output;
 	while (remaining)
 	{
 		/* Zero-filled placeholder that reserves room for the "next" link
 		 * at the very start of each node's buffer */
 		static gchar next[sizeof (gpointer)];
 		GString *s = g_string_new (NULL);
 		g_string_append_len (s, next, sizeof next);
 		gint c;
 		while ((c = fgetc (child_stdout)) != EOF && c != '\n')
 			g_string_append_c (s, c);
 		if (c == EOF)
 			g_error ("eSpeak process died too soon");
 		/* Link the new node in: the previous link now points at this
 		 * buffer, and this buffer's leading bytes become the next link */
 		gchar *translation = g_string_free (s, FALSE);
 		*output_end = translation;
 		output_end = (gpointer *) translation;
 		/* We limit progress reporting so that
 		 * the mutex doesn't spin like crazy */
 		if ((--remaining & 1023) != 0)
 			continue;
 		g_mutex_lock (data->remaining_mutex);
 		data->remaining = remaining;
 		g_cond_broadcast (data->remaining_cond);
 		g_mutex_unlock (data->remaining_mutex);
 	}
 	fclose (child_stdout);
 	return g_thread_join (writer);
 }
 // --- Main --------------------------------------------------------------------
 int
 main (int argc, char *argv[])
 {
 	gint n_processes = 1;
 	GOptionEntry entries[] =
 	{
 		{ "processes", 'N', G_OPTION_FLAG_IN_MAIN,
 		  G_OPTION_ARG_INT, &n_processes,
 		  "the number of espeak processes run in parallel", "PROCESSES" },
 		{ NULL }
 	};
 	GError *error = NULL;
 	GOptionContext *ctx = g_option_context_new
 		("input.ifo output.ifo - add pronunciation to dictionaries");
 	g_option_context_add_main_entries (ctx, entries, NULL);
 	if (!g_option_context_parse (ctx, &argc, &argv, &error))
 	{
 		g_print ("option parsing failed: %s\n", error->message);
 		exit (EXIT_FAILURE);
 	}
 	if (argc != 3)
 	{
 		gchar *help = g_option_context_get_help (ctx, TRUE, FALSE);
 		g_print ("%s", help);
 		g_free (help);
 		exit (EXIT_FAILURE);
 	}
 	StardictDict *dict = stardict_dict_new (argv[1], &error);
 	if (!dict)
 	{
 		g_printerr ("opening the dictionary failed: %s\n", error->message);
 		exit (EXIT_FAILURE);
 	}
 	gsize n_words = stardict_info_get_word_count
 		(stardict_dict_get_info (dict));
 	if (n_processes <= 0)
 	{
 		g_printerr ("Error: there must be at least one process\n");
 		exit (EXIT_FAILURE);
 	}
 	if ((gsize) n_processes > n_words * 1024)
 	{
 		n_processes = n_words / 1024;
 		if (!n_processes)
 			n_processes = 1;
 		g_printerr ("Warning: too many processes, reducing to %d\n",
 			n_processes);
 	}
 	/* Spawn worker threads to generate pronunciations */
 	static GMutex dict_mutex;
 	static GMutex remaining_mutex;
 	static GCond remaining_cond;
 	WorkerData *data = g_alloca (sizeof *data * n_processes);
 	gint i;
 	for (i = 0; i < n_processes; i++)
 	{
 		data[i].start_entry = (n_words - 1) *  i      / n_processes;
 		data[i].end_entry   = (n_words - 1) * (i + 1) / n_processes;
 		data[i].remaining = data[i].end_entry - data[i].start_entry;
 		data[i].remaining_mutex = &remaining_mutex;
 		data[i].remaining_cond = &remaining_cond;
 		data[i].dict = dict;
 		data[i].dict_mutex = &dict_mutex;
 		data->main_thread = g_thread_new ("worker", (GThreadFunc) worker, data);
 	}
 	/* Loop while the threads still have some work to do and report status */
 	g_mutex_lock (&remaining_mutex);
 	for (;;)
 	{
 		gboolean all_finished = TRUE;
 		printf ("\rRetrieving pronunciation... ");
 		for (i = 0; i < n_processes; i++)
 		{
 			printf ("%3u%% ", data[i].remaining * 100
 				/ (data[i].end_entry - data[i].start_entry));
 			if (data[i].remaining)
 				all_finished = FALSE;
 		}
 		if (all_finished)
 			break;
 		g_cond_wait (&remaining_cond, &remaining_mutex);
 	}
 	g_mutex_unlock (&remaining_mutex);
 	for (i = 0; i < n_processes; i++)
 		g_thread_join (data[i].main_thread);
 	// TODO after all processing is done, the program will go through the whole
 	//      dictionary and put extended data entries into a new one.
 	StardictIterator *iterator = stardict_iterator_new (dict, 0);
 	while (stardict_iterator_is_valid (iterator))
 	{
 		// ...
 		stardict_iterator_next (iterator);
 	}
 	return 0;
 }