Add a WIP tool to add pronunciations to dicts
This commit is contained in:
parent
818ee593aa
commit
04ae72158e
5
Makefile
5
Makefile
|
@ -2,7 +2,7 @@ SHELL = /bin/sh
|
|||
|
||||
pkgs = ncursesw glib-2.0 gio-2.0
|
||||
tests = test-stardict
|
||||
targets = sdcli $(tests)
|
||||
targets = sdcli add-pronunciation $(tests)
|
||||
|
||||
CC = clang
|
||||
CFLAGS = -ggdb -std=gnu99 -Wall -Wextra -Wno-missing-field-initializers \
|
||||
|
@ -19,6 +19,9 @@ clean:
|
|||
sdcli: sdcli.o stardict.o
|
||||
$(CC) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
add-pronunciation: add-pronunciation.o stardict.o
|
||||
$(CC) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
test-stardict: test-stardict.o stardict.o
|
||||
$(CC) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
|
|
|
@ -0,0 +1,262 @@
|
|||
/*
|
||||
* A tool to add eSpeak-generated pronunciation to dictionaries
|
||||
*
|
||||
* Here I use the `espeak' process rather than libespeak because of the GPL.
|
||||
*
|
||||
* Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Permission to use, copy, modify, and/or distribute this software for any
|
||||
* purpose with or without fee is hereby granted, provided that the above
|
||||
* copyright notice and this permission notice appear in all copies.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
|
||||
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
||||
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include <glib.h>
|
||||
#include <gio/gio.h>
|
||||
|
||||
#include "stardict.h"
|
||||
|
||||
|
||||
// --- Pronunciation generator -------------------------------------------------
|
||||
|
||||
typedef struct worker_data WorkerData;
|
||||
|
||||
struct worker_data
|
||||
{
|
||||
guint32 start_entry; //! The first entry to be processed
|
||||
guint32 end_entry; //! Past the last entry to be processed
|
||||
|
||||
/* Reader, writer */
|
||||
GMutex *dict_mutex; //! Locks the dictionary object
|
||||
|
||||
/* Reader */
|
||||
GThread *main_thread; //! A handle to the reader thread
|
||||
StardictDict *dict; //! The dictionary object
|
||||
gpointer output; //! Linked-list of pronunciation data
|
||||
|
||||
GMutex *remaining_mutex; //! Locks the progress stats
|
||||
GCond *remaining_cond; //! Signals a change in progress
|
||||
guint32 remaining; //! How many entries remain
|
||||
|
||||
/* Writer */
|
||||
StardictIterator *iterator; //! Iterates over the dictionary
|
||||
FILE *child_stdin; //! Standard input of eSpeak
|
||||
};
|
||||
|
||||
/** Writes to espeak's stdin. */
|
||||
static gpointer
|
||||
worker_writer (WorkerData *data)
|
||||
{
|
||||
while (stardict_iterator_get_offset (data->iterator) != data->end_entry)
|
||||
{
|
||||
g_mutex_lock (data->dict_mutex);
|
||||
const gchar *word = stardict_iterator_get_word (data->iterator);
|
||||
g_mutex_unlock (data->dict_mutex);
|
||||
|
||||
stardict_iterator_next (data->iterator);
|
||||
if (fprintf (data->child_stdin, "%s\n", word) < 0)
|
||||
g_error ("write to eSpeak failed: %s", strerror (errno));
|
||||
}
|
||||
|
||||
g_object_unref (data->iterator);
|
||||
return GINT_TO_POINTER (fclose (data->child_stdin));
|
||||
}
|
||||
|
||||
/** Reads from espeak's stdout. */
|
||||
static gpointer
|
||||
worker (WorkerData *data)
|
||||
{
|
||||
/* Spawn eSpeak */
|
||||
static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL };
|
||||
gint child_in, child_out;
|
||||
|
||||
GError *error;
|
||||
if (!g_spawn_async_with_pipes (NULL, cmdline, NULL,
|
||||
G_SPAWN_SEARCH_PATH, NULL, NULL,
|
||||
NULL, &child_in, &child_out, NULL, &error))
|
||||
g_error ("g_spawn() failed: %s", error->message);
|
||||
|
||||
data->child_stdin = fdopen (child_in, "wb");
|
||||
if (!data->child_stdin)
|
||||
perror ("fdopen");
|
||||
|
||||
FILE *child_stdout = fdopen (child_out, "rb");
|
||||
if (!child_stdout)
|
||||
perror ("fdopen");
|
||||
|
||||
/* Spawn a writer thread */
|
||||
g_mutex_lock (data->dict_mutex);
|
||||
data->iterator = stardict_iterator_new (data->dict, data->start_entry);
|
||||
g_mutex_unlock (data->dict_mutex);
|
||||
|
||||
GThread *writer = g_thread_new ("write worker",
|
||||
(GThreadFunc) worker_writer, data);
|
||||
|
||||
/* Read the output */
|
||||
g_mutex_lock (data->remaining_mutex);
|
||||
guint32 remaining = data->remaining;
|
||||
g_mutex_unlock (data->remaining_mutex);
|
||||
|
||||
data->output = NULL;
|
||||
gpointer *output_end = &data->output;
|
||||
while (remaining)
|
||||
{
|
||||
static gchar next[sizeof (gpointer)];
|
||||
GString *s = g_string_new (NULL);
|
||||
g_string_append_len (s, next, sizeof next);
|
||||
|
||||
gint c;
|
||||
while ((c = fgetc (child_stdout)) != EOF && c != '\n')
|
||||
g_string_append_c (s, c);
|
||||
if (c == EOF)
|
||||
g_error ("eSpeak process died too soon");
|
||||
|
||||
gchar *translation = g_string_free (s, FALSE);
|
||||
*output_end = translation;
|
||||
output_end = (gpointer *) translation;
|
||||
|
||||
/* We limit progress reporting so that
|
||||
* the mutex doesn't spin like crazy */
|
||||
if ((--remaining & 1023) != 0)
|
||||
continue;
|
||||
|
||||
g_mutex_lock (data->remaining_mutex);
|
||||
data->remaining = remaining;
|
||||
g_cond_broadcast (data->remaining_cond);
|
||||
g_mutex_unlock (data->remaining_mutex);
|
||||
}
|
||||
|
||||
fclose (child_stdout);
|
||||
return g_thread_join (writer);
|
||||
}
|
||||
|
||||
// --- Main --------------------------------------------------------------------
|
||||
|
||||
int
|
||||
main (int argc, char *argv[])
|
||||
{
|
||||
gint n_processes = 1;
|
||||
|
||||
GOptionEntry entries[] =
|
||||
{
|
||||
{ "processes", 'N', G_OPTION_FLAG_IN_MAIN,
|
||||
G_OPTION_ARG_INT, &n_processes,
|
||||
"the number of espeak processes run in parallel", "PROCESSES" },
|
||||
{ NULL }
|
||||
};
|
||||
|
||||
GError *error = NULL;
|
||||
GOptionContext *ctx = g_option_context_new
|
||||
("input.ifo output.ifo - add pronunciation to dictionaries");
|
||||
g_option_context_add_main_entries (ctx, entries, NULL);
|
||||
if (!g_option_context_parse (ctx, &argc, &argv, &error))
|
||||
{
|
||||
g_print ("option parsing failed: %s\n", error->message);
|
||||
exit (EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if (argc != 3)
|
||||
{
|
||||
gchar *help = g_option_context_get_help (ctx, TRUE, FALSE);
|
||||
g_print ("%s", help);
|
||||
g_free (help);
|
||||
exit (EXIT_FAILURE);
|
||||
}
|
||||
|
||||
StardictDict *dict = stardict_dict_new (argv[1], &error);
|
||||
if (!dict)
|
||||
{
|
||||
g_printerr ("opening the dictionary failed: %s\n", error->message);
|
||||
exit (EXIT_FAILURE);
|
||||
}
|
||||
|
||||
gsize n_words = stardict_info_get_word_count
|
||||
(stardict_dict_get_info (dict));
|
||||
|
||||
if (n_processes <= 0)
|
||||
{
|
||||
g_printerr ("Error: there must be at least one process\n");
|
||||
exit (EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if ((gsize) n_processes > n_words * 1024)
|
||||
{
|
||||
n_processes = n_words / 1024;
|
||||
if (!n_processes)
|
||||
n_processes = 1;
|
||||
g_printerr ("Warning: too many processes, reducing to %d\n",
|
||||
n_processes);
|
||||
}
|
||||
|
||||
/* Spawn worker threads to generate pronunciations */
|
||||
static GMutex dict_mutex;
|
||||
|
||||
static GMutex remaining_mutex;
|
||||
static GCond remaining_cond;
|
||||
|
||||
WorkerData *data = g_alloca (sizeof *data * n_processes);
|
||||
|
||||
gint i;
|
||||
for (i = 0; i < n_processes; i++)
|
||||
{
|
||||
data[i].start_entry = (n_words - 1) * i / n_processes;
|
||||
data[i].end_entry = (n_words - 1) * (i + 1) / n_processes;
|
||||
|
||||
data[i].remaining = data[i].end_entry - data[i].start_entry;
|
||||
data[i].remaining_mutex = &remaining_mutex;
|
||||
data[i].remaining_cond = &remaining_cond;
|
||||
|
||||
data[i].dict = dict;
|
||||
data[i].dict_mutex = &dict_mutex;
|
||||
|
||||
data->main_thread = g_thread_new ("worker", (GThreadFunc) worker, data);
|
||||
}
|
||||
|
||||
/* Loop while the threads still have some work to do and report status */
|
||||
g_mutex_lock (&remaining_mutex);
|
||||
for (;;)
|
||||
{
|
||||
gboolean all_finished = TRUE;
|
||||
printf ("\rRetrieving pronunciation... ");
|
||||
for (i = 0; i < n_processes; i++)
|
||||
{
|
||||
printf ("%3u%% ", data[i].remaining * 100
|
||||
/ (data[i].end_entry - data[i].start_entry));
|
||||
if (data[i].remaining)
|
||||
all_finished = FALSE;
|
||||
}
|
||||
|
||||
if (all_finished)
|
||||
break;
|
||||
g_cond_wait (&remaining_cond, &remaining_mutex);
|
||||
}
|
||||
g_mutex_unlock (&remaining_mutex);
|
||||
|
||||
for (i = 0; i < n_processes; i++)
|
||||
g_thread_join (data[i].main_thread);
|
||||
|
||||
// TODO after all processing is done, the program will go through the whole
|
||||
// dictionary and put extended data entries into a new one.
|
||||
StardictIterator *iterator = stardict_iterator_new (dict, 0);
|
||||
while (stardict_iterator_is_valid (iterator))
|
||||
{
|
||||
// ...
|
||||
stardict_iterator_next (iterator);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue