Add a WIP tool to add pronunciations to dicts
This commit is contained in:
parent
818ee593aa
commit
04ae72158e
5
Makefile
5
Makefile
|
@ -2,7 +2,7 @@ SHELL = /bin/sh
|
||||||
|
|
||||||
pkgs = ncursesw glib-2.0 gio-2.0
|
pkgs = ncursesw glib-2.0 gio-2.0
|
||||||
tests = test-stardict
|
tests = test-stardict
|
||||||
targets = sdcli $(tests)
|
targets = sdcli add-pronunciation $(tests)
|
||||||
|
|
||||||
CC = clang
|
CC = clang
|
||||||
CFLAGS = -ggdb -std=gnu99 -Wall -Wextra -Wno-missing-field-initializers \
|
CFLAGS = -ggdb -std=gnu99 -Wall -Wextra -Wno-missing-field-initializers \
|
||||||
|
@ -19,6 +19,9 @@ clean:
|
||||||
sdcli: sdcli.o stardict.o
|
sdcli: sdcli.o stardict.o
|
||||||
$(CC) $^ -o $@ $(LDFLAGS)
|
$(CC) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
add-pronunciation: add-pronunciation.o stardict.o
|
||||||
|
$(CC) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
test-stardict: test-stardict.o stardict.o
|
test-stardict: test-stardict.o stardict.o
|
||||||
$(CC) $^ -o $@ $(LDFLAGS)
|
$(CC) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,262 @@
|
||||||
|
/*
|
||||||
|
* A tool to add eSpeak-generated pronunciation to dictionaries
|
||||||
|
*
|
||||||
|
* Here I use the `espeak' process rather than libespeak because of the GPL.
|
||||||
|
*
|
||||||
|
* Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
* purpose with or without fee is hereby granted, provided that the above
|
||||||
|
* copyright notice and this permission notice appear in all copies.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||||
|
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||||
|
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||||
|
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||||
|
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
|
||||||
|
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
||||||
|
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <errno.h>
|
||||||
|
|
||||||
|
#include <glib.h>
|
||||||
|
#include <gio/gio.h>
|
||||||
|
|
||||||
|
#include "stardict.h"
|
||||||
|
|
||||||
|
|
||||||
|
// --- Pronunciation generator -------------------------------------------------
|
||||||
|
|
||||||
|
typedef struct worker_data WorkerData;
|
||||||
|
|
||||||
|
struct worker_data
|
||||||
|
{
|
||||||
|
guint32 start_entry; //! The first entry to be processed
|
||||||
|
guint32 end_entry; //! Past the last entry to be processed
|
||||||
|
|
||||||
|
/* Reader, writer */
|
||||||
|
GMutex *dict_mutex; //! Locks the dictionary object
|
||||||
|
|
||||||
|
/* Reader */
|
||||||
|
GThread *main_thread; //! A handle to the reader thread
|
||||||
|
StardictDict *dict; //! The dictionary object
|
||||||
|
gpointer output; //! Linked-list of pronunciation data
|
||||||
|
|
||||||
|
GMutex *remaining_mutex; //! Locks the progress stats
|
||||||
|
GCond *remaining_cond; //! Signals a change in progress
|
||||||
|
guint32 remaining; //! How many entries remain
|
||||||
|
|
||||||
|
/* Writer */
|
||||||
|
StardictIterator *iterator; //! Iterates over the dictionary
|
||||||
|
FILE *child_stdin; //! Standard input of eSpeak
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Writes to espeak's stdin. */
|
||||||
|
static gpointer
|
||||||
|
worker_writer (WorkerData *data)
|
||||||
|
{
|
||||||
|
while (stardict_iterator_get_offset (data->iterator) != data->end_entry)
|
||||||
|
{
|
||||||
|
g_mutex_lock (data->dict_mutex);
|
||||||
|
const gchar *word = stardict_iterator_get_word (data->iterator);
|
||||||
|
g_mutex_unlock (data->dict_mutex);
|
||||||
|
|
||||||
|
stardict_iterator_next (data->iterator);
|
||||||
|
if (fprintf (data->child_stdin, "%s\n", word) < 0)
|
||||||
|
g_error ("write to eSpeak failed: %s", strerror (errno));
|
||||||
|
}
|
||||||
|
|
||||||
|
g_object_unref (data->iterator);
|
||||||
|
return GINT_TO_POINTER (fclose (data->child_stdin));
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Reads from espeak's stdout. */
|
||||||
|
static gpointer
|
||||||
|
worker (WorkerData *data)
|
||||||
|
{
|
||||||
|
/* Spawn eSpeak */
|
||||||
|
static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL };
|
||||||
|
gint child_in, child_out;
|
||||||
|
|
||||||
|
GError *error;
|
||||||
|
if (!g_spawn_async_with_pipes (NULL, cmdline, NULL,
|
||||||
|
G_SPAWN_SEARCH_PATH, NULL, NULL,
|
||||||
|
NULL, &child_in, &child_out, NULL, &error))
|
||||||
|
g_error ("g_spawn() failed: %s", error->message);
|
||||||
|
|
||||||
|
data->child_stdin = fdopen (child_in, "wb");
|
||||||
|
if (!data->child_stdin)
|
||||||
|
perror ("fdopen");
|
||||||
|
|
||||||
|
FILE *child_stdout = fdopen (child_out, "rb");
|
||||||
|
if (!child_stdout)
|
||||||
|
perror ("fdopen");
|
||||||
|
|
||||||
|
/* Spawn a writer thread */
|
||||||
|
g_mutex_lock (data->dict_mutex);
|
||||||
|
data->iterator = stardict_iterator_new (data->dict, data->start_entry);
|
||||||
|
g_mutex_unlock (data->dict_mutex);
|
||||||
|
|
||||||
|
GThread *writer = g_thread_new ("write worker",
|
||||||
|
(GThreadFunc) worker_writer, data);
|
||||||
|
|
||||||
|
/* Read the output */
|
||||||
|
g_mutex_lock (data->remaining_mutex);
|
||||||
|
guint32 remaining = data->remaining;
|
||||||
|
g_mutex_unlock (data->remaining_mutex);
|
||||||
|
|
||||||
|
data->output = NULL;
|
||||||
|
gpointer *output_end = &data->output;
|
||||||
|
while (remaining)
|
||||||
|
{
|
||||||
|
static gchar next[sizeof (gpointer)];
|
||||||
|
GString *s = g_string_new (NULL);
|
||||||
|
g_string_append_len (s, next, sizeof next);
|
||||||
|
|
||||||
|
gint c;
|
||||||
|
while ((c = fgetc (child_stdout)) != EOF && c != '\n')
|
||||||
|
g_string_append_c (s, c);
|
||||||
|
if (c == EOF)
|
||||||
|
g_error ("eSpeak process died too soon");
|
||||||
|
|
||||||
|
gchar *translation = g_string_free (s, FALSE);
|
||||||
|
*output_end = translation;
|
||||||
|
output_end = (gpointer *) translation;
|
||||||
|
|
||||||
|
/* We limit progress reporting so that
|
||||||
|
* the mutex doesn't spin like crazy */
|
||||||
|
if ((--remaining & 1023) != 0)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
g_mutex_lock (data->remaining_mutex);
|
||||||
|
data->remaining = remaining;
|
||||||
|
g_cond_broadcast (data->remaining_cond);
|
||||||
|
g_mutex_unlock (data->remaining_mutex);
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose (child_stdout);
|
||||||
|
return g_thread_join (writer);
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Main --------------------------------------------------------------------
|
||||||
|
|
||||||
|
int
|
||||||
|
main (int argc, char *argv[])
|
||||||
|
{
|
||||||
|
gint n_processes = 1;
|
||||||
|
|
||||||
|
GOptionEntry entries[] =
|
||||||
|
{
|
||||||
|
{ "processes", 'N', G_OPTION_FLAG_IN_MAIN,
|
||||||
|
G_OPTION_ARG_INT, &n_processes,
|
||||||
|
"the number of espeak processes run in parallel", "PROCESSES" },
|
||||||
|
{ NULL }
|
||||||
|
};
|
||||||
|
|
||||||
|
GError *error = NULL;
|
||||||
|
GOptionContext *ctx = g_option_context_new
|
||||||
|
("input.ifo output.ifo - add pronunciation to dictionaries");
|
||||||
|
g_option_context_add_main_entries (ctx, entries, NULL);
|
||||||
|
if (!g_option_context_parse (ctx, &argc, &argv, &error))
|
||||||
|
{
|
||||||
|
g_print ("option parsing failed: %s\n", error->message);
|
||||||
|
exit (EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc != 3)
|
||||||
|
{
|
||||||
|
gchar *help = g_option_context_get_help (ctx, TRUE, FALSE);
|
||||||
|
g_print ("%s", help);
|
||||||
|
g_free (help);
|
||||||
|
exit (EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
StardictDict *dict = stardict_dict_new (argv[1], &error);
|
||||||
|
if (!dict)
|
||||||
|
{
|
||||||
|
g_printerr ("opening the dictionary failed: %s\n", error->message);
|
||||||
|
exit (EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
gsize n_words = stardict_info_get_word_count
|
||||||
|
(stardict_dict_get_info (dict));
|
||||||
|
|
||||||
|
if (n_processes <= 0)
|
||||||
|
{
|
||||||
|
g_printerr ("Error: there must be at least one process\n");
|
||||||
|
exit (EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((gsize) n_processes > n_words * 1024)
|
||||||
|
{
|
||||||
|
n_processes = n_words / 1024;
|
||||||
|
if (!n_processes)
|
||||||
|
n_processes = 1;
|
||||||
|
g_printerr ("Warning: too many processes, reducing to %d\n",
|
||||||
|
n_processes);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Spawn worker threads to generate pronunciations */
|
||||||
|
static GMutex dict_mutex;
|
||||||
|
|
||||||
|
static GMutex remaining_mutex;
|
||||||
|
static GCond remaining_cond;
|
||||||
|
|
||||||
|
WorkerData *data = g_alloca (sizeof *data * n_processes);
|
||||||
|
|
||||||
|
gint i;
|
||||||
|
for (i = 0; i < n_processes; i++)
|
||||||
|
{
|
||||||
|
data[i].start_entry = (n_words - 1) * i / n_processes;
|
||||||
|
data[i].end_entry = (n_words - 1) * (i + 1) / n_processes;
|
||||||
|
|
||||||
|
data[i].remaining = data[i].end_entry - data[i].start_entry;
|
||||||
|
data[i].remaining_mutex = &remaining_mutex;
|
||||||
|
data[i].remaining_cond = &remaining_cond;
|
||||||
|
|
||||||
|
data[i].dict = dict;
|
||||||
|
data[i].dict_mutex = &dict_mutex;
|
||||||
|
|
||||||
|
data->main_thread = g_thread_new ("worker", (GThreadFunc) worker, data);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Loop while the threads still have some work to do and report status */
|
||||||
|
g_mutex_lock (&remaining_mutex);
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
gboolean all_finished = TRUE;
|
||||||
|
printf ("\rRetrieving pronunciation... ");
|
||||||
|
for (i = 0; i < n_processes; i++)
|
||||||
|
{
|
||||||
|
printf ("%3u%% ", data[i].remaining * 100
|
||||||
|
/ (data[i].end_entry - data[i].start_entry));
|
||||||
|
if (data[i].remaining)
|
||||||
|
all_finished = FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (all_finished)
|
||||||
|
break;
|
||||||
|
g_cond_wait (&remaining_cond, &remaining_mutex);
|
||||||
|
}
|
||||||
|
g_mutex_unlock (&remaining_mutex);
|
||||||
|
|
||||||
|
for (i = 0; i < n_processes; i++)
|
||||||
|
g_thread_join (data[i].main_thread);
|
||||||
|
|
||||||
|
// TODO after all processing is done, the program will go through the whole
|
||||||
|
// dictionary and put extended data entries into a new one.
|
||||||
|
StardictIterator *iterator = stardict_iterator_new (dict, 0);
|
||||||
|
while (stardict_iterator_is_valid (iterator))
|
||||||
|
{
|
||||||
|
// ...
|
||||||
|
stardict_iterator_next (iterator);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
Loading…
Reference in New Issue