Finish the add-pronunciation tool

2013-05-10 02:06:57 +02:00
parent bcf933fb9e
commit e783f31de9
2 changed files with 319 additions and 24 deletions
--- a/src/add-pronunciation.c
+++ b/src/add-pronunciation.c
@@ -2,6 +2,7 @@
 * A tool to add eSpeak-generated pronunciation to dictionaries
 *
 * Here I use the `espeak' process rather than libespeak because of the GPL.
 * It's far from ideal but good enough as a starting point.
 *
 * Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
 * All rights reserved.
@@ -39,6 +40,11 @@ typedef struct worker_data WorkerData;
 struct worker_data
 {
 	gchar **cmdline;                    //! eSpeak command line
 	guint ignore_acronyms : 1;          //! Don't spell out acronyms
 	GRegex *re_stop;                    //! Regex for stop sequences
 	GRegex *re_acronym;                 //! Regex for ACRONYMS
 	guint32 start_entry;                //! The first entry to be processed
 	guint32 end_entry;                  //! Past the last entry to be processed
@@ -53,41 +59,140 @@ struct worker_data
 	GMutex *remaining_mutex;            //! Locks the progress stats
 	GCond *remaining_cond;              //! Signals a change in progress
 	guint32 remaining;                  //! How many entries remain
 	guint32 total;                      //! Total number of entries
 	/* Writer */
 	StardictIterator *iterator;         //! Iterates over the dictionary
 	FILE *child_stdin;                  //! Standard input of eSpeak
 };
 /** eSpeak splits the output on certain characters. */
 #define LINE_SPLITTING_CHARS            ".,:;?!"
 /** We don't want to include brackets either. */
 #define OTHER_STOP_CHARS                "([{<"
 /** A void word used to make a unique "no pronunciation available" mark. */
 #define VOID_ENTRY                      "not present in any dictionary"
 /**
  * Replacement callback that spells out an acronym by adding dots between
  * its characters, so that it doesn't get pronounced as a word.
  *
  * Capture group 1 (whatever precedes the acronym) is copied verbatim,
  * capture group 2 (the acronym) is copied with a '.' between characters.
  *
  * @param info  the match being replaced
  * @param res   string to which the replacement text is appended
  * @param data  unused
  * @return FALSE, so that the replacement goes on with the next match
  */
 static gboolean
 writer_acronym_cb (const GMatchInfo *info, GString *res,
 	G_GNUC_UNUSED gpointer data)
 {
 	gchar *preceding = g_match_info_fetch (info, 1);
 	g_string_append (res, preceding);
 	g_free (preceding);
 	gchar *word = g_match_info_fetch (info, 2);
 	const gchar *p;
 	for (p = word; *p; p++)
 	{
 		/* The regex works on UTF-8 characters, so a capital letter may span
 		 * several bytes.  Only put a dot in front of a byte that starts
 		 * a new character; continuation bytes look like 10xxxxxx. */
 		if (p != word && ((unsigned char) *p & 0xC0) != 0x80)
 			g_string_append_c (res, '.');
 		g_string_append_c (res, *p);
 	}
 	g_free (word);
 	return FALSE;
 }
 /**
  * Writes to espeak's stdin.
  *
  * Walks the worker's iterator until it reaches data->end_entry and sends
  * one line per entry.  Each word is stripped of leading line-splitting
  * characters and whitespace, cut at the first stop sequence and, unless
  * data->ignore_acronyms is set, has its acronyms spelled out with dots.
  * Words that end up empty, or that fail to be processed, are replaced
  * with VOID_ENTRY so that the number of lines stays the same.
  *
  * The iterator is released and the stream closed before returning.
  *
  * @param data  the worker's shared state
  * @return the result of fclose() on the child's stdin, packed in a pointer
  */
 static gpointer
 worker_writer (WorkerData *data)
 {
 	GError *error;
 	GMatchInfo *match_info;
 	while (stardict_iterator_get_offset (data->iterator) != data->end_entry)
 	{
 		g_mutex_lock (data->dict_mutex);
 		const gchar *word = stardict_iterator_get_word (data->iterator);
 		g_mutex_unlock (data->dict_mutex);
 		word += strspn (word, LINE_SPLITTING_CHARS " \t");
 		gchar *x = g_strdup (word);
 		/* Cut the word if need be */
 		error = NULL;
 		if (g_regex_match_full (data->re_stop,
 			x, -1, 0, 0, &match_info, &error))
 		{
 			gint start_pos;
 			g_match_info_fetch_pos (match_info, 0, &start_pos, NULL);
 			x[start_pos] = 0;
 		}
 		g_match_info_free (match_info);
 		/* Change acronyms so that they're not pronounced as words */
 		if (!error && !data->ignore_acronyms)
 		{
 			gchar *tmp = g_regex_replace_eval (data->re_acronym,
 				x, -1, 0, 0, writer_acronym_cb, NULL, &error);
 			/* The result is NULL on failure; keep the old string in that
 			 * case so that the error path below has something to clear */
 			if (tmp)
 			{
 				g_free (x);
 				x = tmp;
 			}
 		}
 		if (error)
 		{
 			g_printerr ("Notice: error processing '%s': %s\n",
 				word, error->message);
 			g_clear_error (&error);
 			*x = 0;
 		}
 		/* We might have accidentally cut off everything */
 		if (!*x)
 		{
 			g_free (x);
 			x = g_strdup (VOID_ENTRY);
 		}
 		stardict_iterator_next (data->iterator);
-		if (fprintf (data->child_stdin, "%s\n", word) < 0)
+		if (fprintf (data->child_stdin, "%s\n", x) < 0)
 			g_error ("write to eSpeak failed: %s", strerror (errno));
 		g_free (x);
 	}
 	g_object_unref (data->iterator);
 	return GINT_TO_POINTER (fclose (data->child_stdin));
 }
 /**
  * Get the void entry (and test if espeak works).
  *
  * Runs the given command line synchronously with its standard error
  * discarded and returns whatever it printed to standard output.
  * The program is terminated if espeak can't be spawned or if it reports
  * a non-zero exit status.
  *
  * NOTE(review): the command line built in main() doesn't seem to contain
  * VOID_ENTRY itself -- confirm that espeak really receives the mark here.
  *
  * @param cmdline  NULL-terminated argument vector for espeak
  * @return the captured standard output; the caller is expected to free it
  */
 static gchar *
 get_void_entry (gchar *cmdline[])
 {
 	gchar *output;
 	gint exit_status;
 	/* GLib requires the error location to be NULL before it's passed in */
 	GError *error = NULL;
 	if (!g_spawn_sync (NULL, cmdline, NULL,
 		G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL, NULL, NULL,
 		&output, NULL, &exit_status, &error))
 	{
 		g_printerr ("Error: couldn't spawn espeak: %s\n", error->message);
 		exit (EXIT_FAILURE);
 	}
 	if (exit_status)
 	{
 		g_printerr ("Error: espeak returned %d\n", exit_status);
 		exit (EXIT_FAILURE);
 	}
 	return output;
 }
 /** Reads from espeak's stdout. */
 static gpointer
 worker (WorkerData *data)
 {
 	/* Spawn eSpeak */
 	static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL };
 	gint child_in, child_out;
 	GError *error;
-	if (!g_spawn_async_with_pipes (NULL, cmdline, NULL,
+	gint child_in, child_out;
 	if (!g_spawn_async_with_pipes (NULL, data->cmdline, NULL,
 		G_SPAWN_SEARCH_PATH, NULL, NULL,
 		NULL, &child_in, &child_out, NULL, &error))
 		g_error ("g_spawn() failed: %s", error->message);
@@ -133,7 +238,7 @@ worker (WorkerData *data)
 		/* We limit progress reporting so that
 		 * the mutex doesn't spin like crazy */
-		if ((--remaining & 1023) != 0)
+		if ((--remaining & 255) != 0)
 			continue;
 		g_mutex_lock (data->remaining_mutex);
@@ -142,47 +247,133 @@ worker (WorkerData *data)
 		g_mutex_unlock (data->remaining_mutex);
 	}
 	if (fgetc (child_stdout) != EOF)
 	{
 		g_printerr ("Error: eSpeak has written more lines than it should. "
 			"The output would be corrupt, aborting.\n");
 		exit (EXIT_FAILURE);
 	}
 	fclose (child_stdout);
 	return g_thread_join (writer);
 }
 // --- Main --------------------------------------------------------------------
 /**
  * Duplicate the contents of a StardictInfo object into another one.
  * The version is copied first, then every member listed in the table of
  * .ifo keys: strings in the destination are freed and replaced with
  * copies, anything else is assigned as a gulong.  Ignores path.
  *
  * @param dest  the object to be overwritten
  * @param src   the object to copy from
  */
 static void
 stardict_info_copy (StardictInfo *dest, const StardictInfo *src)
 {
 	guint idx = 0;
 	dest->version = src->version;
 	while (idx < _stardict_ifo_keys_length)
 	{
 		const struct stardict_ifo_key *key = &_stardict_ifo_keys[idx++];
 		if (key->type != IFO_STRING)
 		{
 			G_STRUCT_MEMBER (gulong, dest, key->offset) =
 				G_STRUCT_MEMBER (gulong, src, key->offset);
 			continue;
 		}
 		gchar **target = &G_STRUCT_MEMBER (gchar *, dest, key->offset);
 		gchar  *source =  G_STRUCT_MEMBER (gchar *, src,  key->offset);
 		g_free (*target);
 		/* g_strdup() hands NULL back for a NULL argument */
 		*target = g_strdup (source);
 	}
 }
 /**
  * Write a list of data fields back to a dictionary.
  *
  * Without a same type sequence, each field is preceded by its type.
  * Fields of a lowercase type are written as strings, the others as raw
  * data of field->data_size bytes.  Only the last field of a same type
  * sequence entry is written with mark_end unset.
  *
  * @param generator  where the fields are written
  * @param fields     list of StardictEntryField pointers, walked forwards
  * @param sts        whether the dictionary uses a same type sequence
  * @param error      location for an error reported by the generator
  * @return TRUE if everything has been written, FALSE on the first failure
  */
 static gboolean
 write_fields (Generator *generator, GList *fields, gboolean sts, GError **error)
 {
 	GList *link;
 	for (link = fields; link != NULL; link = link->next)
 	{
 		StardictEntryField *field = link->data;
 		const gboolean is_last = link->next == NULL;
 		const gboolean mark_end = !(sts && is_last);
 		if (!sts && !generator_write_type (generator, field->type, error))
 			return FALSE;
 		gboolean written = g_ascii_islower (field->type)
 			? generator_write_string (generator,
 				field->data, mark_end, error)
 			: generator_write_raw (generator,
 				field->data, field->data_size, mark_end, error);
 		if (!written)
 			return FALSE;
 	}
 	return TRUE;
 }
 int
 main (int argc, char *argv[])
 {
 	gint n_processes = 1;
 	gchar *voice = NULL;
 	gboolean ignore_acronyms = FALSE;
 	GOptionEntry entries[] =
 	{
 		{ "processes", 'N', G_OPTION_FLAG_IN_MAIN,
 		  G_OPTION_ARG_INT, &n_processes,
-		  "the number of espeak processes run in parallel", "PROCESSES" },
+		  "The number of espeak processes run in parallel", "PROCESSES" },
 		{ "voice", 'v', G_OPTION_FLAG_IN_MAIN,
 		  G_OPTION_ARG_STRING, &voice,
 		  "The voice to be used by eSpeak to pronounce the words", "VOICE" },
 		{ "ignore-acronyms", 0, G_OPTION_FLAG_IN_MAIN,
 		  G_OPTION_ARG_NONE, &ignore_acronyms,
 		  "Don't spell out words composed of big letters only", NULL },
 		{ NULL }
 	};
 G_GNUC_BEGIN_IGNORE_DEPRECATIONS
 	if (glib_check_version (2, 36, 0))
 		g_type_init ();
 G_GNUC_END_IGNORE_DEPRECATIONS
 	GError *error = NULL;
 	GOptionContext *ctx = g_option_context_new
-		("input.ifo output.ifo - add pronunciation to dictionaries");
+		("input.ifo output-basename - add pronunciation to dictionaries");
 	g_option_context_add_main_entries (ctx, entries, NULL);
 	if (!g_option_context_parse (ctx, &argc, &argv, &error))
 	{
-		g_print ("option parsing failed: %s\n", error->message);
+		g_printerr ("Error: option parsing failed: %s\n", error->message);
 		exit (EXIT_FAILURE);
 	}
 	if (argc != 3)
 	{
 		gchar *help = g_option_context_get_help (ctx, TRUE, FALSE);
-		g_print ("%s", help);
+		g_printerr ("%s", help);
 		g_free (help);
 		exit (EXIT_FAILURE);
 	}
 	g_option_context_free (ctx);
 	/* See if we can run espeak */
 	static gchar *cmdline[] = { "espeak", "--ipa", "-q", NULL, NULL, NULL };
 	if (voice)
 	{
 		cmdline[3] = "-v";
 		cmdline[4] = voice;
 	}
 	gchar *void_entry = g_strstrip (get_void_entry (cmdline));
 	/* Load the dictionary */
 	printf ("Loading the original dictionary...\n");
 	StardictDict *dict = stardict_dict_new (argv[1], &error);
 	if (!dict)
 	{
-		g_printerr ("opening the dictionary failed: %s\n", error->message);
+		g_printerr ("Error: opening the dictionary failed: %s\n",
 			error->message);
 		exit (EXIT_FAILURE);
 	}
@@ -204,7 +395,7 @@ main (int argc, char *argv[])
 			n_processes);
 	}
-	/* Spawn worker threads to generate pronunciations */
+	/* Spawn worker threads to generate pronunciation data */
 	static GMutex dict_mutex;
 	static GMutex remaining_mutex;
@@ -212,20 +403,35 @@ main (int argc, char *argv[])
 	WorkerData *data = g_alloca (sizeof *data * n_processes);
 	GRegex *re_stop = g_regex_new ("[" LINE_SPLITTING_CHARS "][ ?]"
 		"|\\.\\.\\.|[" OTHER_STOP_CHARS "]", G_REGEX_OPTIMIZE, 0, &error);
 	g_assert (re_stop != NULL);
 	GRegex *re_acronym = g_regex_new ("(^|\\pZ)(\\p{Lu}+)(?=\\pZ|$)",
 		G_REGEX_OPTIMIZE, 0, &error);
 	g_assert (re_acronym != NULL);
 	gint i;
 	for (i = 0; i < n_processes; i++)
 	{
-		data[i].start_entry = (n_words - 1) *  i      / n_processes;
+		data[i].start_entry = n_words *  i      / n_processes;
-		data[i].end_entry   = (n_words - 1) * (i + 1) / n_processes;
+		data[i].end_entry   = n_words * (i + 1) / n_processes;
-		data[i].remaining = data[i].end_entry - data[i].start_entry;
+		data[i].total = data[i].remaining =
 			data[i].end_entry - data[i].start_entry;
 		data[i].remaining_mutex = &remaining_mutex;
 		data[i].remaining_cond = &remaining_cond;
 		data[i].dict = dict;
 		data[i].dict_mutex = &dict_mutex;
-		data->main_thread = g_thread_new ("worker", (GThreadFunc) worker, data);
+		data[i].re_stop = re_stop;
 		data[i].re_acronym = re_acronym;
 		data[i].cmdline = cmdline;
 		data[i].ignore_acronyms = ignore_acronyms;
 		data[i].main_thread =
 			g_thread_new ("worker", (GThreadFunc) worker, &data[i]);
 	}
 	/* Loop while the threads still have some work to do and report status */
@@ -236,8 +442,7 @@ main (int argc, char *argv[])
 		printf ("\rRetrieving pronunciation... ");
 		for (i = 0; i < n_processes; i++)
 		{
-			printf ("%3u%% ", data[i].remaining * 100
+			printf ("%3u%% ", 100 - data[i].remaining * 100 / data[i].total);
 				/ (data[i].end_entry - data[i].start_entry));
 			if (data[i].remaining)
 				all_finished = FALSE;
 		}
@@ -248,17 +453,103 @@ main (int argc, char *argv[])
 	}
 	g_mutex_unlock (&remaining_mutex);
 	putchar ('\n');
 	for (i = 0; i < n_processes; i++)
 		g_thread_join (data[i].main_thread);
-	// TODO after all processing is done, the program will go through the whole
+	g_regex_unref (re_stop);
-	//      dictionary and put extended data entries into a new one.
+	g_regex_unref (re_acronym);
-	StardictIterator *iterator = stardict_iterator_new (dict, 0);
+
-	while (stardict_iterator_is_valid (iterator))
+	/* Put extended entries into a new dictionary */
 	Generator *generator = generator_new (argv[2], &error);
 	if (!generator)
 	{
-		// ...
+		g_printerr ("Error: failed to create the output dictionary: %s\n",
 			error->message);
 		exit (EXIT_FAILURE);
 	}
 	StardictInfo *info = generator->info;
 	stardict_info_copy (info, stardict_dict_get_info (dict));
 	/* This gets incremented each time an entry is finished */
 	info->word_count = 0;
 	if (info->same_type_sequence)
 	{
 		gchar *new_sts = g_strconcat ("t", info->same_type_sequence, NULL);
 		g_free (info->same_type_sequence);
 		info->same_type_sequence = new_sts;
 	}
 	/* Write out all the entries together with the pronunciation */
 	for (i = 0; i < n_processes; i++)
 	{
 		StardictIterator *iterator =
 			stardict_iterator_new (dict, data[i].start_entry);
 		gpointer *output = data[i].output;
 		while (stardict_iterator_get_offset (iterator) != data[i].end_entry)
 		{
 			printf ("\rCreating a new dictionary... %3lu%%",
 				(gulong) stardict_iterator_get_offset (iterator) * 100
 				/ stardict_dict_get_info (dict)->word_count);
 			g_assert (output != NULL);
 			gchar *pronunciation = g_strstrip ((gchar *) (output + 1));
 			StardictEntry *entry = stardict_iterator_get_entry (iterator);
 			generator_begin_entry (generator);
 			if (!strcmp (pronunciation, void_entry))
 				*pronunciation = 0;
 //			g_printerr ("%s /%s/\n",
 //				stardict_iterator_get_word (iterator), pronunciation);
 			/* For the sake of simplicity we fake a new start;
 			 * write_fields() only iterates the list in one direction. */
 			StardictEntryField field;
 			field.type = 't';
 			field.data = pronunciation;
 			GList start_link;
 			start_link.next = entry->fields;
 			start_link.data = &field;
 			if (!write_fields (generator, &start_link,
 					info->same_type_sequence != NULL, &error)
 			 || !generator_finish_entry (generator,
 					stardict_iterator_get_word (iterator), &error))
 			{
 				g_printerr ("Error: write failed: %s\n", error->message);
 				exit (EXIT_FAILURE);
 			}
 			g_object_unref (entry);
 			gpointer *tmp = output;
 			output = *output;
 			g_free (tmp);
 			stardict_iterator_next (iterator);
 		}
 		g_assert (output == NULL);
 		g_object_unref (iterator);
 	}
 	putchar ('\n');
 	if (!generator_finish (generator, &error))
 	{
 		g_printerr ("Error: failed to write the dictionary: %s\n",
 			error->message);
 		exit (EXIT_FAILURE);
 	}
 	generator_free (generator);
 	g_object_unref (dict);
 	g_free (void_entry);
 	return 0;
 }
--- a/src/generator.c
+++ b/src/generator.c
@@ -114,6 +114,10 @@ generator_finish (Generator *self, GError **error)
 		}
 		else
 		{
 			if (self->info->version == SD_VERSION_2_4_2
 			 && !strcmp (key->name, "idxoffsetbits"))
 				continue;
 			gulong value = G_STRUCT_MEMBER (gulong,
 				self->info, key->offset);
 			if (value)