You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
223 lines
6.4 KiB
223 lines
6.4 KiB
/* |
|
* A clean reimplementation of StarDict's tabfile |
|
* |
|
* Copyright (c) 2020 - 2021, Přemysl Eric Janouch <p@janouch.name> |
|
* |
|
* Permission to use, copy, modify, and/or distribute this software for any |
|
* purpose with or without fee is hereby granted. |
|
* |
|
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
|
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
|
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
|
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
|
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION |
|
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
|
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
|
* |
|
*/ |
|
|
|
#include <stdio.h> |
|
#include <stdlib.h> |
|
#include <string.h> |
|
#include <errno.h> |
|
#include <locale.h> |
|
|
|
#include <glib.h> |
|
#include <gio/gio.h> |
|
#include <pango/pango.h> |
|
|
|
#include <unicode/ucol.h> |
|
|
|
#include "config.h" |
|
#include "stardict.h" |
|
#include "stardict-private.h" |
|
#include "generator.h" |
|
#include "utils.h" |
|
|
|
|
|
static gboolean |
|
set_data_error (GError **error, const gchar *message) |
|
{ |
|
g_set_error_literal (error, G_IO_ERROR, G_IO_ERROR_INVALID_DATA, message); |
|
return FALSE; |
|
} |
|
|
|
static const gchar escapes[256] = { ['n'] = '\n', ['t'] = '\t', ['\\'] = '\\' }; |
|
|
|
static gboolean |
|
inplace_unescape (gchar *line, GError **error) |
|
{ |
|
gboolean escape = FALSE; |
|
gchar *dest = line; |
|
for (gchar *src = line; *src; src++) |
|
{ |
|
if (escape) |
|
{ |
|
escape = FALSE; |
|
if (!(*dest++ = escapes[(guchar) *src])) |
|
return set_data_error (error, "unsupported escape"); |
|
} |
|
else if (*src == '\\') |
|
escape = TRUE; |
|
else |
|
*dest++ = *src; |
|
} |
|
if (escape) |
|
return set_data_error (error, "trailing escape character"); |
|
|
|
*dest = 0; |
|
return TRUE; |
|
} |
|
|
|
static gboolean |
|
import_line (Generator *generator, gchar *line, gsize len, GError **error) |
|
{ |
|
if (!len) |
|
return TRUE; |
|
if (!g_utf8_validate_len (line, len, NULL)) |
|
return set_data_error (error, "not valid UTF-8"); |
|
|
|
gchar *separator = strchr (line, '\t'); |
|
if (!separator) |
|
return set_data_error (error, "keyword separator not found"); |
|
|
|
*separator++ = 0; |
|
if (strchr (line, '\\')) |
|
// The index wouldn't be sorted correctly with our method |
|
return set_data_error (error, "escapes not allowed in keywords"); |
|
|
|
gchar *newline = strpbrk (separator, "\r\n"); |
|
if (newline) |
|
*newline = 0; |
|
|
|
if (!inplace_unescape (line, error) |
|
|| !inplace_unescape (separator, error)) |
|
return FALSE; |
|
|
|
if (generator->info->same_type_sequence |
|
&& *generator->info->same_type_sequence == STARDICT_FIELD_PANGO |
|
&& !pango_parse_markup (separator, -1, 0, NULL, NULL, NULL, error)) |
|
return FALSE; |
|
|
|
generator_begin_entry (generator); |
|
return generator_write_string (generator, separator, TRUE, error) |
|
&& generator_finish_entry (generator, line, error); |
|
} |
|
|
|
static gboolean |
|
transform (FILE *fsorted, Generator *generator, GError **error) |
|
{ |
|
gchar *line = NULL; |
|
gsize size = 0, ln = 1; |
|
for (ssize_t read; (read = getline (&line, &size, fsorted)) >= 0; ln++) |
|
if (!import_line (generator, line, read, error)) |
|
break; |
|
|
|
free (line); |
|
if (ferror (fsorted)) |
|
{ |
|
g_set_error_literal (error, G_IO_ERROR, |
|
g_io_error_from_errno (errno), g_strerror (errno)); |
|
return FALSE; |
|
} |
|
if (!feof (fsorted)) |
|
{ |
|
// You'll only get good line number output with presorted input! |
|
g_prefix_error (error, "line %zu: ", ln); |
|
return FALSE; |
|
} |
|
return TRUE; |
|
} |
|
|
|
static void |
|
validate_collation_locale (const gchar *locale) |
|
{ |
|
UErrorCode error = U_ZERO_ERROR; |
|
UCollator *collator = ucol_open (locale, &error); |
|
if (!collator) |
|
fatal ("failed to create a collator for %s: %s\n", |
|
locale, u_errorName (error)); |
|
ucol_close (collator); |
|
} |
|
|
|
int |
|
main (int argc, char *argv[]) |
|
{ |
|
// The GLib help includes an ellipsis character, for some reason |
|
(void) setlocale (LC_ALL, ""); |
|
|
|
GError *error = NULL; |
|
GOptionContext *ctx = g_option_context_new ("output-basename < input"); |
|
g_option_context_set_summary (ctx, |
|
"Create a StarDict dictionary from plaintext."); |
|
|
|
gboolean pango_markup = FALSE; |
|
StardictInfo template = {}; |
|
GOptionEntry entries[] = |
|
{ |
|
{ "pango", 'p', 0, G_OPTION_ARG_NONE, &pango_markup, |
|
"Entries use Pango markup", NULL }, |
|
|
|
{ "book-name", 'b', 0, G_OPTION_ARG_STRING, &template.book_name, |
|
"Set the book name field", "TEXT" }, |
|
{ "author", 'a', 0, G_OPTION_ARG_STRING, &template.author, |
|
"Set the author field ", "NAME" }, |
|
{ "e-mail", 'e', 0, G_OPTION_ARG_STRING, &template.email, |
|
"Set the e-mail field", "ADDRESS" }, |
|
{ "website", 'w', 0, G_OPTION_ARG_STRING, &template.website, |
|
"Set the website field", "LINK" }, |
|
{ "description", 'd', 0, G_OPTION_ARG_STRING, &template.description, |
|
"Set the description field (newlines supported)", "TEXT" }, |
|
{ "date", 'D', 0, G_OPTION_ARG_STRING, &template.date, |
|
"Set the date field", "DATE" }, |
|
{ "collation", 'c', 0, G_OPTION_ARG_STRING, &template.collation, |
|
"Set the collation field (for ICU)", "LOCALE" }, |
|
{ } |
|
}; |
|
|
|
g_option_context_add_main_entries (ctx, entries, GETTEXT_PACKAGE); |
|
if (!g_option_context_parse (ctx, &argc, &argv, &error)) |
|
fatal ("Error: option parsing failed: %s\n", error->message); |
|
if (argc != 2) |
|
fatal ("%s", g_option_context_get_help (ctx, TRUE, NULL)); |
|
g_option_context_free (ctx); |
|
|
|
template.version = SD_VERSION_3_0_0; |
|
template.same_type_sequence = pango_markup |
|
? (char[]) { STARDICT_FIELD_PANGO, 0 } |
|
: (char[]) { STARDICT_FIELD_MEANING, 0 }; |
|
|
|
if (!template.book_name) |
|
template.book_name = argv[1]; |
|
if (template.description) |
|
{ |
|
gchar **lines = g_strsplit (template.description, "\n", -1); |
|
g_free (template.description); |
|
gchar *in_one_line = g_strjoinv ("<br>", lines); |
|
g_strfreev (lines); |
|
template.description = in_one_line; |
|
} |
|
if (template.collation) |
|
validate_collation_locale (template.collation); |
|
|
|
// This actually implements stardict_strcmp(), POSIX-compatibly. |
|
// Your sort(1) is not expected to be stable by default, like bsdsort is. |
|
FILE *fsorted = popen ("LC_ALL=C sort -t'\t' -k1f,1", "r"); |
|
if (!fsorted) |
|
fatal ("%s: %s\n", "popen", g_strerror (errno)); |
|
|
|
Generator *generator = generator_new (argv[1], &error); |
|
if (!generator) |
|
fatal ("Error: failed to create the output dictionary: %s\n", |
|
error->message); |
|
|
|
StardictInfo *info = generator->info; |
|
stardict_info_copy (info, &template); |
|
if (!transform (fsorted, generator, &error) |
|
|| !generator_finish (generator, &error)) |
|
fatal ("Error: failed to write the dictionary: %s\n", error->message); |
|
|
|
generator_free (generator); |
|
fclose (fsorted); |
|
return 0; |
|
}
|
|
|