Add a custom collation mechanism
A lot better than that StarDict shitfuckery.
This commit is contained in:
parent
d93b241a65
commit
2abbe7017f
|
@ -23,7 +23,7 @@ before_install:
|
|||
- sudo apt-get update -qq
|
||||
install:
|
||||
- sudo apt-get install -y xsltproc docbook-xsl zlib1g-dev libncursesw5-dev
|
||||
- sudo apt-get install -y libgtk-3-dev libpango1.0-dev
|
||||
- sudo apt-get install -y libgtk-3-dev libpango1.0-dev libicu-dev
|
||||
before_script:
|
||||
- mkdir build
|
||||
- cd build
|
||||
|
|
|
@ -24,7 +24,8 @@ set (project_VERSION "${project_VERSION}.${project_VERSION_PATCH}")
|
|||
# Dependencies
|
||||
find_package (ZLIB REQUIRED)
|
||||
find_package (PkgConfig REQUIRED)
|
||||
pkg_check_modules (dependencies REQUIRED ncursesw glib-2.0 gio-2.0 pango)
|
||||
pkg_check_modules (dependencies REQUIRED
|
||||
ncursesw glib-2.0 gio-2.0 pango icu-uc icu-i18n)
|
||||
|
||||
if (USE_SYSTEM_TERMO)
|
||||
find_package (Termo REQUIRED)
|
||||
|
|
10
README
10
README
|
@ -19,7 +19,7 @@ this regard.
|
|||
Building and Running
|
||||
--------------------
|
||||
Build dependencies: CMake, pkg-config, xsltproc, docbook-xsl,
|
||||
ncursesw, zlib, termo (included),
|
||||
ncursesw, zlib, ICU, termo (included),
|
||||
glib-2.0, pango, gtk+ (optional, any version)
|
||||
|
||||
$ git clone https://github.com/pjanouch/sdtui.git
|
||||
|
@ -45,6 +45,14 @@ argument. If you want the application to watch the X11 primary selection for
|
|||
changes and automatically search for the selected text, use the -w switch.
|
||||
This feature requires GTK+.
|
||||
|
||||
Extensions
|
||||
----------
|
||||
As the original StarDict is a bit of a clusterfuck with regard to collation of
|
||||
dictionary entries, I had to introduce an additional "collation" field into the
|
||||
.ifo file. When sdtui discovers this field while reading the dictionary, it
|
||||
automatically reorders the index according to that locale (e.g. "cs_CZ").
|
||||
This operation may take a little while.
|
||||
|
||||
Dictionaries
|
||||
------------
|
||||
Unfortunately this application only really works with specific dictionaries.
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* stardict-private.h: internal StarDict API
|
||||
*
|
||||
* Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
|
||||
* Copyright (c) 2013 - 2015, Přemysl Janouch <p.janouch@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Permission to use, copy, modify, and/or distribute this software for any
|
||||
|
@ -47,6 +47,8 @@ struct stardict_info
|
|||
gchar * description;
|
||||
gchar * date;
|
||||
gchar * same_type_sequence;
|
||||
|
||||
gchar * collation;
|
||||
};
|
||||
|
||||
struct stardict_index_entry
|
||||
|
|
169
src/stardict.c
169
src/stardict.c
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* stardict.c: StarDict API
|
||||
*
|
||||
* Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
|
||||
* Copyright (c) 2013 - 2015, Přemysl Janouch <p.janouch@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Permission to use, copy, modify, and/or distribute this software for any
|
||||
|
@ -27,6 +27,8 @@
|
|||
#include <gio/gio.h>
|
||||
#include <glib/gi18n.h>
|
||||
|
||||
#include <unicode/ucol.h>
|
||||
|
||||
#include "stardict.h"
|
||||
#include "stardict-private.h"
|
||||
#include "dictzip-input-stream.h"
|
||||
|
@ -177,6 +179,8 @@ stardict_info_free (StardictInfo *sdi)
|
|||
g_free (sdi->description);
|
||||
g_free (sdi->date);
|
||||
g_free (sdi->same_type_sequence);
|
||||
|
||||
g_free (sdi->collation);
|
||||
g_free (sdi);
|
||||
}
|
||||
|
||||
|
@ -194,7 +198,10 @@ const struct stardict_ifo_key _stardict_ifo_keys[] =
|
|||
DEFINE_IFO_KEY ("website", STRING, website),
|
||||
DEFINE_IFO_KEY ("description", STRING, description),
|
||||
DEFINE_IFO_KEY ("date", STRING, date),
|
||||
DEFINE_IFO_KEY ("sametypesequence", STRING, same_type_sequence)
|
||||
DEFINE_IFO_KEY ("sametypesequence", STRING, same_type_sequence),
|
||||
|
||||
// These are our own custom
|
||||
DEFINE_IFO_KEY ("collation", STRING, collation)
|
||||
};
|
||||
|
||||
gsize _stardict_ifo_keys_length = G_N_ELEMENTS (_stardict_ifo_keys);
|
||||
|
@ -358,6 +365,12 @@ struct stardict_dict_private
|
|||
GArray * index; //!< Word index
|
||||
GArray * synonyms; //!< Synonyms
|
||||
|
||||
/* The collated indexes are only permutations of their normal selves. */
|
||||
|
||||
UCollator * collator; //!< ICU index collator
|
||||
GArray * collated_index; //!< Sorted indexes into @a index
|
||||
GArray * collated_synonyms; //!< Sorted indexes into @a synonyms
|
||||
|
||||
/* There are currently three ways the dictionary data can be read:
|
||||
* through mmap(), from a seekable GInputStream, or from a preallocated
|
||||
* chunk of memory that the whole dictionary has been decompressed into.
|
||||
|
@ -384,6 +397,13 @@ stardict_dict_finalize (GObject *self)
|
|||
g_array_free (priv->index, TRUE);
|
||||
g_array_free (priv->synonyms, TRUE);
|
||||
|
||||
if (priv->collator)
|
||||
ucol_close (priv->collator);
|
||||
if (priv->collated_index)
|
||||
g_array_free (priv->collated_index, TRUE);
|
||||
if (priv->collated_synonyms)
|
||||
g_array_free (priv->collated_synonyms, TRUE);
|
||||
|
||||
if (priv->mapped_dict)
|
||||
g_mapped_file_unref (priv->mapped_dict);
|
||||
else if (priv->dict_stream)
|
||||
|
@ -641,6 +661,90 @@ cannot_open:
|
|||
return TRUE;
|
||||
}
|
||||
|
||||
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
|
||||
/** Compare the two strings by collation rules. */
|
||||
static inline gint
|
||||
stardict_dict_strcoll (gconstpointer s1, gconstpointer s2, gpointer data)
|
||||
{
|
||||
StardictDict *sd = data;
|
||||
UErrorCode error = U_ZERO_ERROR;
|
||||
return ucol_strcollUTF8 (sd->priv->collator, s1, -1, s2, -1, &error);
|
||||
}
|
||||
|
||||
/** Stricter stardict_dict_strcoll() used to sort the collated index. */
|
||||
static inline gint
|
||||
stardict_dict_strcoll_for_sorting
|
||||
(gconstpointer s1, gconstpointer s2, gpointer data)
|
||||
{
|
||||
UCollationResult a = stardict_dict_strcoll (s1, s2, data);
|
||||
return a ? a : strcmp (s1, s2);
|
||||
}
|
||||
|
||||
static inline gint
|
||||
stardict_dict_index_coll_for_sorting
|
||||
(gconstpointer x1, gconstpointer x2, gpointer data)
|
||||
{
|
||||
StardictDict *sd = data;
|
||||
const gchar *s1 = g_array_index
|
||||
(sd->priv->index, StardictIndexEntry, *(guint32 *) x1).name;
|
||||
const gchar *s2 = g_array_index
|
||||
(sd->priv->index, StardictIndexEntry, *(guint32 *) x2).name;
|
||||
return stardict_dict_strcoll_for_sorting (s1, s2, data);
|
||||
}
|
||||
|
||||
static inline gint
|
||||
stardict_dict_synonyms_coll_for_sorting
|
||||
(gconstpointer x1, gconstpointer x2, gpointer data)
|
||||
{
|
||||
StardictDict *sd = data;
|
||||
const gchar *s1 = g_array_index
|
||||
(sd->priv->index, StardictSynonymEntry, *(guint32 *) x1).word;
|
||||
const gchar *s2 = g_array_index
|
||||
(sd->priv->index, StardictSynonymEntry, *(guint32 *) x2).word;
|
||||
return stardict_dict_strcoll_for_sorting (s1, s2, data);
|
||||
}
|
||||
|
||||
static gboolean
|
||||
stardict_dict_set_collation (StardictDict *sd, const gchar *collation)
|
||||
{
|
||||
StardictDictPrivate *priv = sd->priv;
|
||||
UErrorCode error = U_ZERO_ERROR;
|
||||
if (!(priv->collator = ucol_open (collation, &error)))
|
||||
{
|
||||
// TODO: set a meaningful error
|
||||
g_info ("failed to create a collator for `%s'", collation);
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
// TODO: if error != U_ZERO_ERROR, report a meaningful message
|
||||
|
||||
ucol_setAttribute (priv->collator, UCOL_CASE_FIRST, UCOL_OFF, &error);
|
||||
|
||||
priv->collated_index = g_array_sized_new (FALSE, FALSE,
|
||||
sizeof (guint32), priv->index->len);
|
||||
for (guint32 i = 0; i < priv->index->len; i++)
|
||||
g_array_append_val (priv->collated_index, i);
|
||||
g_array_sort_with_data (sd->priv->collated_index,
|
||||
stardict_dict_index_coll_for_sorting, sd);
|
||||
|
||||
priv->collated_synonyms = g_array_sized_new (FALSE, FALSE,
|
||||
sizeof (guint32), priv->synonyms->len);
|
||||
for (guint32 i = 0; i < priv->synonyms->len; i++)
|
||||
g_array_append_val (priv->collated_synonyms, i);
|
||||
g_array_sort_with_data (sd->priv->collated_synonyms,
|
||||
stardict_dict_synonyms_coll_for_sorting, sd);
|
||||
|
||||
// Make the collator something like case-insensitive, see:
|
||||
// http://userguide.icu-project.org/collation/concepts
|
||||
// We shouldn't need to sort the data anymore, and if we did, we could just
|
||||
// reset the strength to its default value for the given locale.
|
||||
ucol_setStrength (priv->collator, UCOL_SECONDARY);
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
|
||||
/** Load a StarDict dictionary.
|
||||
* @param[in] sdi Parsed .ifo data. The dictionary assumes ownership.
|
||||
*/
|
||||
|
@ -709,9 +813,12 @@ stardict_dict_new_from_info (StardictInfo *sdi, GError **error)
|
|||
|
||||
gchar *base_syn = g_strconcat (base, ".syn", NULL);
|
||||
if (g_file_test (base_syn, G_FILE_TEST_EXISTS | G_FILE_TEST_IS_REGULAR))
|
||||
load_syn (sd, base_syn, NULL);
|
||||
(void) load_syn (sd, base_syn, NULL);
|
||||
g_free (base_syn);
|
||||
|
||||
if (sdi->collation)
|
||||
(void) stardict_dict_set_collation (sd, sdi->collation);
|
||||
|
||||
g_free (base);
|
||||
return sd;
|
||||
|
||||
|
@ -722,6 +829,20 @@ error:
|
|||
return NULL;
|
||||
}
|
||||
|
||||
static gint
|
||||
stardict_dict_cmp_synonym (StardictDict *sd, const gchar *word, gint i)
|
||||
{
|
||||
GArray *collated = sd->priv->collated_synonyms;
|
||||
GArray *synonyms = sd->priv->synonyms;
|
||||
|
||||
if (sd->priv->collator)
|
||||
return stardict_dict_strcoll (word,
|
||||
g_array_index (synonyms, StardictSynonymEntry,
|
||||
g_array_index (collated, guint32, i)).word, sd);
|
||||
return g_ascii_strcasecmp (word,
|
||||
g_array_index (synonyms, StardictSynonymEntry, i).word);
|
||||
}
|
||||
|
||||
/** Return words for which the argument is a synonym of or NULL
|
||||
* if there are no such words.
|
||||
*/
|
||||
|
@ -731,12 +852,12 @@ stardict_dict_get_synonyms (StardictDict *sd, const gchar *word)
|
|||
GArray *synonyms = sd->priv->synonyms;
|
||||
GArray *index = sd->priv->index;
|
||||
|
||||
BINARY_SEARCH_BEGIN (synonyms->len - 1, g_ascii_strcasecmp (word,
|
||||
g_array_index (synonyms, StardictSynonymEntry, imid).word))
|
||||
BINARY_SEARCH_BEGIN (synonyms->len - 1,
|
||||
stardict_dict_cmp_synonym (sd, word, imid))
|
||||
|
||||
// Back off to the first matching entry
|
||||
while (imid > 0 && !g_ascii_strcasecmp (word,
|
||||
g_array_index (synonyms, StardictSynonymEntry, --imid).word));
|
||||
while (imid > 0 && !stardict_dict_cmp_synonym (sd, word, --imid))
|
||||
;
|
||||
|
||||
GPtrArray *array = g_ptr_array_new ();
|
||||
|
||||
|
@ -751,10 +872,23 @@ stardict_dict_get_synonyms (StardictDict *sd, const gchar *word)
|
|||
return (gchar **) g_ptr_array_free (array, FALSE);
|
||||
|
||||
BINARY_SEARCH_END
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static gint
|
||||
stardict_dict_cmp_index (StardictDict *sd, const gchar *word, gint i)
|
||||
{
|
||||
GArray *collated = sd->priv->collated_index;
|
||||
GArray *index = sd->priv->index;
|
||||
|
||||
if (sd->priv->collator)
|
||||
return stardict_dict_strcoll (word,
|
||||
g_array_index (index, StardictIndexEntry,
|
||||
g_array_index (collated, guint32, i)).name, sd);
|
||||
return g_ascii_strcasecmp (word,
|
||||
g_array_index (index, StardictIndexEntry, i).name);
|
||||
}
|
||||
|
||||
/** Search for a word. The search is ASCII-case-insensitive.
|
||||
* @param[in] word The word in utf-8 encoding
|
||||
* @param[out] success TRUE if found
|
||||
|
@ -765,12 +899,11 @@ stardict_dict_search (StardictDict *sd, const gchar *word, gboolean *success)
|
|||
{
|
||||
GArray *index = sd->priv->index;
|
||||
|
||||
BINARY_SEARCH_BEGIN (index->len - 1, g_ascii_strcasecmp (word,
|
||||
g_array_index (index, StardictIndexEntry, imid).name))
|
||||
BINARY_SEARCH_BEGIN (index->len - 1,
|
||||
stardict_dict_cmp_index (sd, word, imid))
|
||||
|
||||
// Back off to the first matching entry
|
||||
while (imid > 0 && !g_ascii_strcasecmp (word,
|
||||
g_array_index (index, StardictIndexEntry, imid - 1).name))
|
||||
while (imid > 0 && !stardict_dict_cmp_index (sd, word, imid - 1))
|
||||
imid--;
|
||||
|
||||
if (success) *success = TRUE;
|
||||
|
@ -1051,6 +1184,13 @@ stardict_iterator_new (StardictDict *sd, guint32 offset)
|
|||
return si;
|
||||
}
|
||||
|
||||
static gint64
|
||||
stardict_iterator_get_real_offset (StardictIterator *sdi)
|
||||
{
|
||||
return sdi->owner->priv->collator ? g_array_index
|
||||
(sdi->owner->priv->collated_index, guint32, sdi->offset) : sdi->offset;
|
||||
}
|
||||
|
||||
/** Return the word in the index that the iterator points at, or NULL. */
|
||||
const gchar *
|
||||
stardict_iterator_get_word (StardictIterator *sdi)
|
||||
|
@ -1059,7 +1199,7 @@ stardict_iterator_get_word (StardictIterator *sdi)
|
|||
if (!stardict_iterator_is_valid (sdi))
|
||||
return NULL;
|
||||
return g_array_index (sdi->owner->priv->index,
|
||||
StardictIndexEntry, sdi->offset).name;
|
||||
StardictIndexEntry, stardict_iterator_get_real_offset (sdi)).name;
|
||||
}
|
||||
|
||||
/** Return the dictionary entry that the iterator points at, or NULL. */
|
||||
|
@ -1069,7 +1209,8 @@ stardict_iterator_get_entry (StardictIterator *sdi)
|
|||
g_return_val_if_fail (STARDICT_IS_ITERATOR (sdi), NULL);
|
||||
if (!stardict_iterator_is_valid (sdi))
|
||||
return FALSE;
|
||||
return stardict_dict_get_entry (sdi->owner, sdi->offset);
|
||||
return stardict_dict_get_entry (sdi->owner,
|
||||
stardict_iterator_get_real_offset (sdi));
|
||||
}
|
||||
|
||||
/** Return whether the iterator points to a valid index entry. */
|
||||
|
|
Loading…
Reference in New Issue