Add a custom collation mechanism
A lot better than that StarDict shitfuckery.
This commit is contained in:
parent
d93b241a65
commit
2abbe7017f
|
@ -23,7 +23,7 @@ before_install:
|
||||||
- sudo apt-get update -qq
|
- sudo apt-get update -qq
|
||||||
install:
|
install:
|
||||||
- sudo apt-get install -y xsltproc docbook-xsl zlib1g-dev libncursesw5-dev
|
- sudo apt-get install -y xsltproc docbook-xsl zlib1g-dev libncursesw5-dev
|
||||||
- sudo apt-get install -y libgtk-3-dev libpango1.0-dev
|
- sudo apt-get install -y libgtk-3-dev libpango1.0-dev libicu-dev
|
||||||
before_script:
|
before_script:
|
||||||
- mkdir build
|
- mkdir build
|
||||||
- cd build
|
- cd build
|
||||||
|
|
|
@ -24,7 +24,8 @@ set (project_VERSION "${project_VERSION}.${project_VERSION_PATCH}")
|
||||||
# Dependencies
|
# Dependencies
|
||||||
find_package (ZLIB REQUIRED)
|
find_package (ZLIB REQUIRED)
|
||||||
find_package (PkgConfig REQUIRED)
|
find_package (PkgConfig REQUIRED)
|
||||||
pkg_check_modules (dependencies REQUIRED ncursesw glib-2.0 gio-2.0 pango)
|
pkg_check_modules (dependencies REQUIRED
|
||||||
|
ncursesw glib-2.0 gio-2.0 pango icu-uc icu-i18n)
|
||||||
|
|
||||||
if (USE_SYSTEM_TERMO)
|
if (USE_SYSTEM_TERMO)
|
||||||
find_package (Termo REQUIRED)
|
find_package (Termo REQUIRED)
|
||||||
|
|
10
README
10
README
|
@ -19,7 +19,7 @@ this regard.
|
||||||
Building and Running
|
Building and Running
|
||||||
--------------------
|
--------------------
|
||||||
Build dependencies: CMake, pkg-config, xsltproc, docbook-xsl,
|
Build dependencies: CMake, pkg-config, xsltproc, docbook-xsl,
|
||||||
ncursesw, zlib, termo (included),
|
ncursesw, zlib, ICU, termo (included),
|
||||||
glib-2.0, pango, gtk+ (optional, any version)
|
glib-2.0, pango, gtk+ (optional, any version)
|
||||||
|
|
||||||
$ git clone https://github.com/pjanouch/sdtui.git
|
$ git clone https://github.com/pjanouch/sdtui.git
|
||||||
|
@ -45,6 +45,14 @@ argument. If you want the application to watch the X11 primary selection for
|
||||||
changes and automatically search for the selected text, use the -w switch.
|
changes and automatically search for the selected text, use the -w switch.
|
||||||
This feature requires GTK+.
|
This feature requires GTK+.
|
||||||
|
|
||||||
|
Extensions
|
||||||
|
----------
|
||||||
|
As the original StarDict is a bit of a clusterfuck with regard to collation of
|
||||||
|
dictionary entries, I had to introduce an additional "collation" field into the
|
||||||
|
.ifo file. When sdtui discovers this field while reading the dictionary, it
|
||||||
|
automatically reorders the index according to that locale (e.g. "cs_CZ").
|
||||||
|
This operation may take a little while.
|
||||||
|
|
||||||
Dictionaries
|
Dictionaries
|
||||||
------------
|
------------
|
||||||
Unfortunately this application only really works with specific dictionaries.
|
Unfortunately this application only really works with specific dictionaries.
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
/*
|
/*
|
||||||
* stardict-private.h: internal StarDict API
|
* stardict-private.h: internal StarDict API
|
||||||
*
|
*
|
||||||
* Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
|
* Copyright (c) 2013 - 2015, Přemysl Janouch <p.janouch@gmail.com>
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
*
|
*
|
||||||
* Permission to use, copy, modify, and/or distribute this software for any
|
* Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
@ -47,6 +47,8 @@ struct stardict_info
|
||||||
gchar * description;
|
gchar * description;
|
||||||
gchar * date;
|
gchar * date;
|
||||||
gchar * same_type_sequence;
|
gchar * same_type_sequence;
|
||||||
|
|
||||||
|
gchar * collation;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct stardict_index_entry
|
struct stardict_index_entry
|
||||||
|
|
169
src/stardict.c
169
src/stardict.c
|
@ -1,7 +1,7 @@
|
||||||
/*
|
/*
|
||||||
* stardict.c: StarDict API
|
* stardict.c: StarDict API
|
||||||
*
|
*
|
||||||
* Copyright (c) 2013, Přemysl Janouch <p.janouch@gmail.com>
|
* Copyright (c) 2013 - 2015, Přemysl Janouch <p.janouch@gmail.com>
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
*
|
*
|
||||||
* Permission to use, copy, modify, and/or distribute this software for any
|
* Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
@ -27,6 +27,8 @@
|
||||||
#include <gio/gio.h>
|
#include <gio/gio.h>
|
||||||
#include <glib/gi18n.h>
|
#include <glib/gi18n.h>
|
||||||
|
|
||||||
|
#include <unicode/ucol.h>
|
||||||
|
|
||||||
#include "stardict.h"
|
#include "stardict.h"
|
||||||
#include "stardict-private.h"
|
#include "stardict-private.h"
|
||||||
#include "dictzip-input-stream.h"
|
#include "dictzip-input-stream.h"
|
||||||
|
@ -177,6 +179,8 @@ stardict_info_free (StardictInfo *sdi)
|
||||||
g_free (sdi->description);
|
g_free (sdi->description);
|
||||||
g_free (sdi->date);
|
g_free (sdi->date);
|
||||||
g_free (sdi->same_type_sequence);
|
g_free (sdi->same_type_sequence);
|
||||||
|
|
||||||
|
g_free (sdi->collation);
|
||||||
g_free (sdi);
|
g_free (sdi);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -194,7 +198,10 @@ const struct stardict_ifo_key _stardict_ifo_keys[] =
|
||||||
DEFINE_IFO_KEY ("website", STRING, website),
|
DEFINE_IFO_KEY ("website", STRING, website),
|
||||||
DEFINE_IFO_KEY ("description", STRING, description),
|
DEFINE_IFO_KEY ("description", STRING, description),
|
||||||
DEFINE_IFO_KEY ("date", STRING, date),
|
DEFINE_IFO_KEY ("date", STRING, date),
|
||||||
DEFINE_IFO_KEY ("sametypesequence", STRING, same_type_sequence)
|
DEFINE_IFO_KEY ("sametypesequence", STRING, same_type_sequence),
|
||||||
|
|
||||||
|
// These are our own custom extensions
|
||||||
|
DEFINE_IFO_KEY ("collation", STRING, collation)
|
||||||
};
|
};
|
||||||
|
|
||||||
gsize _stardict_ifo_keys_length = G_N_ELEMENTS (_stardict_ifo_keys);
|
gsize _stardict_ifo_keys_length = G_N_ELEMENTS (_stardict_ifo_keys);
|
||||||
|
@ -358,6 +365,12 @@ struct stardict_dict_private
|
||||||
GArray * index; //!< Word index
|
GArray * index; //!< Word index
|
||||||
GArray * synonyms; //!< Synonyms
|
GArray * synonyms; //!< Synonyms
|
||||||
|
|
||||||
|
/* The collated indexes are only permutations of their normal selves. */
|
||||||
|
|
||||||
|
UCollator * collator; //!< ICU index collator
|
||||||
|
GArray * collated_index; //!< Sorted indexes into @a index
|
||||||
|
GArray * collated_synonyms; //!< Sorted indexes into @a synonyms
|
||||||
|
|
||||||
/* There are currently three ways the dictionary data can be read:
|
/* There are currently three ways the dictionary data can be read:
|
||||||
* through mmap(), from a seekable GInputStream, or from a preallocated
|
* through mmap(), from a seekable GInputStream, or from a preallocated
|
||||||
* chunk of memory that the whole dictionary has been decompressed into.
|
* chunk of memory that the whole dictionary has been decompressed into.
|
||||||
|
@ -384,6 +397,13 @@ stardict_dict_finalize (GObject *self)
|
||||||
g_array_free (priv->index, TRUE);
|
g_array_free (priv->index, TRUE);
|
||||||
g_array_free (priv->synonyms, TRUE);
|
g_array_free (priv->synonyms, TRUE);
|
||||||
|
|
||||||
|
if (priv->collator)
|
||||||
|
ucol_close (priv->collator);
|
||||||
|
if (priv->collated_index)
|
||||||
|
g_array_free (priv->collated_index, TRUE);
|
||||||
|
if (priv->collated_synonyms)
|
||||||
|
g_array_free (priv->collated_synonyms, TRUE);
|
||||||
|
|
||||||
if (priv->mapped_dict)
|
if (priv->mapped_dict)
|
||||||
g_mapped_file_unref (priv->mapped_dict);
|
g_mapped_file_unref (priv->mapped_dict);
|
||||||
else if (priv->dict_stream)
|
else if (priv->dict_stream)
|
||||||
|
@ -641,6 +661,90 @@ cannot_open:
|
||||||
return TRUE;
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||||
|
|
||||||
|
/** Compare the two strings by collation rules. */
|
||||||
|
static inline gint
|
||||||
|
stardict_dict_strcoll (gconstpointer s1, gconstpointer s2, gpointer data)
|
||||||
|
{
|
||||||
|
StardictDict *sd = data;
|
||||||
|
UErrorCode error = U_ZERO_ERROR;
|
||||||
|
return ucol_strcollUTF8 (sd->priv->collator, s1, -1, s2, -1, &error);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Stricter stardict_dict_strcoll() used to sort the collated index. */
|
||||||
|
static inline gint
|
||||||
|
stardict_dict_strcoll_for_sorting
|
||||||
|
(gconstpointer s1, gconstpointer s2, gpointer data)
|
||||||
|
{
|
||||||
|
UCollationResult a = stardict_dict_strcoll (s1, s2, data);
|
||||||
|
return a ? a : strcmp (s1, s2);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline gint
|
||||||
|
stardict_dict_index_coll_for_sorting
|
||||||
|
(gconstpointer x1, gconstpointer x2, gpointer data)
|
||||||
|
{
|
||||||
|
StardictDict *sd = data;
|
||||||
|
const gchar *s1 = g_array_index
|
||||||
|
(sd->priv->index, StardictIndexEntry, *(guint32 *) x1).name;
|
||||||
|
const gchar *s2 = g_array_index
|
||||||
|
(sd->priv->index, StardictIndexEntry, *(guint32 *) x2).name;
|
||||||
|
return stardict_dict_strcoll_for_sorting (s1, s2, data);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline gint
|
||||||
|
stardict_dict_synonyms_coll_for_sorting
|
||||||
|
(gconstpointer x1, gconstpointer x2, gpointer data)
|
||||||
|
{
|
||||||
|
StardictDict *sd = data;
|
||||||
|
const gchar *s1 = g_array_index
|
||||||
|
(sd->priv->index, StardictSynonymEntry, *(guint32 *) x1).word;
|
||||||
|
const gchar *s2 = g_array_index
|
||||||
|
(sd->priv->index, StardictSynonymEntry, *(guint32 *) x2).word;
|
||||||
|
return stardict_dict_strcoll_for_sorting (s1, s2, data);
|
||||||
|
}
|
||||||
|
|
||||||
|
static gboolean
|
||||||
|
stardict_dict_set_collation (StardictDict *sd, const gchar *collation)
|
||||||
|
{
|
||||||
|
StardictDictPrivate *priv = sd->priv;
|
||||||
|
UErrorCode error = U_ZERO_ERROR;
|
||||||
|
if (!(priv->collator = ucol_open (collation, &error)))
|
||||||
|
{
|
||||||
|
// TODO: set a meaningful error
|
||||||
|
g_info ("failed to create a collator for `%s'", collation);
|
||||||
|
return FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: if error != U_ZERO_ERROR, report a meaningful message
|
||||||
|
|
||||||
|
ucol_setAttribute (priv->collator, UCOL_CASE_FIRST, UCOL_OFF, &error);
|
||||||
|
|
||||||
|
priv->collated_index = g_array_sized_new (FALSE, FALSE,
|
||||||
|
sizeof (guint32), priv->index->len);
|
||||||
|
for (guint32 i = 0; i < priv->index->len; i++)
|
||||||
|
g_array_append_val (priv->collated_index, i);
|
||||||
|
g_array_sort_with_data (sd->priv->collated_index,
|
||||||
|
stardict_dict_index_coll_for_sorting, sd);
|
||||||
|
|
||||||
|
priv->collated_synonyms = g_array_sized_new (FALSE, FALSE,
|
||||||
|
sizeof (guint32), priv->synonyms->len);
|
||||||
|
for (guint32 i = 0; i < priv->synonyms->len; i++)
|
||||||
|
g_array_append_val (priv->collated_synonyms, i);
|
||||||
|
g_array_sort_with_data (sd->priv->collated_synonyms,
|
||||||
|
stardict_dict_synonyms_coll_for_sorting, sd);
|
||||||
|
|
||||||
|
// Make the collator something like case-insensitive, see:
|
||||||
|
// http://userguide.icu-project.org/collation/concepts
|
||||||
|
// We shouldn't need to sort the data anymore, and if we did, we could just
|
||||||
|
// reset the strength to its default value for the given locale.
|
||||||
|
ucol_setStrength (priv->collator, UCOL_SECONDARY);
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||||
|
|
||||||
/** Load a StarDict dictionary.
|
/** Load a StarDict dictionary.
|
||||||
* @param[in] sdi Parsed .ifo data. The dictionary assumes ownership.
|
* @param[in] sdi Parsed .ifo data. The dictionary assumes ownership.
|
||||||
*/
|
*/
|
||||||
|
@ -709,9 +813,12 @@ stardict_dict_new_from_info (StardictInfo *sdi, GError **error)
|
||||||
|
|
||||||
gchar *base_syn = g_strconcat (base, ".syn", NULL);
|
gchar *base_syn = g_strconcat (base, ".syn", NULL);
|
||||||
if (g_file_test (base_syn, G_FILE_TEST_EXISTS | G_FILE_TEST_IS_REGULAR))
|
if (g_file_test (base_syn, G_FILE_TEST_EXISTS | G_FILE_TEST_IS_REGULAR))
|
||||||
load_syn (sd, base_syn, NULL);
|
(void) load_syn (sd, base_syn, NULL);
|
||||||
g_free (base_syn);
|
g_free (base_syn);
|
||||||
|
|
||||||
|
if (sdi->collation)
|
||||||
|
(void) stardict_dict_set_collation (sd, sdi->collation);
|
||||||
|
|
||||||
g_free (base);
|
g_free (base);
|
||||||
return sd;
|
return sd;
|
||||||
|
|
||||||
|
@ -722,6 +829,20 @@ error:
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static gint
|
||||||
|
stardict_dict_cmp_synonym (StardictDict *sd, const gchar *word, gint i)
|
||||||
|
{
|
||||||
|
GArray *collated = sd->priv->collated_synonyms;
|
||||||
|
GArray *synonyms = sd->priv->synonyms;
|
||||||
|
|
||||||
|
if (sd->priv->collator)
|
||||||
|
return stardict_dict_strcoll (word,
|
||||||
|
g_array_index (synonyms, StardictSynonymEntry,
|
||||||
|
g_array_index (collated, guint32, i)).word, sd);
|
||||||
|
return g_ascii_strcasecmp (word,
|
||||||
|
g_array_index (synonyms, StardictSynonymEntry, i).word);
|
||||||
|
}
|
||||||
|
|
||||||
/** Return words for which the argument is a synonym of or NULL
|
/** Return words for which the argument is a synonym of or NULL
|
||||||
* if there are no such words.
|
* if there are no such words.
|
||||||
*/
|
*/
|
||||||
|
@ -731,12 +852,12 @@ stardict_dict_get_synonyms (StardictDict *sd, const gchar *word)
|
||||||
GArray *synonyms = sd->priv->synonyms;
|
GArray *synonyms = sd->priv->synonyms;
|
||||||
GArray *index = sd->priv->index;
|
GArray *index = sd->priv->index;
|
||||||
|
|
||||||
BINARY_SEARCH_BEGIN (synonyms->len - 1, g_ascii_strcasecmp (word,
|
BINARY_SEARCH_BEGIN (synonyms->len - 1,
|
||||||
g_array_index (synonyms, StardictSynonymEntry, imid).word))
|
stardict_dict_cmp_synonym (sd, word, imid))
|
||||||
|
|
||||||
// Back off to the first matching entry
|
// Back off to the first matching entry
|
||||||
while (imid > 0 && !g_ascii_strcasecmp (word,
|
while (imid > 0 && !stardict_dict_cmp_synonym (sd, word, --imid))
|
||||||
g_array_index (synonyms, StardictSynonymEntry, --imid).word));
|
;
|
||||||
|
|
||||||
GPtrArray *array = g_ptr_array_new ();
|
GPtrArray *array = g_ptr_array_new ();
|
||||||
|
|
||||||
|
@ -751,10 +872,23 @@ stardict_dict_get_synonyms (StardictDict *sd, const gchar *word)
|
||||||
return (gchar **) g_ptr_array_free (array, FALSE);
|
return (gchar **) g_ptr_array_free (array, FALSE);
|
||||||
|
|
||||||
BINARY_SEARCH_END
|
BINARY_SEARCH_END
|
||||||
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static gint
|
||||||
|
stardict_dict_cmp_index (StardictDict *sd, const gchar *word, gint i)
|
||||||
|
{
|
||||||
|
GArray *collated = sd->priv->collated_index;
|
||||||
|
GArray *index = sd->priv->index;
|
||||||
|
|
||||||
|
if (sd->priv->collator)
|
||||||
|
return stardict_dict_strcoll (word,
|
||||||
|
g_array_index (index, StardictIndexEntry,
|
||||||
|
g_array_index (collated, guint32, i)).name, sd);
|
||||||
|
return g_ascii_strcasecmp (word,
|
||||||
|
g_array_index (index, StardictIndexEntry, i).name);
|
||||||
|
}
|
||||||
|
|
||||||
/** Search for a word. The search is ASCII-case-insensitive.
|
/** Search for a word. The search is ASCII-case-insensitive.
|
||||||
* @param[in] word The word in utf-8 encoding
|
* @param[in] word The word in utf-8 encoding
|
||||||
* @param[out] success TRUE if found
|
* @param[out] success TRUE if found
|
||||||
|
@ -765,12 +899,11 @@ stardict_dict_search (StardictDict *sd, const gchar *word, gboolean *success)
|
||||||
{
|
{
|
||||||
GArray *index = sd->priv->index;
|
GArray *index = sd->priv->index;
|
||||||
|
|
||||||
BINARY_SEARCH_BEGIN (index->len - 1, g_ascii_strcasecmp (word,
|
BINARY_SEARCH_BEGIN (index->len - 1,
|
||||||
g_array_index (index, StardictIndexEntry, imid).name))
|
stardict_dict_cmp_index (sd, word, imid))
|
||||||
|
|
||||||
// Back off to the first matching entry
|
// Back off to the first matching entry
|
||||||
while (imid > 0 && !g_ascii_strcasecmp (word,
|
while (imid > 0 && !stardict_dict_cmp_index (sd, word, imid - 1))
|
||||||
g_array_index (index, StardictIndexEntry, imid - 1).name))
|
|
||||||
imid--;
|
imid--;
|
||||||
|
|
||||||
if (success) *success = TRUE;
|
if (success) *success = TRUE;
|
||||||
|
@ -1051,6 +1184,13 @@ stardict_iterator_new (StardictDict *sd, guint32 offset)
|
||||||
return si;
|
return si;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static gint64
|
||||||
|
stardict_iterator_get_real_offset (StardictIterator *sdi)
|
||||||
|
{
|
||||||
|
return sdi->owner->priv->collator ? g_array_index
|
||||||
|
(sdi->owner->priv->collated_index, guint32, sdi->offset) : sdi->offset;
|
||||||
|
}
|
||||||
|
|
||||||
/** Return the word in the index that the iterator points at, or NULL. */
|
/** Return the word in the index that the iterator points at, or NULL. */
|
||||||
const gchar *
|
const gchar *
|
||||||
stardict_iterator_get_word (StardictIterator *sdi)
|
stardict_iterator_get_word (StardictIterator *sdi)
|
||||||
|
@ -1059,7 +1199,7 @@ stardict_iterator_get_word (StardictIterator *sdi)
|
||||||
if (!stardict_iterator_is_valid (sdi))
|
if (!stardict_iterator_is_valid (sdi))
|
||||||
return NULL;
|
return NULL;
|
||||||
return g_array_index (sdi->owner->priv->index,
|
return g_array_index (sdi->owner->priv->index,
|
||||||
StardictIndexEntry, sdi->offset).name;
|
StardictIndexEntry, stardict_iterator_get_real_offset (sdi)).name;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Return the dictionary entry that the iterator points at, or NULL. */
|
/** Return the dictionary entry that the iterator points at, or NULL. */
|
||||||
|
@ -1069,7 +1209,8 @@ stardict_iterator_get_entry (StardictIterator *sdi)
|
||||||
g_return_val_if_fail (STARDICT_IS_ITERATOR (sdi), NULL);
|
g_return_val_if_fail (STARDICT_IS_ITERATOR (sdi), NULL);
|
||||||
if (!stardict_iterator_is_valid (sdi))
|
if (!stardict_iterator_is_valid (sdi))
|
||||||
return FALSE;
|
return FALSE;
|
||||||
return stardict_dict_get_entry (sdi->owner, sdi->offset);
|
return stardict_dict_get_entry (sdi->owner,
|
||||||
|
stardict_iterator_get_real_offset (sdi));
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Return whether the iterator points to a valid index entry. */
|
/** Return whether the iterator points to a valid index entry. */
|
||||||
|
|
Loading…
Reference in New Issue