Simplify the collation mess
Brings a bit worse performance on load since more memory is being reordered but it seems to be worth it.
This commit is contained in:
parent
36236955e5
commit
26155227c7
|
@ -56,6 +56,7 @@ struct stardict_index_entry
|
||||||
gchar * name; ///< The word in utf-8
|
gchar * name; ///< The word in utf-8
|
||||||
guint64 data_offset; ///< Offset of the definition
|
guint64 data_offset; ///< Offset of the definition
|
||||||
guint32 data_size; ///< Size of the definition
|
guint32 data_size; ///< Size of the definition
|
||||||
|
guint32 reverse_index; ///< Word at this index before sorting
|
||||||
};
|
};
|
||||||
|
|
||||||
struct stardict_synonym_entry
|
struct stardict_synonym_entry
|
||||||
|
|
|
@ -379,7 +379,6 @@ struct stardict_dict_private
|
||||||
// The collated indexes are only permutations of their normal selves.
|
// The collated indexes are only permutations of their normal selves.
|
||||||
|
|
||||||
UCollator * collator; //!< ICU index collator
|
UCollator * collator; //!< ICU index collator
|
||||||
GArray * collated_index; //!< Sorted indexes into @a index
|
|
||||||
GArray * collated_synonyms; //!< Sorted indexes into @a synonyms
|
GArray * collated_synonyms; //!< Sorted indexes into @a synonyms
|
||||||
|
|
||||||
// There are currently three ways the dictionary data can be read:
|
// There are currently three ways the dictionary data can be read:
|
||||||
|
@ -410,8 +409,6 @@ stardict_dict_finalize (GObject *self)
|
||||||
|
|
||||||
if (priv->collator)
|
if (priv->collator)
|
||||||
ucol_close (priv->collator);
|
ucol_close (priv->collator);
|
||||||
if (priv->collated_index)
|
|
||||||
g_array_free (priv->collated_index, TRUE);
|
|
||||||
if (priv->collated_synonyms)
|
if (priv->collated_synonyms)
|
||||||
g_array_free (priv->collated_synonyms, TRUE);
|
g_array_free (priv->collated_synonyms, TRUE);
|
||||||
|
|
||||||
|
@ -494,6 +491,7 @@ load_idx_internal (StardictDict *sd, GInputStream *is, GError **error)
|
||||||
goto error;
|
goto error;
|
||||||
|
|
||||||
entry.name = g_string_chunk_insert (sd->priv->string_allocator, name);
|
entry.name = g_string_chunk_insert (sd->priv->string_allocator, name);
|
||||||
|
entry.reverse_index = priv->index->len;
|
||||||
g_array_append_val (priv->index, entry);
|
g_array_append_val (priv->index, entry);
|
||||||
g_free (name);
|
g_free (name);
|
||||||
}
|
}
|
||||||
|
@ -707,12 +705,8 @@ static inline gint
|
||||||
stardict_dict_index_coll_for_sorting
|
stardict_dict_index_coll_for_sorting
|
||||||
(gconstpointer x1, gconstpointer x2, gpointer data)
|
(gconstpointer x1, gconstpointer x2, gpointer data)
|
||||||
{
|
{
|
||||||
StardictDict *sd = data;
|
const StardictIndexEntry *e1 = x1, *e2 = x2;
|
||||||
const gchar *s1 = g_array_index
|
return stardict_dict_strcoll_for_sorting (e1->name, e2->name, data);
|
||||||
(sd->priv->index, StardictIndexEntry, *(guint32 *) x1).name;
|
|
||||||
const gchar *s2 = g_array_index
|
|
||||||
(sd->priv->index, StardictIndexEntry, *(guint32 *) x2).name;
|
|
||||||
return stardict_dict_strcoll_for_sorting (s1, s2, data);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline gint
|
static inline gint
|
||||||
|
@ -741,15 +735,21 @@ stardict_dict_set_collation (StardictDict *sd, const gchar *collation)
|
||||||
|
|
||||||
// TODO: if error != U_ZERO_ERROR, report a meaningful message
|
// TODO: if error != U_ZERO_ERROR, report a meaningful message
|
||||||
|
|
||||||
|
// Reorder the index according to the ICU locale
|
||||||
ucol_setAttribute (priv->collator, UCOL_CASE_FIRST, UCOL_OFF, &error);
|
ucol_setAttribute (priv->collator, UCOL_CASE_FIRST, UCOL_OFF, &error);
|
||||||
|
g_array_sort_with_data (sd->priv->index,
|
||||||
priv->collated_index = g_array_sized_new (FALSE, FALSE,
|
|
||||||
sizeof (guint32), priv->index->len);
|
|
||||||
for (guint32 i = 0; i < priv->index->len; i++)
|
|
||||||
g_array_append_val (priv->collated_index, i);
|
|
||||||
g_array_sort_with_data (sd->priv->collated_index,
|
|
||||||
stardict_dict_index_coll_for_sorting, sd);
|
stardict_dict_index_coll_for_sorting, sd);
|
||||||
|
|
||||||
|
// Construct a reverse index from the original index as it's used less
|
||||||
|
guint32 *reverse = g_malloc_n (priv->index->len, sizeof *reverse);
|
||||||
|
for (guint32 i = 0; i < priv->index->len; i++)
|
||||||
|
reverse[g_array_index (priv->index,
|
||||||
|
StardictIndexEntry, i).reverse_index] = i;
|
||||||
|
for (guint32 i = 0; i < priv->index->len; i++)
|
||||||
|
g_array_index (priv->index,
|
||||||
|
StardictIndexEntry, i).reverse_index = reverse[i];
|
||||||
|
g_free (reverse);
|
||||||
|
|
||||||
priv->collated_synonyms = g_array_sized_new (FALSE, FALSE,
|
priv->collated_synonyms = g_array_sized_new (FALSE, FALSE,
|
||||||
sizeof (guint32), priv->synonyms->len);
|
sizeof (guint32), priv->synonyms->len);
|
||||||
for (guint32 i = 0; i < priv->synonyms->len; i++)
|
for (guint32 i = 0; i < priv->synonyms->len; i++)
|
||||||
|
@ -882,9 +882,16 @@ stardict_dict_get_synonyms (StardictDict *sd, const gchar *word)
|
||||||
|
|
||||||
// And add all matching entries from that position on to the array
|
// And add all matching entries from that position on to the array
|
||||||
do
|
do
|
||||||
|
{
|
||||||
|
guint32 i = g_array_index
|
||||||
|
(synonyms, StardictSynonymEntry, ++imid).original_word;
|
||||||
|
// When we use a collator this will point to the original entry,
|
||||||
|
// otherwise it points to itself and this changes nothing
|
||||||
|
i = g_array_index
|
||||||
|
(sd->priv->index, StardictIndexEntry, i).reverse_index;
|
||||||
g_ptr_array_add (array, g_strdup (g_array_index
|
g_ptr_array_add (array, g_strdup (g_array_index
|
||||||
(index, StardictIndexEntry, g_array_index
|
(index, StardictIndexEntry, i).name));
|
||||||
(synonyms, StardictSynonymEntry, ++imid).original_word).name));
|
}
|
||||||
while ((guint) imid < synonyms->len - 1 && !stardict_strcmp (word,
|
while ((guint) imid < synonyms->len - 1 && !stardict_strcmp (word,
|
||||||
g_array_index (synonyms, StardictSynonymEntry, imid + 1).word));
|
g_array_index (synonyms, StardictSynonymEntry, imid + 1).word));
|
||||||
|
|
||||||
|
@ -897,15 +904,11 @@ stardict_dict_get_synonyms (StardictDict *sd, const gchar *word)
|
||||||
static gint
|
static gint
|
||||||
stardict_dict_cmp_index (StardictDict *sd, const gchar *word, gint i)
|
stardict_dict_cmp_index (StardictDict *sd, const gchar *word, gint i)
|
||||||
{
|
{
|
||||||
GArray *collated = sd->priv->collated_index;
|
const gchar *target =
|
||||||
GArray *index = sd->priv->index;
|
g_array_index (sd->priv->index, StardictIndexEntry, i).name;
|
||||||
|
|
||||||
if (sd->priv->collator)
|
if (sd->priv->collator)
|
||||||
return stardict_dict_strcoll (word,
|
return stardict_dict_strcoll (word, target, sd);
|
||||||
g_array_index (index, StardictIndexEntry,
|
return g_ascii_strcasecmp (word, target);
|
||||||
g_array_index (collated, guint32, i)).name, sd);
|
|
||||||
return g_ascii_strcasecmp (word,
|
|
||||||
g_array_index (index, StardictIndexEntry, i).name);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Search for a word. The search is ASCII-case-insensitive.
|
/// Search for a word. The search is ASCII-case-insensitive.
|
||||||
|
@ -930,33 +933,25 @@ stardict_dict_search (StardictDict *sd, const gchar *word, gboolean *success)
|
||||||
BINARY_SEARCH_END
|
BINARY_SEARCH_END
|
||||||
|
|
||||||
// Try to find a longer common prefix with a preceding entry
|
// Try to find a longer common prefix with a preceding entry
|
||||||
|
// FIXME: this doesn't work well with diacritics, which are ignored when
|
||||||
|
// reordering but can take us to an entry with a suboptimal prefix
|
||||||
|
// while searching
|
||||||
#define PREFIX(i) stardict_longest_common_collation_prefix \
|
#define PREFIX(i) stardict_longest_common_collation_prefix \
|
||||||
(sd, word, g_array_index (index, StardictIndexEntry, i).name)
|
(sd, word, g_array_index (index, StardictIndexEntry, i).name)
|
||||||
|
|
||||||
// We need to take care not to step through the entire dictionary
|
// We need to take care not to step through the entire dictionary
|
||||||
// if not a single character matches, because it can be quite costly
|
// if not a single character matches, because it can be quite costly
|
||||||
if (sd->priv->collator)
|
size_t probe, best = PREFIX (imin);
|
||||||
{
|
|
||||||
GArray *collated = sd->priv->collated_index;
|
|
||||||
size_t probe, best = PREFIX (g_array_index (collated, guint32, imin));
|
|
||||||
while (best && imin > 0 && (probe =
|
|
||||||
PREFIX (g_array_index (collated, guint32, imin - 1))) >= best)
|
|
||||||
{
|
|
||||||
best = probe;
|
|
||||||
imin--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// XXX: only looking for _better_ backward matches here, since the
|
// XXX: only looking for _better_ backward matches here, since the
|
||||||
// fallback common prefix searching algorithm doesn't ignore case
|
// fallback common prefix searching algorithm doesn't ignore case
|
||||||
size_t probe, best = PREFIX (imin);
|
size_t needed_improvement = !sd->priv->collator;
|
||||||
while (best && imin > 0 && (probe = PREFIX (imin - 1)) > best)
|
while (best && imin > 0
|
||||||
|
&& (probe = PREFIX (imin - 1)) >= best + needed_improvement)
|
||||||
{
|
{
|
||||||
best = probe;
|
best = probe;
|
||||||
imin--;
|
imin--;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
#undef PREFIX
|
#undef PREFIX
|
||||||
|
|
||||||
|
@ -1305,13 +1300,6 @@ stardict_iterator_new (StardictDict *sd, guint32 offset)
|
||||||
return si;
|
return si;
|
||||||
}
|
}
|
||||||
|
|
||||||
static gint64
|
|
||||||
stardict_iterator_get_real_offset (StardictIterator *sdi)
|
|
||||||
{
|
|
||||||
return sdi->owner->priv->collator ? g_array_index
|
|
||||||
(sdi->owner->priv->collated_index, guint32, sdi->offset) : sdi->offset;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Return the word in the index that the iterator points at, or NULL.
|
/// Return the word in the index that the iterator points at, or NULL.
|
||||||
const gchar *
|
const gchar *
|
||||||
stardict_iterator_get_word (StardictIterator *sdi)
|
stardict_iterator_get_word (StardictIterator *sdi)
|
||||||
|
@ -1320,7 +1308,7 @@ stardict_iterator_get_word (StardictIterator *sdi)
|
||||||
if (!stardict_iterator_is_valid (sdi))
|
if (!stardict_iterator_is_valid (sdi))
|
||||||
return NULL;
|
return NULL;
|
||||||
return g_array_index (sdi->owner->priv->index,
|
return g_array_index (sdi->owner->priv->index,
|
||||||
StardictIndexEntry, stardict_iterator_get_real_offset (sdi)).name;
|
StardictIndexEntry, sdi->offset).name;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the dictionary entry that the iterator points at, or NULL.
|
/// Return the dictionary entry that the iterator points at, or NULL.
|
||||||
|
@ -1330,8 +1318,7 @@ stardict_iterator_get_entry (StardictIterator *sdi)
|
||||||
g_return_val_if_fail (STARDICT_IS_ITERATOR (sdi), NULL);
|
g_return_val_if_fail (STARDICT_IS_ITERATOR (sdi), NULL);
|
||||||
if (!stardict_iterator_is_valid (sdi))
|
if (!stardict_iterator_is_valid (sdi))
|
||||||
return FALSE;
|
return FALSE;
|
||||||
return stardict_dict_get_entry (sdi->owner,
|
return stardict_dict_get_entry (sdi->owner, sdi->offset);
|
||||||
stardict_iterator_get_real_offset (sdi));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return whether the iterator points to a valid index entry.
|
/// Return whether the iterator points to a valid index entry.
|
||||||
|
|
Loading…
Reference in New Issue