Make prefix matching behave better
It seems to me that we _do_ want to ignore diacritics most of the time. I've also fixed the fallback.
This commit is contained in:
		@@ -379,6 +379,7 @@ struct stardict_dict_private
 | 
				
			|||||||
	// The collated indexes are only permutations of their normal selves.
 | 
						// The collated indexes are only permutations of their normal selves.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	UCollator     * collator;           //!< ICU index collator
 | 
						UCollator     * collator;           //!< ICU index collator
 | 
				
			||||||
 | 
						UCollator     * collator_root;      //!< ICU fallback root collator
 | 
				
			||||||
	GArray        * collated_synonyms;  //!< Sorted indexes into @a synonyms
 | 
						GArray        * collated_synonyms;  //!< Sorted indexes into @a synonyms
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// There are currently three ways the dictionary data can be read:
 | 
						// There are currently three ways the dictionary data can be read:
 | 
				
			||||||
@@ -409,6 +410,8 @@ stardict_dict_finalize (GObject *self)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
	if (priv->collator)
 | 
						if (priv->collator)
 | 
				
			||||||
		ucol_close (priv->collator);
 | 
							ucol_close (priv->collator);
 | 
				
			||||||
 | 
						if (priv->collator_root)
 | 
				
			||||||
 | 
							ucol_close (priv->collator_root);
 | 
				
			||||||
	if (priv->collated_synonyms)
 | 
						if (priv->collated_synonyms)
 | 
				
			||||||
		g_array_free (priv->collated_synonyms, TRUE);
 | 
							g_array_free (priv->collated_synonyms, TRUE);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -836,8 +839,12 @@ stardict_dict_new_from_info (StardictInfo *sdi, GError **error)
 | 
				
			|||||||
		(void) load_syn (sd, base_syn, NULL);
 | 
							(void) load_syn (sd, base_syn, NULL);
 | 
				
			||||||
	g_free (base_syn);
 | 
						g_free (base_syn);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (sdi->collation)
 | 
						// We need a fallback collator to find common prefixes
 | 
				
			||||||
		(void) stardict_dict_set_collation (sd, sdi->collation);
 | 
						if (!sdi->collation || !stardict_dict_set_collation (sd, sdi->collation))
 | 
				
			||||||
 | 
						{
 | 
				
			||||||
 | 
							UErrorCode error = U_ZERO_ERROR;
 | 
				
			||||||
 | 
							sd->priv->collator_root = ucol_open ("" /* root collator */, &error);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	g_free (base);
 | 
						g_free (base);
 | 
				
			||||||
	return sd;
 | 
						return sd;
 | 
				
			||||||
@@ -933,22 +940,21 @@ stardict_dict_search (StardictDict *sd, const gchar *word, gboolean *success)
 | 
				
			|||||||
	BINARY_SEARCH_END
 | 
						BINARY_SEARCH_END
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// Try to find a longer common prefix with a preceding entry
 | 
						// Try to find a longer common prefix with a preceding entry
 | 
				
			||||||
	// FIXME: this doesn't work well with diacritics, which are ignored when
 | 
					 | 
				
			||||||
	//   reordering but can take us to an entry with a suboptimal prefix
 | 
					 | 
				
			||||||
	//   while searching
 | 
					 | 
				
			||||||
#define PREFIX(i) stardict_longest_common_collation_prefix \
 | 
					#define PREFIX(i) stardict_longest_common_collation_prefix \
 | 
				
			||||||
	(sd, word, g_array_index (index, StardictIndexEntry, i).name)
 | 
						(sd, word, g_array_index (index, StardictIndexEntry, i).name)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// We need to take care not to step through the entire dictionary
 | 
						// We need to take care not to step through the entire dictionary
 | 
				
			||||||
	// if not a single character matches, because it can be quite costly
 | 
						// if not a single character matches, because it can be quite costly
 | 
				
			||||||
	size_t probe, best = PREFIX (imin);
 | 
						size_t probe, best = PREFIX (imin);
 | 
				
			||||||
 | 
						while (best && imin > 0 && (probe = PREFIX (imin - 1)) >= best)
 | 
				
			||||||
	// XXX: only looking for _better_ backward matches here, since the
 | 
					 | 
				
			||||||
	//   fallback common prefix searching algorithm doesn't ignore case
 | 
					 | 
				
			||||||
	size_t needed_improvement = !sd->priv->collator;
 | 
					 | 
				
			||||||
	while (best && imin > 0
 | 
					 | 
				
			||||||
		&& (probe = PREFIX (imin - 1)) >= best + needed_improvement)
 | 
					 | 
				
			||||||
	{
 | 
						{
 | 
				
			||||||
 | 
							// TODO: take more care to not screw up exact matches,
 | 
				
			||||||
 | 
							//   use several "best"s according to quality
 | 
				
			||||||
 | 
							//   (the most severe issue here is ignored diacritics)
 | 
				
			||||||
 | 
							if (!strcmp (word, g_array_index
 | 
				
			||||||
 | 
								(index, StardictIndexEntry, imin).name))
 | 
				
			||||||
 | 
								break;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		best = probe;
 | 
							best = probe;
 | 
				
			||||||
		imin--;
 | 
							imin--;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
@@ -985,13 +991,14 @@ stardict_longest_common_collation_prefix (StardictDict *sd,
 | 
				
			|||||||
	if (U_FAILURE (error))
 | 
						if (U_FAILURE (error))
 | 
				
			||||||
		return 0;
 | 
							return 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						UCollator *collator = sd->priv->collator;
 | 
				
			||||||
 | 
						if (!collator && !(collator = sd->priv->collator_root))
 | 
				
			||||||
 | 
							return 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// ucol_getSortKey() can't be used for these purposes, so the only
 | 
						// ucol_getSortKey() can't be used for these purposes, so the only
 | 
				
			||||||
	// reasonable thing remaining is iterating by full graphemes.  It doesn't
 | 
						// reasonable thing remaining is iterating by full graphemes.  It doesn't
 | 
				
			||||||
	// work entirely correctly (e.g. Czech "ch" should be regarded as a single
 | 
						// work entirely correctly (e.g. Czech "ch" should be regarded as a single
 | 
				
			||||||
	// unit, and punctuation could be ignored).  It's just good enough.
 | 
						// unit).  It's just good enough for most purposes.
 | 
				
			||||||
	//
 | 
					 | 
				
			||||||
	// In theory we could set the strength to UCOL_PRIMARY and ignore accents
 | 
					 | 
				
			||||||
	// but that's likely not what the user wants most of the time.
 | 
					 | 
				
			||||||
	//
 | 
						//
 | 
				
			||||||
	// Locale shouldn't matter much with graphemes, let's use the default.
 | 
						// Locale shouldn't matter much with graphemes, let's use the default.
 | 
				
			||||||
	UBreakIterator *it1 =
 | 
						UBreakIterator *it1 =
 | 
				
			||||||
@@ -999,23 +1006,21 @@ stardict_longest_common_collation_prefix (StardictDict *sd,
 | 
				
			|||||||
	UBreakIterator *it2 =
 | 
						UBreakIterator *it2 =
 | 
				
			||||||
		ubrk_open (UBRK_CHARACTER, NULL, uc2, uc2_len, &error);
 | 
							ubrk_open (UBRK_CHARACTER, NULL, uc2, uc2_len, &error);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						UCollationStrength prev_strength = ucol_getStrength (collator);
 | 
				
			||||||
 | 
						ucol_setStrength (collator, UCOL_PRIMARY);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	int32_t longest = 0;
 | 
						int32_t longest = 0;
 | 
				
			||||||
	int32_t pos1, pos2;
 | 
						int32_t pos1, pos2;
 | 
				
			||||||
	while ((pos1 = ubrk_next (it1)) != UBRK_DONE
 | 
						while ((pos1 = ubrk_next (it1)) != UBRK_DONE
 | 
				
			||||||
		&& (pos2 = ubrk_next (it2)) != UBRK_DONE)
 | 
							&& (pos2 = ubrk_next (it2)) != UBRK_DONE)
 | 
				
			||||||
	{
 | 
						{
 | 
				
			||||||
		if (sd->priv->collator)
 | 
							if (!ucol_strcoll (collator, uc1, pos1, uc2, pos2))
 | 
				
			||||||
		{
 | 
					 | 
				
			||||||
			if (!ucol_strcoll (sd->priv->collator, uc1, pos1, uc2, pos2))
 | 
					 | 
				
			||||||
				longest = pos1;
 | 
					 | 
				
			||||||
		}
 | 
					 | 
				
			||||||
		// XXX: I'd need a new collator, so just do the minimal working thing
 | 
					 | 
				
			||||||
		else if (pos1 == pos2 && !memcmp (uc1, uc2, pos1 * sizeof *uc1))
 | 
					 | 
				
			||||||
			longest = pos1;
 | 
								longest = pos1;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	ubrk_close (it1);
 | 
						ubrk_close (it1);
 | 
				
			||||||
	ubrk_close (it2);
 | 
						ubrk_close (it2);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						ucol_setStrength (collator, prev_strength);
 | 
				
			||||||
	if (!longest)
 | 
						if (!longest)
 | 
				
			||||||
		return 0;
 | 
							return 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user