Searching different keywords in a text efficiently

Hello folks,

This week I wrote a search tool that recognizes…

  • single words
  • sentences
  • plural and singular English words as the same word
  • accented and unaccented characters as the same

…using regular expressions, LINQ, and a custom IEqualityComparer implementation. A good application for this tool would be, for example, product-name matching.

A keyword can be a single word, two words joined by a hyphen, a phrase of two or more words, a singular or plural word, or an accented word.

Example of keywords: “coke, coca-cola, coca cola, guaraná, drink”. The search tool would have to find and match any of these:

Coca-Cola, also known as Coke, is a carbonated soft drink sold in stores, restaurants, and vending machines internationally, as well as Guarana Antarctica, both are the most popular soft drinks in Brazil, where the American one is known as Coca cola.

You can take my example and change it to meet your requirements. Perhaps you would like to match all of the keywords rather than just any of them, as I'm doing here. Have fun!

To identify the characters that separate words within a text, I use the following regular expression: every time one of these symbols appears, the text is split at that point.

// Regex character class matching one word-separator character:
// a comma, a period, a semicolon, or any whitespace character.
string SEPARATOR_PATTERN = @"[,.;\s]";
/// <summary>
/// Determines whether the text contains at least one of the given keywords.
/// </summary>
/// <remarks>
/// Single and double quote characters are removed from the text, and the result
/// is trimmed, before the keywords are checked. The per-keyword matching is
/// delegated to <c>checkKeyword</c>; the search stops at the first keyword found.
/// </remarks>
/// <param name="text">The text to be searched.</param>
/// <param name="keywords">The keywords to look for.</param>
/// <returns>
/// <c>true</c> if any keyword is found in the text; <c>false</c> if none is found,
/// if <paramref name="text"/> is null, empty or whitespace, or if
/// <paramref name="keywords"/> is null.
/// </returns>
public bool ContainsAny(string text, string[] keywords)
{
    if (string.IsNullOrWhiteSpace(text) || keywords == null)
        return false;

    // Reinforce the equality by eliminating single and double quotes.
    // The static Regex.Replace overload reuses the framework's pattern cache,
    // so no Regex object has to be constructed on every call.
    string cleaned = Regex.Replace(text, @"('|"")", "").Trim();

    foreach (var key in keywords)
    {
        if (checkKeyword(key, cleaned))
            return true;
    }

    return false;
}

/// <summary>
/// Determines whether the text contains every one of the given keywords.
/// </summary>
/// <remarks>
/// Single and double quote characters are removed from the text, and the result
/// is trimmed, before the keywords are checked (the same normalisation that
/// <c>ContainsAny</c> applies). The per-keyword matching is delegated to
/// <c>checkKeyword</c>; the search stops at the first keyword that is missing.
/// </remarks>
/// <param name="text">The text to be searched.</param>
/// <param name="keywords">The keywords that must all be present.</param>
/// <returns>
/// <c>true</c> if every keyword is found in the text (including the case of an
/// empty keyword array); <c>false</c> if any keyword is missing, if
/// <paramref name="text"/> is null, empty or whitespace, or if
/// <paramref name="keywords"/> is null.
/// </returns>
public bool ContainsAll(string text, string[] keywords)
{
    if (string.IsNullOrWhiteSpace(text) || keywords == null)
        return false;

    // Reinforce the equality by eliminating single and double quotes.
    string cleaned = Regex.Replace(text, @"('|"")", "").Trim();

    // The result depends only on the keywords passed in, not on any
    // collection held elsewhere: one miss is enough to answer false.
    foreach (var key in keywords)
    {
        if (!checkKeyword(key, cleaned))
            return false;
    }

    return true;
}

/// <summary>
/// Checks whether a single keyword occurs in the text as a whole word or phrase.
/// </summary>
/// <remarks>
/// Quotes are stripped from the keyword and it is trimmed first. If the
/// normalised keyword still contains a separator character it is treated as a
/// phrase and must appear in the text (in its given or pluralised form,
/// ignoring case) delimited by separators or by the start/end of the text.
/// Otherwise the text is split into words and each word longer than one
/// character is compared with the keyword through <c>KeywordComparer</c>.
/// </remarks>
/// <param name="key">The keyword to look for.</param>
/// <param name="text">The text to be searched.</param>
/// <returns>
/// <c>true</c> if the keyword is found; <c>false</c> otherwise, including when
/// the keyword is null, empty or whitespace, or when the text is null or empty.
/// </returns>
private bool checkKeyword(string key, string text)
{
    if (string.IsNullOrWhiteSpace(key) || string.IsNullOrEmpty(text))
        return false;

    // Reinforce the equality by eliminating single and double quotes.
    key = Regex.Replace(key, @"('|"")", "").Trim();

    // A keyword made only of quotes/whitespace would otherwise match any text.
    if (key.Length == 0)
        return false;

    // Decide phrase vs. single word AFTER trimming, so that a keyword such as
    // " coke" (leading blank left over from a comma-separated list) is still
    // handled by the word comparer.
    bool isPhrase = re.IsMatch(key);

    if (isPhrase)
    {
        // Escape the keyword so regex metacharacters in it are matched literally.
        string alternatives = Regex.Escape(key) + "|" + Regex.Escape(key.ToPlural());

        // Require a separator (or the text boundary) on both sides so that a
        // phrase never matches the beginning of a longer word.
        string pattern = "(?:^|" + SEPARATOR_PATTERN + ")(?:" + alternatives + ")(?:" + SEPARATOR_PATTERN + "|$)";

        return Regex.IsMatch(text, pattern, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
    }

    var comparer = new KeywordComparer();

    // Single-character tokens are skipped, as they are not considered words.
    return re.Split(text)
             .Where(w => w.Length > 1)
             .Any(w => comparer.Equals(w, key));
}

/// <summary>
/// Compares two words for keyword equality: quotes are ignored, and the
/// comparison ignores case and non-spacing marks (accents). Two words are
/// also considered equal when one equals the pluralised form of the other.
/// </summary>
private class KeywordComparer : IEqualityComparer<string>
{
    // Built once and shared: constructing a compiled Regex on every Equals
    // call is far more expensive than the comparison itself.
    private static readonly Regex QuotePattern = new Regex(@"('|"")", RegexOptions.Compiled);

    private const CompareOptions Options = CompareOptions.IgnoreNonSpace | CompareOptions.IgnoreCase;

    /// <summary>
    /// Returns true when the two words are the same keyword.
    /// </summary>
    /// <param name="x">First word.</param>
    /// <param name="y">Second word.</param>
    /// <returns>
    /// <c>true</c> if both references are the same, or if, after removing
    /// quotes and trimming, the words match directly or through the plural
    /// of either one; <c>false</c> if exactly one of them is null or no form matches.
    /// </returns>
    public bool Equals(string x, string y)
    {
        if (Object.ReferenceEquals(x, y)) return true;

        if (x == null || y == null)
            return false;

        // Reinforce the equality by eliminating single and double quotes.
        x = QuotePattern.Replace(x, "").Trim();
        y = QuotePattern.Replace(y, "").Trim();

        return AreSame(y, x)
            || AreSame(y, x.ToPlural())
            || AreSame(y.ToPlural(), x);
    }

    /// <summary>
    /// Always returns the same hash code.
    /// </summary>
    /// <remarks>
    /// Strings that <see cref="Equals(string, string)"/> treats as equal (case,
    /// accent and plural variants) must produce the same hash code; a constant
    /// guarantees that, at the cost of hash-based lookups falling back to
    /// calling Equals for every candidate.
    /// </remarks>
    public int GetHashCode(string str)
    {
        return 0;
    }

    // Case- and accent-insensitive comparison under the invariant culture.
    private static bool AreSame(string a, string b)
    {
        return string.Compare(a, b, CultureInfo.InvariantCulture, Options) == 0;
    }
}

To pluralize a word or phrase, simply use the extension method below:

/// <summary>
/// Returns a simple English plural of this string, based on its ending.
/// </summary>
/// <remarks>
/// Only the end of the string is inspected, so for a phrase the last word is
/// the one pluralised. The rules, all case-insensitive, are applied in order:
/// a string ending in "es" is assumed to be plural already and is returned
/// unchanged; an ending of s, z, x, sh or ch gets "es"; a final "y" that is not
/// preceded by a vowel is replaced by "ies"; anything else gets "s".
/// Irregular plurals are not handled.
/// </remarks>
/// <param name="source">The word or phrase to pluralise.</param>
/// <returns>
/// The pluralised string; a null or empty <paramref name="source"/> is
/// returned as-is.
/// </returns>
public static string ToPlural(this string source)
{
    if (string.IsNullOrEmpty(source))
        return source;

    // Ignore plurals ("es" also covers words ending in "ies").
    if (source.EndsWith("es", true, CultureInfo.InvariantCulture))
        return source;

    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

    // Anchored to the end of the string (\z): a sibilant ending on an earlier
    // word of a phrase must not influence the suffix that gets appended.
    if (Regex.IsMatch(source, @"(?:s|z|x|sh|ch)\z", options))
        return source + "es";

    // Consonant + y -> "ies"; vowel + y simply takes an "s".
    if (source.EndsWith("y", true, CultureInfo.InvariantCulture) &&
        !Regex.IsMatch(source, @"[aeiou]y\z", options))
        return source.Substring(0, source.Length - 1) + "ies";

    return source + "s";
}

 

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s