Subheader

Programming discussions since 1842

Monday, August 8, 2011

Trigram string searching

String searching and sorting is another one of my unexplainable favorite programming pasttimes. I was reading the interesting but tragically short article on the subject on wikipedia when I stumbled upon a special technique for fuzzy string searching known as Trigram string searching. What's that? Well, this is what the (entire) article had to say:
Trigram search is a powerful method of searching for text when the exact syntax or spelling of the target object is not precisely known. It finds objects which match the maximum number of three-character strings in the entered search terms, i.e. near matches. A threshold can be specified as a cutoff point, after which a result is no longer regarded as a match.
Bit short for such an important technique. This is the algorithm that manages to successfully turn up the search results for "Programming" when you spelled it "Porgraming".

The lack of a real implementation to go with the article irritated me. "That can't require very much coding, can it?"

Nope, It didn't!
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

public class TrigramMain {
   
    private static final int OFFSET = 3;
    private static String[] mainArray;
    private static int inputIndex = 0;
    private static Map<String, ArrayList<Integer>> indexMap;
   
    public static void main(String[] args) {       
        readFile();
        makeTrigramIndex();
       
        Scanner scan = new Scanner(System.in);
        String input;
        while (true) {
            System.out.println("Enter search word: ");
            input = scan.next();
            if (input.equals("quit")) {
                System.exit(0);
            } else if (input == null) {
                System.out.println("Search word missing!");
            } else {
                //System.out.println(search(input));
                System.out.println(exclusiveSearch(input,1));
            }
        }
    }
       
    /**
     * An exclusive trigram string search that returns only words with more than 1 match.
     * @param searchWord - The chosen search string
     * @return A string containing the results.
     */
    private static String exclusiveSearch(String searchWord, int minimumMatches) {
        System.out.println("You exclusively searched for " + searchWord + ". Did you mean:");
        ArrayList<String> resultWords = new ArrayList<String>();
       
        ArrayList<Integer> hits = new ArrayList<Integer>();
        String triKey;
        for (int i = 0; i <= searchWord.length()-OFFSET; i++) {
            triKey = searchWord.substring(i, i+OFFSET);
            if (indexMap.containsKey(triKey)) {
                hits = indexMap.get(triKey);
                for (Integer j : hits) {
                    resultWords.add(mainArray[j]);
                }
            }
        }
       
        if (resultWords.isEmpty()) {
            System.out.println("Error!");
        }
       
        StringBuilder result = new StringBuilder();
        while (resultWords.isEmpty() == false) {
            String string = resultWords.get(0);
            int stringFreq = stringFrequency(resultWords, string);
            if (stringFreq >= minimumMatches) {
                result.append(string + " (" + stringFreq + " matches)" + '\n');
            }
            ArrayList<String> removeList = new ArrayList<String>();
            removeList.add(string);
            resultWords.removeAll(removeList);
        }
       
        return result.toString();
    }
   
    private static void makeTrigramIndex() {
        indexMap = new HashMap<String, ArrayList<Integer>>();
        for (int index = 0; index < mainArray.length; index++) {
            String string = mainArray[index];
            int len = string.length();
            for (int i = 0; i <= len-OFFSET; i++) {
                String trigram = string.substring(i, i+OFFSET);
                ArrayList<Integer> wordList;
                if (indexMap.containsKey(trigram)) {
                    wordList = indexMap.get(trigram);
                } else {
                    wordList = new ArrayList<Integer>();
                }
                wordList.add(index);
                indexMap.put(trigram, wordList);
            }
        }
    }
   
    private static void readFile() {
        BufferedReader fin = null;
        ArrayList<String> mainArrayList;
        try {
            fin = new BufferedReader(new FileReader("words.txt"));
            mainArrayList = new ArrayList<String>();
            String line = fin.readLine();
            while(line != null) {
                mainArrayList.add(line);
                inputIndex++;
                line = fin.readLine();
            }
            mainArray = (String[]) mainArrayList.toArray(new String[0]);
        } catch (FileNotFoundException fnfe) {
            fnfe.printStackTrace();
        } catch (IOException ie) {
            ie.printStackTrace();
        } finally {
            if(fin != null) {
                try {
                    fin.close();
                } catch (IOException ie) {
                    ie.printStackTrace();
                }
            }
        }
    }
   
    private static int stringFrequency(List<String> inputList, String searchWord) {
        Iterator<String> it = inputList.iterator();
        int result = 0;
        while (it.hasNext()) {
            String word = (String) it.next();
            if (word.equalsIgnoreCase(searchWord)) {
                result++;
            }
        }
        return result;
    }
   
}

No comments:

Post a Comment