Trigram search is a powerful method of searching for text when the exact syntax or spelling of the target object is not precisely known. It finds objects which match the maximum number of three-character strings in the entered search terms, i.e. near matches. A threshold can be specified as a cutoff point, after which a result is no longer regarded as a match.Bit short for such an important technique. This is the algorithm that manages to successfully turn up the search results for "Programming" when you spelled it "Porgraming".
The lack of a real implementation to go with the article irritated me. "That can't require very much coding, can it?"
Nope, It didn't!
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
public class TrigramMain {
private static final int OFFSET = 3;
private static String[] mainArray;
private static int inputIndex = 0;
private static Map<String, ArrayList<Integer>> indexMap;
public static void main(String[] args) {
readFile();
makeTrigramIndex();
Scanner scan = new Scanner(System.in);
String input;
while (true) {
System.out.println("Enter search word: ");
input = scan.next();
if (input.equals("quit")) {
System.exit(0);
} else if (input == null) {
System.out.println("Search word missing!");
} else {
//System.out.println(search(input));
System.out.println(exclusiveSearch(input,1));
}
}
}
/**
* An exclusive trigram string search that returns only words with more than 1 match.
* @param searchWord - The chosen search string
* @return A string containing the results.
*/
private static String exclusiveSearch(String searchWord, int minimumMatches) {
System.out.println("You exclusively searched for " + searchWord + ". Did you mean:");
ArrayList<String> resultWords = new ArrayList<String>();
ArrayList<Integer> hits = new ArrayList<Integer>();
String triKey;
for (int i = 0; i <= searchWord.length()-OFFSET; i++) {
triKey = searchWord.substring(i, i+OFFSET);
if (indexMap.containsKey(triKey)) {
hits = indexMap.get(triKey);
for (Integer j : hits) {
resultWords.add(mainArray[j]);
}
}
}
if (resultWords.isEmpty()) {
System.out.println("Error!");
}
StringBuilder result = new StringBuilder();
while (resultWords.isEmpty() == false) {
String string = resultWords.get(0);
int stringFreq = stringFrequency(resultWords, string);
if (stringFreq >= minimumMatches) {
result.append(string + " (" + stringFreq + " matches)" + '\n');
}
ArrayList<String> removeList = new ArrayList<String>();
removeList.add(string);
resultWords.removeAll(removeList);
}
return result.toString();
}
private static void makeTrigramIndex() {
indexMap = new HashMap<String, ArrayList<Integer>>();
for (int index = 0; index < mainArray.length; index++) {
String string = mainArray[index];
int len = string.length();
for (int i = 0; i <= len-OFFSET; i++) {
String trigram = string.substring(i, i+OFFSET);
ArrayList<Integer> wordList;
if (indexMap.containsKey(trigram)) {
wordList = indexMap.get(trigram);
} else {
wordList = new ArrayList<Integer>();
}
wordList.add(index);
indexMap.put(trigram, wordList);
}
}
}
private static void readFile() {
BufferedReader fin = null;
ArrayList<String> mainArrayList;
try {
fin = new BufferedReader(new FileReader("words.txt"));
mainArrayList = new ArrayList<String>();
String line = fin.readLine();
while(line != null) {
mainArrayList.add(line);
inputIndex++;
line = fin.readLine();
}
mainArray = (String[]) mainArrayList.toArray(new String[0]);
} catch (FileNotFoundException fnfe) {
fnfe.printStackTrace();
} catch (IOException ie) {
ie.printStackTrace();
} finally {
if(fin != null) {
try {
fin.close();
} catch (IOException ie) {
ie.printStackTrace();
}
}
}
}
private static int stringFrequency(List<String> inputList, String searchWord) {
Iterator<String> it = inputList.iterator();
int result = 0;
while (it.hasNext()) {
String word = (String) it.next();
if (word.equalsIgnoreCase(searchWord)) {
result++;
}
}
return result;
}
}
No comments:
Post a Comment