Ik zou tijdens het indexeren een SynonymFilter op het veld firstName toepassen, zodat alle mogelijke varianten in de index staan (Bob -> Robert, Robert -> Bob, enz.). Indexeer op die manier de bestaande gebruikers die je hebt.
Gebruik vervolgens de QueryParser (zonder de SynonymFilter in de analyzer) om fuzzy query's uit te voeren.
Dit is de code die ik bedacht:
/**
 * Demonstrates duplicate-name detection with Lucene.
 *
 * <p>First names are expanded with nickname synonyms at index time, while
 * queries are parsed with a plain {@link WhitespaceAnalyzer} (no synonym
 * expansion). Last names are indexed un-analyzed and can be matched with a
 * fuzzy query such as {@code lastName:Wolliam~}.
 */
public class NameDuplicateTests {
    /** Name of the only field that receives synonym expansion. */
    private static final String FIRST_NAME_FIELD = "firstName";

    /** Maps a first name to the other names of its synonym group. */
    private static final Multimap<String, String> firstNameSynonyms;
    static {
        firstNameSynonyms = HashMultimap.create();
        registerSynonymGroup(ImmutableList.of("Bob", "Bobby", "Robert"));
        registerSynonymGroup(ImmutableList.of("William", "Will", "Bill", "Billy"));
    }

    private Directory directory;
    private Analyzer analyzer;
    private IndexSearcher searcher;
    private IndexReader reader;
    private QueryParser qp;

    /**
     * Registers every name of {@code group} as a synonym of every other name
     * in the same group.
     *
     * <p>A name is deliberately not mapped to itself. NOTE(review): presumably
     * {@code SynonymFilter} emits each returned synonym as an additional token
     * next to the original one, so a self-mapping would index the original
     * term twice -- confirm against the SynonymFilter implementation in use.
     *
     * @param group names that should all be treated as equivalent
     */
    private static void registerSynonymGroup(List<String> group) {
        for (String name : group) {
            for (String alias : group) {
                if (!alias.equals(name)) {
                    firstNameSynonyms.put(name, alias);
                }
            }
        }
    }

    /**
     * Creates the index-time analyzer: whitespace tokenization for every
     * field, plus synonym expansion for the {@code firstName} field only.
     *
     * @return a new analyzer instance
     */
    public static Analyzer createAnalyzer() {
        return new Analyzer() {
            @Override
            public TokenStream tokenStream(String fieldName, Reader reader) {
                TokenStream stream = new WhitespaceTokenizer(reader);
                // Constant-first comparison stays safe if fieldName is null.
                if (FIRST_NAME_FIELD.equals(fieldName)) {
                    stream = new SynonymFilter(stream, new SynonymEngine() {
                        @Override
                        public String[] getSynonyms(String s) throws IOException {
                            // Unknown names yield an empty array, not null.
                            return firstNameSynonyms.get(s).toArray(new String[0]);
                        }
                    });
                }
                return stream;
            }
        };
    }

    /**
     * Builds an in-memory index of five sample people and opens the searcher
     * and query parser used by the test.
     *
     * @throws Exception if the index cannot be written or opened
     */
    @Before
    public void setUp() throws Exception {
        directory = new RAMDirectory();
        analyzer = createAnalyzer();
        ImmutableList<String> firstNames = ImmutableList.of("William", "Robert", "Bobby", "Will", "Anton");
        ImmutableList<String> lastNames = ImmutableList.of("Robert", "Williams", "Mayor", "Bob", "FunkyMother");

        IndexWriter writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            for (int id = 0; id < firstNames.size(); id++) {
                Document doc = new Document();
                doc.add(new Field("id", String.valueOf(id), Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field("firstName", firstNames.get(id), Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("lastName", lastNames.get(id), Field.Store.YES, Field.Index.NOT_ANALYZED));
                writer.addDocument(doc);
            }
        } finally {
            // Release the writer even when adding a document fails.
            writer.close();
        }

        // Query side intentionally has no synonym expansion; the index already
        // contains all variants of each first name.
        qp = new QueryParser(Version.LUCENE_30, "firstName", new WhitespaceAnalyzer());
        searcher = new IndexSearcher(directory);
        reader = searcher.getIndexReader();
    }

    /**
     * Closes the searcher and the directory opened in {@link #setUp()}.
     * Both may be null when {@code setUp} failed part-way through.
     *
     * @throws Exception if closing either resource fails
     */
    @After
    public void tearDown() throws Exception {
        try {
            if (searcher != null) {
                searcher.close();
            }
        } finally {
            if (directory != null) {
                directory.close();
            }
        }
    }

    /**
     * Runs an exact and a fuzzy last-name query, both combined with the
     * nickname "Bob", and prints the hits.
     *
     * @throws Exception if parsing or searching fails
     */
    @Test
    public void testNameFilter() throws Exception {
        search("+firstName:Bob +lastName:Williams");
        search("+firstName:Bob +lastName:Wolliam~");
    }

    /**
     * Parses {@code query}, prints the parsed query, then prints the stored
     * first and last name of up to three top hits.
     *
     * @param query query string in QueryParser syntax
     * @throws ParseException if the query string cannot be parsed
     * @throws IOException if the index cannot be read
     */
    private void search(String query) throws ParseException, IOException {
        Query parsed = qp.parse(query);
        System.out.println(parsed);
        TopDocs hits = searcher.search(parsed, 3);
        for (ScoreDoc hit : hits.scoreDocs) {
            Document doc = reader.document(hit.doc);
            System.out.println("Found " + doc.get("firstName") + " " + doc.get("lastName"));
        }
    }
}
Wat resulteert in:
+firstName:Bob +lastName:Williams
Found Robert Williams
+firstName:Bob +lastName:wolliam~0.5
Found Robert Williams
Ik hoop dat dat helpt!