在lucene中搜索UUID无法正常工作

我有一个UUID字段,我将按以下格式添加到我的文档中:372d325c-e01b-432f-98bd-bc4c949f15b8。 但是,当我尝试通过UUID查询文档时,无论我如何尝试转义表达式,它都不会返回它们。 例如:

+uuid:372d325c-e01b-432f-98bd-bc4c949f15b8 +uuid:"372d325c-e01b-432f-98bd-bc4c949f15b8" +uuid:372d325c\-e01b\-432f\-98bd\-bc4c949f15b8 +uuid:(372d325c-e01b-432f-98bd-bc4c949f15b8) +uuid:("372d325c-e01b-432f-98bd-bc4c949f15b8") 

甚至使用TermQuery完全跳过QueryParser,如下所示:

 new TermQuery(new Term("uuid", uuid.toString())) 

或者:

 new TermQuery(new Term("uuid", QueryParser.escape(uuid.toString()))) 

这些搜索都不会返回文档,但如果我搜索UUID的某些部分,它将返回一个文档。 例如,这些将返回一些东西:

 +uuid:372d325c +uuid:e01b +uuid:432f 

我应该如何索引这些文档,以便能够通过它们的UUID把它们检索出来? 我已经考虑过重新格式化UUID以去掉连字符,但还没有实现。

我让它正常工作的唯一方法是使用WhitespaceAnalyzer而不是StandardAnalyzer。 然后像这样使用TermQuery:

 IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, new WhitespaceAnalyzer(Version.LUCENE_36)) .setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); writer = new IndexWriter( directory, config); 

然后搜索:

 TopDocs docs = searcher.search(new TermQuery(new Term("uuid", uuid.toString())), 1); 

WhitespaceAnalyzer阻止Lucene用连字符拆分UUID。 另一种选择可能是从UUID中消除破折号,但使用WhitespaceAnalyzer也可以用于我的目的。

根据Lucene查询语法规则 ,查询

 +uuid:372d325c\-e01b\-432f\-98bd\-bc4c949f15b8 

应该可以正常工作。

我想如果它不起作用,那是因为文档插入索引时没有正确填充uuid字段。 你能确认这个字段中实际写入了什么吗? 您可以使用Luke浏览索引,并查看uuid字段实际存储的值。

如果您计划将UUID字段作为查找键,则需要让Lucene将整个字段索引为单个字符串,而不进行标记化。 这是通过为UUID字段设置正确的FieldType来完成的。 在Lucene 4+中,您可以使用StringField。

 import java.io.IOException; import java.util.UUID; import junit.framework.Assert; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; import org.junit.Test; /** * Using Lucene 4.7 on Java 7. */ public class LuceneUUIDFieldLookupTest { private Directory directory; private Analyzer analyzer; @Test public void testUsingUUIDAsLookupKey() throws IOException, ParseException { directory = new RAMDirectory(); analyzer = new StandardAnalyzer(Version.LUCENE_47); UUID docUUID = UUID.randomUUID(); String docContentText1 = "Stack Overflow is a question and answer site for professional and enthusiast programmers."; index(docUUID, docContentText1); QueryParser parser = new QueryParser(Version.LUCENE_47, MyIndexedFields.DOC_TEXT_FIELD.name(), analyzer); Query queryForProgrammer = parser.parse("programmers"); IndexSearcher indexSearcher = getIndexSearcher(); TopDocs hits = indexSearcher.search(queryForProgrammer, Integer.MAX_VALUE); Assert.assertTrue(hits.scoreDocs.length == 1); Integer internalDocId1 = hits.scoreDocs[0].doc; Document docRetrieved1 = indexSearcher.doc(internalDocId1); indexSearcher.getIndexReader().close(); String docText1 = docRetrieved1.get(MyIndexedFields.DOC_TEXT_FIELD.name()); 
Assert.assertEquals(docText1, docContentText1); String docContentText2 = "TechCrunch is a leading technology media property, dedicated to ... according to a new report from the Wall Street Journal confirmed by Google to TechCrunch."; reindex(docUUID, docContentText2); Query queryForTechCrunch = parser.parse("technology"); indexSearcher = getIndexSearcher(); //you must reopen directory because the previous IndexSearcher only sees a snapshoted directory. hits = indexSearcher.search(queryForTechCrunch, Integer.MAX_VALUE); Assert.assertTrue(hits.scoreDocs.length == 1); Integer internalDocId2 = hits.scoreDocs[0].doc; Document docRetrieved2 = indexSearcher.doc(internalDocId2); indexSearcher.getIndexReader().close(); String docText2 = docRetrieved2.get(MyIndexedFields.DOC_TEXT_FIELD.name()); Assert.assertEquals(docText2, docContentText2); } private void reindex(UUID myUUID, String docContentText) throws IOException { try (IndexWriter indexWriter = new IndexWriter(directory, getIndexWriterConfig())) { Term term = new Term(MyIndexedFields.MY_UUID_FIELD.name(), myUUID.toString()); indexWriter.updateDocument(term, buildDoc(myUUID, docContentText)); }//auto-close } private void index(UUID myUUID, String docContentText) throws IOException { try (IndexWriter indexWriter = new IndexWriter(directory, getIndexWriterConfig())) { indexWriter.addDocument(buildDoc(myUUID, docContentText)); }//auto-close } private IndexWriterConfig getIndexWriterConfig() { return new IndexWriterConfig(Version.LUCENE_47, analyzer).setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); } private Document buildDoc(UUID myUUID, String docContentText) { Document doc = new Document(); doc.add(new Field( MyIndexedFields.MY_UUID_FIELD.name(), myUUID.toString(), StringField.TYPE_STORED));//use TYPE_STORED if you want to read it back in search result. 
doc.add(new Field( MyIndexedFields.DOC_TEXT_FIELD.name(), docContentText, TextField.TYPE_STORED)); return doc; } private IndexSearcher getIndexSearcher() throws IOException { DirectoryReader ireader = DirectoryReader.open(directory); IndexSearcher indexSearcher = new IndexSearcher(ireader); return indexSearcher; } enum MyIndexedFields { MY_UUID_FIELD, DOC_TEXT_FIELD } }