Finding the position of search hits from Lucene

With Lucene, what is the recommended way of finding the locations of matches within the search results?

More specifically, suppose the indexed documents have a field "fullText" which stores the plain-text content of some document. Furthermore, assume that for one of these documents the content is "the quick brown fox jumped over the lazy dog". Next, a search is performed for "fox dog". Obviously, this document would be a hit.

In this scenario, can Lucene be used to report something like the matching regions of the found document? For this scenario I would like to produce something like:

[{match: "fox", startIndex: 10, length: 3}, {match: "dog", startIndex: 34, length: 3}] 

I suspect it can be achieved with what is provided in the org.apache.lucene.search.highlight package, but I am not sure of the overall approach...
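For context, the highlight package can at least mark up the matching regions inside the stored text, although it returns annotated fragments rather than raw offsets. A minimal sketch, assuming an already-parsed query and the same analyzer that was used at index time; the class and method names here are hypothetical:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class HighlightSketch {

    // Returns the best-scoring fragment with every hit wrapped in <b>...</b>,
    // i.e. marked-up text rather than start/end indexes.
    static String markUpHits(Query query, Analyzer analyzer, String fullText)
            throws IOException, InvalidTokenOffsetsException {
        Highlighter highlighter =
                new Highlighter(new SimpleHTMLFormatter("<b>", "</b>"), new QueryScorer(query));
        TokenStream stream = analyzer.tokenStream("fullText", new StringReader(fullText));
        // e.g. "the quick brown <b>fox</b> jumped over the lazy <b>dog</b>"
        return highlighter.getBestFragment(stream, fullText);
    }
}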

I used TermFreqVector. Here is a working demo that prints the term positions, as well as the starting and ending term indexes:

public class Search {

    public static void main(String[] args) throws IOException, ParseException {
        Search s = new Search();
        s.doSearch(args[0], args[1]);
    }

    Search() {
    }

    public void doSearch(String db, String querystr) throws IOException, ParseException {
        // 1. Specify the analyzer for tokenizing text.
        //    The same analyzer should be used as was used for indexing.
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        Directory index = FSDirectory.open(new File(db));

        // 2. query
        Query q = new QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(querystr);

        // 3. search
        int hitsPerPage = 10;
        IndexSearcher searcher = new IndexSearcher(index, true);
        IndexReader reader = IndexReader.open(index, true);
        searcher.setDefaultFieldSortScoring(true, false);
        TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
        searcher.search(q, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        // 4. display term positions, and term indexes
        System.out.println("Found " + hits.length + " hits.");
        for (int i = 0; i < hits.length; ++i) {
            // Sketch: assumes the "contents" field was indexed with term vectors that
            // include positions and offsets, and that querystr is a single term
            // (otherwise iterate this section over the query terms).
            int docId = hits[i].doc;
            TermPositionVector tpvector =
                    (TermPositionVector) reader.getTermFreqVector(docId, "contents");
            int termidx = tpvector.indexOf(querystr);

            // term positions (token index within the field)
            int[] termposx = tpvector.getTermPositions(termidx);
            for (int j = 0; j < termposx.length; j++) {
                System.out.println("termpos : " + termposx[j]);
            }

            // character offsets (start and end index of each occurrence)
            TermVectorOffsetInfo[] tvoffsetinfo = tpvector.getOffsets(termidx);
            for (int j = 0; j < tvoffsetinfo.length; j++) {
                int offsetStart = tvoffsetinfo[j].getStartOffset();
                int offsetEnd = tvoffsetinfo[j].getEndOffset();
                System.out.println("offsets : " + offsetStart + " " + offsetEnd);
            }
        }

        searcher.close();
        reader.close();
    }
}
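Note that reader.getTermFreqVector(...) only returns usable offsets if the field was indexed with term vectors that include positions and offsets. A minimal sketch of the corresponding indexing side, assuming the same Lucene 3.x Field API as the search code above (the writer and text arguments are placeholders):

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

public class IndexWithOffsets {

    // Adds one document whose "contents" field keeps term vectors with positions
    // and character offsets, so the search code above can read them back.
    static void addDocument(IndexWriter writer, String text) throws IOException {
        Document doc = new Document();
        doc.add(new Field("contents", text,
                Field.Store.YES,
                Field.Index.ANALYZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS));
        writer.addDocument(doc);
    }
}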

Here is a solution for Lucene 5.2.1. It only works for single-word queries, but it should demonstrate the basic principle.

The basic idea is:

  1. Get a TokenStream for every document that matches your query.
  2. Create a QueryScorer and initialize it with the retrieved tokenStream.
  3. "Loop" over every token of the stream (done by tokenStream.incrementToken()) and check whether the token matches the search criteria (done by queryScorer.getTokenScore()).

Here is the code:

import java.io.IOException;
import java.util.List;
import java.util.Vector;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TokenSources;

public class OffsetSearcher {

    private IndexReader reader;

    public OffsetSearcher(IndexWriter indexWriter) throws IOException {
        reader = DirectoryReader.open(indexWriter, true);
    }

    public OffsetData[] getTermOffsets(Query query) throws IOException, InvalidTokenOffsetsException {
        List<OffsetData> result = new Vector<>();

        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs topDocs = searcher.search(query, 1000);
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;

        Document doc;
        TokenStream tokenStream;
        CharTermAttribute termAtt;
        OffsetAttribute offsetAtt;
        QueryScorer queryScorer;
        OffsetData offsetData;
        String txt, tokenText;

        for (int i = 0; i < scoreDocs.length; i++) {
            int docId = scoreDocs[i].doc;
            doc = reader.document(docId);

            txt = doc.get(RunSearch.CONTENT);
            tokenStream = TokenSources.getTokenStream(RunSearch.CONTENT, reader.getTermVectors(docId),
                    txt, new GermanAnalyzer(), -1);

            termAtt = (CharTermAttribute) tokenStream.addAttribute(CharTermAttribute.class);
            offsetAtt = (OffsetAttribute) tokenStream.addAttribute(OffsetAttribute.class);

            queryScorer = new QueryScorer(query);
            queryScorer.setMaxDocCharsToAnalyze(RunSearch.MAX_DOC_CHARS);

            TokenStream newStream = queryScorer.init(tokenStream);
            if (newStream != null) {
                tokenStream = newStream;
            }
            queryScorer.startFragment(null);

            tokenStream.reset();

            int startOffset, endOffset;
            for (boolean next = tokenStream.incrementToken();
                 next && (offsetAtt.startOffset() < RunSearch.MAX_DOC_CHARS);
                 next = tokenStream.incrementToken()) {
                startOffset = offsetAtt.startOffset();
                endOffset = offsetAtt.endOffset();

                if ((endOffset > txt.length()) || (startOffset > txt.length())) {
                    throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
                            + " exceeds length of provided text sized " + txt.length());
                }

                float res = queryScorer.getTokenScore();
                if (res > 0.0F && startOffset <= endOffset) {
                    tokenText = txt.substring(startOffset, endOffset);
                    offsetData = new OffsetData(tokenText, startOffset, endOffset, docId);
                    result.add(offsetData);
                }
            }
        }

        return result.toArray(new OffsetData[result.size()]);
    }

    public void close() throws IOException {
        reader.close();
    }

    public static class OffsetData {

        public String phrase;
        public int startOffset;
        public int endOffset;
        public int docId;

        public OffsetData(String phrase, int startOffset, int endOffset, int docId) {
            super();
            this.phrase = phrase;
            this.startOffset = startOffset;
            this.endOffset = endOffset;
            this.docId = docId;
        }
    }
}
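For completeness, a hedged usage sketch: it assumes an in-memory index, the GermanAnalyzer used above, the RunSearch.CONTENT field-name constant from the code above, and a hypothetical single-word query string; the demo class itself is made up.

import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class OffsetSearcherDemo {

    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new GermanAnalyzer()));
        // ... index some documents into the RunSearch.CONTENT field here ...

        // Single-word query, matching the limitation stated above.
        Query query = new QueryParser(RunSearch.CONTENT, new GermanAnalyzer()).parse("fuchs");

        OffsetSearcher searcher = new OffsetSearcher(writer);
        for (OffsetSearcher.OffsetData d : searcher.getTermOffsets(query)) {
            System.out.println(d.phrase + " [" + d.startOffset + ", " + d.endOffset
                    + ") in doc " + d.docId);
        }

        searcher.close();
        writer.close();
    }
}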