如何在Lucene中实现tf-idf和余弦相似度?

如何在Lucene中实现tf-idf和余弦相似度? 我正在使用Lucene 4.2。 我编写的程序没有使用tf-idf和余弦相似度,只使用了TopScoreDocCollector。

import com.mysql.jdbc.Statement; import java.io.BufferedReader; import java.io.File; import java.io.InputStreamReader; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.util.Version; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriter; import java.sql.DriverManager; import java.sql.Connection; import java.sql.ResultSet; import org.apache.lucene.analysis.id.IndonesianAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.*; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.RAMDirectory; public class IndexMysqlDBStemming { public static void main(String[] args) throws Exception { // 1. 
Create Index From Database Class.forName("com.mysql.jdbc.Driver").newInstance(); Connection connection = DriverManager.getConnection("jdbc:mysql://localhost/db_haiquran", "root", ""); IndonesianAnalyzer analyzer = new IndonesianAnalyzer(Version.LUCENE_42); //StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_42); QueryParser parser = new QueryParser(Version.LUCENE_42, "result", analyzer); Directory INDEX_DIR = new RAMDirectory(); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_42, analyzer); IndexWriter writer = new IndexWriter(INDEX_DIR, config); String query = "SELECT * FROM ayat"; java.sql.Statement statement = connection.createStatement(); ResultSet result = statement.executeQuery(query); while (result.next()) { Document document = new Document(); document.add(new Field("NO_INDEX_AYAT", result.getString("NO_INDEX_AYAT"), Field.Store.YES, Field.Index.NOT_ANALYZED)); document.add(new Field("NO_SURAT", result.getString("NO_SURAT"), Field.Store.YES, Field.Index.NOT_ANALYZED)); document.add(new Field("NO_AYAT", result.getString("NO_AYAT"), Field.Store.YES, Field.Index.NOT_ANALYZED)); document.add(new Field("TEXT_INDO", result.getString("TEXT_INDO"), Field.Store.YES, Field.Index.ANALYZED)); document.add(new Field("TEXT_ARAB", result.getString("TEXT_ARAB"), Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.updateDocument(new Term("NO_INDEX_AYAT", result.getString("NO_INDEX_AYAT")), document); } writer.close(); // 2. Query System.out.println("Enter your search keyword in here : "); BufferedReader bufferRead = new BufferedReader(new InputStreamReader(System.in)); String s = bufferRead.readLine(); String querystr = args.length > 0 ? args[0] :s; try { System.out.println(parser.parse(querystr)+"\n"); //amenit System.out.println(); } catch (ParseException ex) { // Exception } Query q = new QueryParser(Version.LUCENE_42, "TEXT_INDO", analyzer).parse(querystr); // 3. 
Search int hitsPerPage = 10; IndexReader reader = DirectoryReader.open(INDEX_DIR); IndexSearcher searcher = new IndexSearcher(reader); TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true); searcher.search(q, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; // 4. Display results System.out.println("Found : " + hits.length + " hits."); System.out.println("No" + " ID " + "\t" + " Surat " + "\t" + " No Ayat " + "\t" + " Terjemahan Ayat " + "\t" + " Teks Arab "); for (int i=0; i<hits.length; i++) { int docID = hits[i].doc; Document d = searcher.doc(docID); System.out.println((i+1) + ". " + d.get("NO_INDEX_AYAT") + "\t" + d.get("NO_SURAT") + "\t" + d.get("NO_AYAT")+ "\t" + d.get("TEXT_INDO") + "\t" + d.get("TEXT_ARAB")); } reader.close(); } } 

如何使用tf-idf和余弦相似度显示计算结果?

除非我遗漏了什么,否则你其实已经实现了。 做得好!

默认情况下使用的相似性算法是DefaultSimilarity ,但您可以在其基类TFIDFSimilarity中找到大多数文档(和逻辑)。

TFIDFSimilarity确实是TF-IDF和余弦相似性评分模型的实现。

Interesting Posts