Mahout:用于基于项目的推荐器的调整余弦相似度(Adjusted Cosine Similarity)

为了完成一项作业,我需要测试不同类型的推荐器,而这些推荐器我必须先自己实现。 我一直在寻找一个好用的库(最初考虑过Weka),后来偶然发现了Mahout。

因此,我必须先说明:a)我对Mahout完全不熟悉;b)我没有扎实的推荐器及其算法方面的背景(否则我也不会来提问了……);c)抱歉,我远不是世界上最好的开发者 ==> 如果你能(尽可能)使用通俗易懂的说法,我会非常感激 :)

我照着一些教程(例如这篇教程及其第2部分)做了尝试,并在基于项目和基于用户的推荐器上得到了一些初步结果。

但是,我对基于项目的预测结果不满意。 到目前为止,我只找到了不考虑用户评分偏差的相似度函数。 我想知道是否有类似调整余弦相似度(adjusted cosine similarity)的实现。 有什么建议吗?

以下是我创建的AdjustedCosineSimilarity示例。 请记住,由于需要进行sqrt计算,它会比PearsonCorrelationSimilarity慢,但会产生更好的结果——至少在我的数据集上,结果要好得多。 不过你需要在质量与性能之间做出权衡,并根据自己的需要选择合适的实现。

 /** * Custom implementation of {@link AdjustedCosineSimilarity} * * @author dmilchevski * */ public class AdjustedCosineSimilarity extends AbstractSimilarity { /** * Creates new {@link AdjustedCosineSimilarity} * * @param dataModel * @throws TasteException */ public AdjustedCosineSimilarity(DataModel dataModel) throws TasteException { this(dataModel, Weighting.UNWEIGHTED); } /** * Creates new {@link AdjustedCosineSimilarity} * * @param dataModel * @param weighting * @throws TasteException */ public AdjustedCosineSimilarity(DataModel dataModel, Weighting weighting) throws TasteException { super(dataModel, weighting, true); Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values"); } /** * Compute the result */ @Override double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) { if (n == 0) { return Double.NaN; } // Note that sum of X and sum of Y don't appear here since they are // assumed to be 0; // the data is assumed to be centered. 
double denominator = Math.sqrt(sumX2) * Math.sqrt(sumY2); if (denominator == 0.0) { // One or both parties has -all- the same ratings; // can't really say much similarity under this measure return Double.NaN; } return sumXY / denominator; } /** * Gets the average preference * @param prefs * @return */ private double averagePreference(PreferenceArray prefs){ double sum = 0.0; int n = prefs.length(); for(int i=0; i0){ return sum/n; } return 0.0d; } /** * Compute the item similarity between two items */ @Override public double itemSimilarity(long itemID1, long itemID2) throws TasteException { DataModel dataModel = getDataModel(); PreferenceArray xPrefs = dataModel.getPreferencesForItem(itemID1); PreferenceArray yPrefs = dataModel.getPreferencesForItem(itemID2); int xLength = xPrefs.length(); int yLength = yPrefs.length(); if (xLength == 0 || yLength == 0) { return Double.NaN; } long xIndex = xPrefs.getUserID(0); long yIndex = yPrefs.getUserID(0); int xPrefIndex = 0; int yPrefIndex = 0; double sumX = 0.0; double sumX2 = 0.0; double sumY = 0.0; double sumY2 = 0.0; double sumXY = 0.0; double sumXYdiff2 = 0.0; int count = 0; // No, pref inferrers and transforms don't appy here. I think. while (true) { int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 
1 : 0; if (compare == 0) { // Both users expressed a preference for the item double x = xPrefs.getValue(xPrefIndex); double y = yPrefs.getValue(yPrefIndex); long xUserId = xPrefs.getUserID(xPrefIndex); long yUserId = yPrefs.getUserID(yPrefIndex); double xMean = averagePreference(dataModel.getPreferencesFromUser(xUserId)); double yMean = averagePreference(dataModel.getPreferencesFromUser(yUserId)); sumXY += (x - xMean) * (y - yMean); sumX += x; sumX2 += (x - xMean) * (x - xMean); sumY += y; sumY2 += (y - yMean) * (y - yMean); double diff = x - y; sumXYdiff2 += diff * diff; count++; } if (compare <= 0) { if (++xPrefIndex == xLength) { break; } xIndex = xPrefs.getUserID(xPrefIndex); } if (compare >= 0) { if (++yPrefIndex == yLength) { break; } yIndex = yPrefs.getUserID(yPrefIndex); } } double result; // See comments above on these computations double n = (double) count; double meanX = sumX / n; double meanY = sumY / n; // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * // meanX * meanY; double centeredSumXY = sumXY - meanY * sumX; // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * // meanX; double centeredSumX2 = sumX2 - meanX * sumX; // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * // meanY; double centeredSumY2 = sumY2 - meanY * sumY; // result = computeResult(count, centeredSumXY, centeredSumX2, // centeredSumY2, sumXYdiff2); result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2); if (!Double.isNaN(result)) { result = normalizeWeightResult(result, count, dataModel.getNumUsers()); } return result; } }