如何使用SentiWordNet

我需要对包含推文的一些csv文件进行情绪分析。 我正在使用SentiWordNet进行情绪分析。

我拿到了他们在网站上提供的以下示例Java代码，但不确定该如何使用它。我要分析的csv文件的路径是C:\Users\MyName\Desktop\tweets.csv，SentiWordNet_3.0.0.txt的路径是C:\Users\MyName\Desktop\SentiWordNet_3.0.0\home\swn\www\admin\dump\SentiWordNet_3.0.0_20130122.txt。我是Java新手，请帮忙，谢谢！示例Java代码的链接在这里，代码如下。

 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.Vector;

 /**
  * Sentiment lexicon built from a SentiWordNet 3.0 dump.
  *
  * <p>Each data line of the dump is expected to be tab-separated:
  * {@code POS, ID, PosScore, NegScore, SynsetTerms, Gloss}, where {@code SynsetTerms}
  * is a space-separated list of {@code word#senseRank} entries. For every
  * {@code word#POS} pair the per-sense scores ({@code PosScore - NegScore}) are combined
  * into one value, weighting sense rank {@code r} by {@code 1/r}, and that value is
  * mapped to a textual label.
  */
 public class SWN3 {

     /** Location of the SentiWordNet dump that is loaded by the constructor. */
     private String pathToSWN = "data" + File.separator + "SentiWordNet_3.0.0.txt";

     /** Maps {@code "word#pos"} to its sentiment label. */
     private HashMap<String, String> _dict;

     /** Loads the lexicon from the default location {@code data/SentiWordNet_3.0.0.txt}. */
     public SWN3() {
         _dict = new HashMap<>();
         load();
     }

     /**
      * Loads the lexicon from an explicit location.
      *
      * @param pathToSWN path of the SentiWordNet dump file
      */
     public SWN3(String pathToSWN) {
         this.pathToSWN = pathToSWN;
         _dict = new HashMap<>();
         load();
     }

     /**
      * Looks up the sentiment label of a word for one part of speech.
      *
      * @param word the word exactly as it is spelled in the dump
      * @param pos  the part-of-speech tag used in the first column of the dump
      *             (for example {@code "n"}, {@code "v"}, {@code "a"} or {@code "r"})
      * @return one of {@code strong_positive}, {@code positive}, {@code weak_positive},
      *         {@code weak_negative}, {@code negative}, {@code strong_negative}, the empty
      *         string for a score of exactly zero, or {@code null} when the pair is unknown
      */
     public String extract(String word, String pos) {
         return _dict.get(word + "#" + pos);
     }

     /**
      * Reads the dump and fills {@code _dict}. An I/O failure is reported on standard
      * error and leaves the dictionary empty, so lookups then simply return {@code null}.
      */
     private void load() {
         Map<String, List<Double>> sensesByTerm = new HashMap<>();
         try (BufferedReader reader = new BufferedReader(
                 new InputStreamReader(new FileInputStream(pathToSWN), StandardCharsets.UTF_8))) {
             String line;
             while ((line = reader.readLine()) != null) {
                 recordLine(line, sensesByTerm);
             }
         } catch (IOException e) {
             e.printStackTrace();
             return;
         }
         for (Map.Entry<String, List<Double>> entry : sensesByTerm.entrySet()) {
             _dict.put(entry.getKey(), label(weightedScore(entry.getValue())));
         }
     }

     /** Parses one line of the dump and stores the score of every term it lists. */
     private static void recordLine(String line, Map<String, List<Double>> sensesByTerm) {
         // Comment/header lines start with '#'; skipping them means the file does not
         // have to be cleaned by hand before it can be loaded.
         if (line.trim().isEmpty() || line.charAt(0) == '#') {
             return;
         }
         String[] fields = line.split("\t");
         if (fields.length < 5) {
             return;
         }
         double score;
         try {
             score = Double.parseDouble(fields[2]) - Double.parseDouble(fields[3]);
         } catch (NumberFormatException ignored) {
             // A line without numeric scores carries no usable data; skip just this line.
             return;
         }
         for (String term : fields[4].split(" ")) {
             int hash = term.lastIndexOf('#');
             if (hash <= 0) {
                 continue;
             }
             int rank;
             try {
                 rank = Integer.parseInt(term.substring(hash + 1));
             } catch (NumberFormatException ignored) {
                 // Term without a numeric sense rank; nothing to index it by.
                 continue;
             }
             if (rank < 1) {
                 continue;
             }
             String key = term.substring(0, hash) + "#" + fields[0];
             List<Double> senses = sensesByTerm.get(key);
             if (senses == null) {
                 senses = new ArrayList<>();
                 sensesByTerm.put(key, senses);
             }
             // Pad with neutral scores up to this rank, then overwrite the slot. Replacing
             // (rather than inserting) keeps senses seen earlier at their own rank.
             while (senses.size() < rank) {
                 senses.add(0.0);
             }
             senses.set(rank - 1, score);
         }
     }

     /** Combines per-sense scores, weighting sense rank r by 1/r and normalising by the weight sum. */
     private static double weightedScore(List<Double> senses) {
         double weighted = 0.0;
         double norm = 0.0;
         for (int i = 0; i < senses.size(); i++) {
             double weight = 1.0 / (i + 1);
             weighted += weight * senses.get(i);
             norm += weight;
         }
         return weighted / norm;
     }

     /** Maps a score in [-1, 1] to its label; the bands are contiguous and zero maps to "". */
     private static String label(double score) {
         if (score >= 0.75) {
             return "strong_positive";
         }
         if (score > 0.25) {
             return "positive";
         }
         if (score > 0) {
             return "weak_positive";
         }
         if (score <= -0.75) {
             return "strong_negative";
         }
         if (score < -0.25) {
             return "negative";
         }
         if (score < 0) {
             return "weak_negative";
         }
         return "";
     }
 }

新代码：

 import java.io.BufferedReader;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;

 /**
  * Numeric sentiment lexicon built from a SentiWordNet 3.0 dump, plus a simple
  * tweet classifier on top of it.
  *
  * <p>Each data line of the dump is expected to be tab-separated:
  * {@code POS, ID, PosScore, NegScore, SynsetTerms, Gloss}, where {@code SynsetTerms}
  * is a space-separated list of {@code word#senseRank} entries. For every
  * {@code word#POS} pair the per-sense scores ({@code PosScore - NegScore}) are combined
  * into one value, weighting sense rank {@code r} by {@code 1/r}.
  */
 public class SWN3 {

     /** Part-of-speech tags whose scores are added together by {@link #extract(String)}. */
     private static final String[] POS_TAGS = {"n", "a", "r", "v"};

     // NOTE(review): the surrounding text names the dump file
     // SentiWordNet_3.0.0_20130122.txt; confirm this path matches the file on disk.
     private String pathToSWN = "C:\\Users\\MyName\\Desktop\\SentiWordNet_3.0.0\\home\\swn\\www\\admin\\dump\\SentiWordNet_3.0.0.txt";

     /** Maps {@code "word#pos"} to its weighted sentiment score. */
     private HashMap<String, Double> _dict;

     /** Loads the lexicon from the default, hard-coded location. */
     public SWN3() {
         _dict = new HashMap<>();
         load();
     }

     /**
      * Loads the lexicon from an explicit location.
      *
      * @param pathToSWN path of the SentiWordNet dump file
      */
     public SWN3(String pathToSWN) {
         this.pathToSWN = pathToSWN;
         _dict = new HashMap<>();
         load();
     }

     /**
      * Returns the sentiment score of a word summed over the noun, adjective, adverb
      * and verb entries.
      *
      * @param word the word exactly as it is spelled in the dump
      * @return the summed score; {@code 0.0} when the word is unknown (never {@code null})
      */
     public Double extract(String word) {
         double total = 0.0;
         for (String pos : POS_TAGS) {
             Double score = _dict.get(word + "#" + pos);
             if (score != null) {
                 total += score;
             }
         }
         return total;
     }

     /**
      * Classifies a piece of text by the summed score of its words.
      *
      * <p>The text is split on whitespace and every character other than an ASCII letter
      * is stripped from each token before the lookup. The resulting sum (not a mean) is
      * mapped to a label: {@code >= 0.75} very positive, {@code > 0.25} positive,
      * {@code <= -0.75} very negative, {@code < 0} negative, everything else neutral.
      *
      * @param twit the text to classify
      * @return {@code "very positive"}, {@code "positive"}, {@code "neutral"},
      *         {@code "negative"} or {@code "very negative"}
      */
     public String classify(String twit) {
         double totalScore = 0;
         for (String word : twit.split("\\s+")) {
             word = word.replaceAll("([^a-zA-Z\\s])", "");
             if (word.isEmpty()) {
                 continue;
             }
             totalScore += extract(word);
         }
         double averageScore = totalScore;
         if (averageScore >= 0.75) {
             return "very positive";
         }
         if (averageScore > 0.25) {
             return "positive";
         }
         if (averageScore <= -0.75) {
             return "very negative";
         }
         if (averageScore < 0) {
             return "negative";
         }
         return "neutral";
     }

     /**
      * Classifies each command-line argument and prints {@code label<TAB>text}.
      * Does nothing when no arguments are given.
      */
     public static void main(String[] args) {
         if (args.length == 0) {
             return;
         }
         SWN3 lexicon = new SWN3();
         for (String text : args) {
             System.out.println(lexicon.classify(text) + "\t" + text);
         }
     }

     /**
      * Reads the dump and fills {@code _dict}. An I/O failure is reported on standard
      * error and leaves the dictionary empty, so every lookup then scores {@code 0.0}.
      */
     private void load() {
         Map<String, List<Double>> sensesByTerm = new HashMap<>();
         try (BufferedReader reader = new BufferedReader(
                 new InputStreamReader(new FileInputStream(pathToSWN), StandardCharsets.UTF_8))) {
             String line;
             while ((line = reader.readLine()) != null) {
                 recordLine(line, sensesByTerm);
             }
         } catch (IOException e) {
             e.printStackTrace();
             return;
         }
         for (Map.Entry<String, List<Double>> entry : sensesByTerm.entrySet()) {
             _dict.put(entry.getKey(), weightedScore(entry.getValue()));
         }
     }

     /** Parses one line of the dump and stores the score of every term it lists. */
     private static void recordLine(String line, Map<String, List<Double>> sensesByTerm) {
         // Comment/header lines start with '#'; skipping them means the file does not
         // have to be cleaned by hand before it can be loaded.
         if (line.trim().isEmpty() || line.charAt(0) == '#') {
             return;
         }
         String[] fields = line.split("\t");
         if (fields.length < 5) {
             return;
         }
         double score;
         try {
             score = Double.parseDouble(fields[2]) - Double.parseDouble(fields[3]);
         } catch (NumberFormatException ignored) {
             // A line without numeric scores carries no usable data; skip just this line.
             return;
         }
         for (String term : fields[4].split(" ")) {
             int hash = term.lastIndexOf('#');
             if (hash <= 0) {
                 continue;
             }
             int rank;
             try {
                 rank = Integer.parseInt(term.substring(hash + 1));
             } catch (NumberFormatException ignored) {
                 // Term without a numeric sense rank; nothing to index it by.
                 continue;
             }
             if (rank < 1) {
                 continue;
             }
             String key = term.substring(0, hash) + "#" + fields[0];
             List<Double> senses = sensesByTerm.get(key);
             if (senses == null) {
                 senses = new ArrayList<>();
                 sensesByTerm.put(key, senses);
             }
             // Pad with neutral scores up to this rank, then overwrite the slot. Replacing
             // (rather than inserting) keeps senses seen earlier at their own rank.
             while (senses.size() < rank) {
                 senses.add(0.0);
             }
             senses.set(rank - 1, score);
         }
     }

     /** Combines per-sense scores, weighting sense rank r by 1/r and normalising by the weight sum. */
     private static double weightedScore(List<Double> senses) {
         double weighted = 0.0;
         double norm = 0.0;
         for (int i = 0; i < senses.size(); i++) {
             double weight = 1.0 / (i + 1);
             weighted += weight * senses.get(i);
             norm += weight;
         }
         return weighted / norm;
     }
 }

首先，删除SentiWordNet文件开头的所有“垃圾”内容（包括描述、说明等）。

一种可能的用法是修改SWN3类，让extract方法返回一个Double：

 /**
  * Returns the sentiment score of {@code word} summed over its noun ({@code #n}),
  * adjective ({@code #a}), adverb ({@code #r}) and verb ({@code #v}) entries.
  *
  * <p>Assumes {@code _dict} maps {@code "word#pos"} keys to {@code Double} scores.
  *
  * @param word the word to look up
  * @return the summed score; {@code 0.0} when no entry exists (never {@code null})
  */
 public Double extract(String word) {
     double total = 0.0;
     for (String pos : new String[] {"n", "a", "r", "v"}) {
         // One lookup per tag; a missing entry contributes nothing.
         Double score = _dict.get(word + "#" + pos);
         if (score != null) {
             total += score;
         }
     }
     return total;
 }

然后，对于要标记的字符串，可以先把它拆分成单词（去掉符号和未知字符），再对每个单词调用extract方法，并根据返回的结果决定整个字符串的平均权重：

 // Classifies the text in `twit` by the summed score of its words.
 // NOTE(review): `twit` and `_sw` are declared outside this snippet; `_sw` is presumably
 // the SWN3 instance whose extract(String) returns a Double - confirm against the caller.
 String[] words = twit.split("\\s+");
 double totalScore = 0, averageScore;
 for (String word : words) {
     // Strip everything except ASCII letters before the lookup.
     word = word.replaceAll("([^a-zA-Z\\s])", "");
     Double wordScore = _sw.extract(word);
     if (wordScore == null) {
         continue;
     }
     totalScore += wordScore;
 }
 // Despite its name this is the sum of the word scores, not a mean.
 averageScore = totalScore;
 if (averageScore >= 0.75) {
     return "very positive";
 } else if (averageScore > 0.25) {
     return "positive";
 } else if (averageScore <= -0.75) {
     return "very negative";
 } else if (averageScore < 0) {
     // Covers every negative score above -0.75, including the (-0.75, -0.5) range.
     return "negative";
 }
 // Scores in [0, 0.25] are treated as neutral.
 return "neutral";

我发现这种方式更容易,对我来说效果很好。


更新:

我将_dict的初始化更改为_dict = new HashMap<String, Double>();，因此它将具有String键和Double值。

所以我把_dict.put(word, sent);替换成了_dict.put(word, score);

为此，你应该编写main函数：在其中给出csv文件的路径，并从文件中提取单词；然后传入单词及其词性（pos）来调用extract函数。