如何使用Jericho HTML Parser解析XML

我是 Java 和 Servlet 的新手,目前正在尝试使用 Jericho HTML Parser 解析 XML。例如,我想获取每个 link 标签中的链接,但页面上没有显示任何内容,只有总数显示为 27(也就是说,只能得到正确的总数,却得不到其中的字符串)。如果有人知道该怎么做,请指教。

// HelloServlet (from the question): a servlet mapped to "/HelloServlet" whose doGet
//   1. sets the response content type to "text/html; charset=UTF-8" and obtains the response writer,
//   2. builds a Jericho Source from the URL http://news.yahoo.com/rss/ and runs fullSequentialParse(),
//   3. collects every element named "link" via getAllElements("link"),
//   4. prints "total:" followed by the list size, then getContent().toString() for each element.
// NOTE(review): the string literals below span several physical lines and `List` carries no type
// argument; presumably the HTML markup inside the strings and the generic parameter were stripped
// when this snippet was extracted. As shown it would not compile -- confirm against the original post.
// NOTE(review): Jericho is an HTML parser; presumably it treats <link> as an HTML element without
// content, which would explain a correct count but empty output -- verify against the Jericho docs.
import java.io.IOException; import java.io.PrintWriter; import javax.servlet.ServletException; import javax.servlet.annotation.WebServlet; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import java.net.MalformedURLException; import java.net.URL; import java.util.*; import net.htmlparser.jericho.Element; import net.htmlparser.jericho.Source; @WebServlet(urlPatterns = { "/HelloServlet"}) public class HelloServlet extends HttpServlet { private static final long serialVersionUID = 1L; @Override protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException,MalformedURLException{ resp.setContentType("text/html; charset=UTF-8"); PrintWriter out = resp.getWriter(); out.println(""); out.println(""); out.println(""); Source source = new Source(new URL("http://news.yahoo.com/rss/")); source.fullSequentialParse(); List Linklist = source.getAllElements("link"); if(Linklist!=null){ out.println("

total:"+Linklist.size()+"

"); for(Element link: Linklist){ out.println("

"+link.getContent().toString()+"

"); } } out.println(""); out.println(""); } }

根据 Jericho HTML Parser 的主页,Jericho 是用于处理 HTML 文档的。但来自 Yahoo 的 RSS 是 XML,您可以使用 Java 标准库自带的 XML API 来解析该文档并提取 link 标签的内容。下面是一个例子:

 import java.io.IOException; import java.net.URL; import java.util.LinkedList; import java.util.List; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; // ... private List getRssLinks() throws ParserConfigurationException, SAXException, IOException { final List rssLinks = new LinkedList(); final URL url = new URL("http://news.yahoo.com/rss/"); final Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder() .parse(url.openStream()); final NodeList linkNodes = doc.getElementsByTagName("link"); for(int i = 0; i < linkNodes.getLength(); i++) { final Element linkElement = (Element) linkNodes.item(i); rssLinks.add(linkElement.getTextContent()); } return rssLinks; }