使用 Java StAX 解析 XML – 统计 Content 标签的数量

我有一些很大的 XML 文件,目前按如下方式解析:

public class Solution { private static final String ROOM_ID = "RoomID"; private static final String CONTENT = "Content"; private static final String LOGIN_NAME = "LoginName"; private static final String CONVERSATION_ID = "ConversationID"; private static final String FILE_DUMP = "FileDump"; private static final String MESSAGE = "Message"; private static final String CONVERSATION = "Conversation"; private static final String START_TIME = "StartTime"; static class ConversationInfo { private String startTimeStr; private String conversationId; private String fileName; private int orderInFile; private final Set users = new HashSet(); private final List messages = new ArrayList(); public HashMap map = new HashMap(); @Override public String toString() { return String.format("%s %d %s %s %s %d %s", fileName, orderInFile, conversationId, startTimeStr, StringUtils.join(users, "***"), users.size(), StringUtils.join(messages, "&&&")); } } static class Message { public final String userName; public final String content; public Message(String name, String content) { this.userName = name; this.content = content; } @Override public String toString() { return userName + " " + content; } } public static void main(String[] args) throws XMLStreamException, IOException { File folder = new File("/xml/"); List m = new ArrayList(); File[] listOfFiles = folder.listFiles(); for (int i = 0; i < listOfFiles.length; i++) { File file = listOfFiles[i]; String fileName = file.getAbsolutePath(); System.out.println("File" + file); if (file.isFile() && file.getName().endsWith(".xml")) { XMLInputFactory xf = XMLInputFactory.newFactory(); try (FileInputStream fin = new FileInputStream(file)) { XMLStreamReader xr = xf.createXMLStreamReader(fin); LOOP: while (xr.hasNext()) { int event = xr.next(); switch (event) { case XMLStreamConstants.START_ELEMENT: { String elName = xr.getLocalName(); if (CONVERSATION.equals(elName)) { ConversationInfo convInfo = parseConversation(xr, file.getName()); if (convInfo != 
null) { m.add(convInfo); } } break; } case XMLStreamConstants.END_ELEMENT: { String elName = xr.getLocalName(); if (FILE_DUMP.equals(elName)) { break LOOP; } break; } case XMLStreamConstants.END_DOCUMENT: throw new IllegalStateException("xml not well-formed:  tag not closed"); } } } } } // ConversationInfo c = new ConversationInfo(); try (FileWriter w = new FileWriter("output.txt")) { int i = 1; for (ConversationInfo convInfo : m) { convInfo.orderInFile = i; w.write(String.format("%d %s\n", i++, convInfo)); } } } private static ConversationInfo parseConversation(XMLStreamReader xr, String fileName) throws XMLStreamException { ConversationInfo convInfo = new ConversationInfo(); convInfo.fileName = fileName; while (xr.hasNext()) { int event = xr.next(); switch (event) { case XMLStreamConstants.START_ELEMENT: { String elName = xr.getLocalName(); if (MESSAGE.equals(elName)) { Message message = parseMessage(xr); if (message != null) { convInfo.messages.add(message); convInfo.users.add(message.userName); convInfo.map.put(message.userName, message.content); } } else if (START_TIME.equals(elName)) { convInfo.startTimeStr = xr.getElementText(); } else if (ROOM_ID.equals(elName)) { convInfo.conversationId = xr.getElementText(); } break; } case XMLStreamConstants.END_ELEMENT: { String elName = xr.getLocalName(); if (CONVERSATION.equals(elName)) { return convInfo; } break; } case XMLStreamConstants.END_DOCUMENT: throw new XMLStreamException("xml not well-formed:  tag not closed"); } } throw new XMLStreamException( "unexpected end of xml file while parsing a conversation"); } private static Message parseMessage(XMLStreamReader xr) throws XMLStreamException { String userName = null; String content = null; while (xr.hasNext()) { int event = xr.next(); switch (event) { case XMLStreamConstants.START_ELEMENT: { String elName = xr.getLocalName(); if (LOGIN_NAME.equals(elName)) { userName = xr.getElementText(); } else if (CONTENT.equals(elName)) { content = 
StringUtils.trimToEmpty(xr.getElementText()); } break; } case XMLStreamConstants.END_ELEMENT: { String elName = xr.getLocalName(); if (MESSAGE.equals(elName)) { return new Message(userName, content); } break; } case XMLStreamConstants.END_DOCUMENT: throw new XMLStreamException("xml not well-formed:  tag not closed"); } } throw new XMLStreamException( "unexpected end of xml file while parsing a message"); } } 

我的input.xml是:

    IBXML 1.3  PCHAT-0x3000001CA8361 03/31/2016 13:39:01 1459431541   SWONG00 STEPHEN WONG 4397109 13133 231115 DBS BANK LIMITED HON SWONG00@Bloomberg.net STEPHENWONGWE@DBS.COM  03/31/2016 13:39:01 1459431541 PCHAT-0x3000001CA8361    G_LO GARY LO 7054548 13133 91189 DBS BANK (HONG KONG) G_LO@Bloomberg.net garyloyc@dbs.com  03/31/2016 14:56:22 1459436182 PCHAT-0x3000001CA8361    G_LO GARY LO 7054548 13133 91189 DBS BANK (HONG KONG) G_LO@Bloomberg.net garyloyc@dbs.com  03/31/2016 19:30:01 1459452601 PCHAT-0x3000001CA8361    SWONG00 STEPHEN WONG 4397109 13133 231115 DBS BANK LIMITED HON SWONG00@Bloomberg.net STEPHENWONGWE@DBS.COM  03/31/2016 19:33:56 1459452836 PCHAT-0x3000001CA8361    SWONG00 STEPHEN WONG 4397109 13133 231115 DBS BANK LIMITED HON SWONG00@Bloomberg.net STEPHENWONGWE@DBS.COM  03/31/2016 19:45:16 1459453516 PCHAT-0x3000001CA8361    SWONG00 STEPHEN WONG 4397109 13133 231115 DBS BANK LIMITED HON SWONG00@Bloomberg.net STEPHENWONGWE@DBS.COM  03/31/2016 23:08:09 1459465689 PCHAT-0x3000001CA8361    G_LO GARY LO 7054548 13133 91189 DBS BANK (HONG KONG) G_LO@Bloomberg.net garyloyc@dbs.com  03/31/2016 23:14:23 1459466063 PCHAT-0x3000001CA8361    G_LO GARY LO 7054548 13133 91189 DBS BANK (HONG KONG) G_LO@Bloomberg.net garyloyc@dbs.com  04/01/2016 00:10:57 1459469457 abcdefgghhhhhh PCHAT-0x3000001CA8361    WVU WHEELOCK VU 8266852 13133 91189 DBS BANK (HONG KONG) WVU@Bloomberg.net WHEELOCKVU@DBS.COM  04/01/2016 00:14:05 1459469645 PCHAT-0x3000001CA8361    FCHAN95 FLORENCE CHAN GOLDMAN SACHS (ASIA) FCHAN95@Bloomberg.net   04/01/2016 00:29:19 1459470559 PCHAT-0x3000001CA8361    FCHAN95 FLORENCE CHAN GOLDMAN SACHS (ASIA) FCHAN95@Bloomberg.net   04/01/2016 00:29:19 1459470559 ajdakjgdljsgdsafhkafa PCHAT-0x3000001CA8361    FCHAN95 FLORENCE CHAN GOLDMAN SACHS (ASIA) FCHAN95@Bloomberg.net   04/01/2016 00:29:19 1459470559 akjdgljsafdlshf;kdsjf PCHAT-0x3000001CA8361    WVU WHEELOCK VU 8266852 13133 91189 DBS BANK (HONG KONG) WVU@Bloomberg.net WHEELOCKVU@DBS.COM  04/01/2016 
00:39:32 1459471172 sagdksajdlsahd PCHAT-0x3000001CA8361    SWONG00 STEPHEN WONG 4397109 13133 231115 DBS BANK LIMITED HON SWONG00@Bloomberg.net STEPHENWONGWE@DBS.COM  04/01/2016 01:01:27 1459472487 PCHAT-0x3000001CA8361    SWONG00 STEPHEN WONG 4397109 13133 231115 DBS BANK LIMITED HON SWONG00@Bloomberg.net STEPHENWONGWE@DBS.COM  04/01/2016 01:31:29 1459474289 ajdslsahdsj;a PCHAT-0x3000001CA8361    FCHAN95 FLORENCE CHAN GOLDMAN SACHS (ASIA) FCHAN95@Bloomberg.net   04/01/2016 02:49:46 1459478986 sagdkjsagdkjashdlasjd PCHAT-0x3000001CA8361    FCHAN95 FLORENCE CHAN GOLDMAN SACHS (ASIA) FCHAN95@Bloomberg.net   04/01/2016 02:49:46 1459478986 jsdhkshdksjdlsjdlks PCHAT-0x3000001CA8361    FCHAN95 FLORENCE CHAN GOLDMAN SACHS (ASIA) FCHAN95@Bloomberg.net   04/01/2016 03:47:37 1459482457 jshdkshdksjdlskld PCHAT-0x3000001CA8361    FCHAN95 FLORENCE CHAN GOLDMAN SACHS (ASIA) FCHAN95@Bloomberg.net   04/01/2016 03:47:37 1459482457 aasasasasas PCHAT-0x3000001CA8361  04/01/2016 03:47:37 1459482457   

目前我输出的是用户名和消息内容,但我想按以下格式打印每个用户及其发送的消息数量:

 userName(CountOfMessages UserSent)+userName(CountOfMessages UserSent) 

例: G_LO(1)+FCHAN95(6)+WVU(1)+SWONG00(1)

我尝试过 HashMap,但结果不符合预期;Java 8 的函数式写法也没能奏效;还尝试了 Guava 的 Multiset,同样没有成功。

使用 SAX(注意,不是 StAX)可以很容易地实现这一点;用 StAX 应该也能做到,我实现之后会更新本回答。基于 SAX 的参考实现如下:

 import java.io.File;
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.HashMap;
 import java.util.Map;

 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;

 import org.xml.sax.Attributes;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;

 /**
  * SAX handler that counts how many times each login name occurs in {@code LoginName}
  * elements of a document.
  *
  * <p>The element name is compared case-insensitively against the qualified name. The
  * text of an element is buffered and counted once, when its end tag is reached.
  */
 public class NumCountHandler extends DefaultHandler {
     private static final String LOGIN_NAME = "LoginName";
     /** File parsed by {@link #main} when no path is given on the command line. */
     private static final String DEFAULT_INPUT = "input.xml";

     private HashMap<String, Integer> countOfNum = new HashMap<>();
     boolean isStartTagPass = false;
     /** Text collected so far for the LoginName element currently being read. */
     private final StringBuilder loginName = new StringBuilder();

     @Override
     public void startElement(String uri, String localName, String qName, Attributes attributes)
             throws SAXException {
         if (qName.equalsIgnoreCase(LOGIN_NAME)) {
             isStartTagPass = true;
             loginName.setLength(0);
         }
     }

     @Override
     public void characters(char[] ch, int start, int length) throws SAXException {
         // A parser may deliver the text of one element in several chunks, so only
         // buffer here; the count is updated once in endElement.
         if (isStartTagPass) {
             loginName.append(ch, start, length);
         }
     }

     @Override
     public void endElement(String uri, String localName, String qName) throws SAXException {
         if (qName.equalsIgnoreCase(LOGIN_NAME)) {
             isStartTagPass = false;
             // An empty element produced no characters() call before, so it is still not counted.
             if (loginName.length() > 0) {
                 countOfNum.merge(loginName.toString(), 1, Integer::sum);
             }
         }
     }

     /**
      * Parses {@code source} and returns the number of occurrences of each login name.
      *
      * @param source the XML document to read
      * @return a map from login name to occurrence count
      * @throws ParserConfigurationException if no SAX parser can be created
      * @throws SAXException if the document is not well-formed
      * @throws IOException if the document cannot be read
      */
     public static Map<String, Integer> countLoginNames(InputSource source)
             throws ParserConfigurationException, SAXException, IOException {
         SAXParserFactory factory = SAXParserFactory.newInstance();
         SAXParser saxParser = factory.newSAXParser();
         NumCountHandler userhandler = new NumCountHandler();
         saxParser.parse(source, userhandler);
         return userhandler.countOfNum;
     }

     /**
      * Convenience overload of {@link #countLoginNames(InputSource)} for XML held in a string.
      *
      * @param xml the XML document text
      * @return a map from login name to occurrence count
      */
     public static Map<String, Integer> countLoginNames(String xml)
             throws ParserConfigurationException, SAXException, IOException {
         return countLoginNames(new InputSource(new StringReader(xml)));
     }

     /**
      * Prints {@code name(count) } for every login name found in the input file.
      *
      * @param args optional: {@code args[0]} is the path of the XML file to parse;
      *             {@code input.xml} is used when it is omitted
      */
     public static void main(String[] args) {
         String path = args.length > 0 ? args[0] : DEFAULT_INPUT;
         try {
             // Pass a system id (not a Reader) so the parser applies the encoding
             // declared in the XML prolog.
             InputSource is = new InputSource(new File(path).toURI().toString());
             countLoginNames(is).forEach((k, v) -> System.out.print(k + "(" + v + ") "));
         } catch (ParserConfigurationException | SAXException | IOException e) {
             e.printStackTrace();
         }
     }
 }

这打印: WVU(2)+G_LO(4)+FCHAN95(7)+SWONG00(6)+

===== 补充:基于 StAX 的实现 =====

这段代码用 Java 8 的函数式特性显然还可以进一步改进。此外,我只是为了做一个小型试验(pilot)才使用了静态变量;经过一些重构(refactoring)后,应该也可以改用类的成员变量。

 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringReader;
 import java.util.HashMap;
 import java.util.Map;

 import javax.xml.stream.XMLEventReader;
 import javax.xml.stream.XMLInputFactory;
 import javax.xml.stream.XMLStreamException;
 import javax.xml.stream.events.Characters;
 import javax.xml.stream.events.XMLEvent;

 /**
  * StAX (event reader) counterpart of the SAX example: counts how many times each login
  * name occurs in {@code LoginName} elements of a document.
  *
  * <p>The text of an element is buffered and counted once, when its end tag is reached.
  */
 public class NumCountHandlerStax {
     private static final String LOGIN_NAME = "LoginName";
     /** File parsed by {@link #main} when no path is given on the command line. */
     private static final String DEFAULT_INPUT = "input.xml";

     private boolean isStartTagPass = false;
     /** Text collected so far for the LoginName element currently being read. */
     private final StringBuilder loginName = new StringBuilder();
     private final HashMap<String, Integer> countOfNum = new HashMap<>();

     /** Updates the parsing state and the counts with a single event. */
     private void groupAndProcess(XMLEvent event) {
         if (event.isStartElement()
                 && event.asStartElement().getName().getLocalPart().equals(LOGIN_NAME)) {
             isStartTagPass = true;
             loginName.setLength(0);
         } else if (event.isEndElement()
                 && event.asEndElement().getName().getLocalPart().equals(LOGIN_NAME)) {
             isStartTagPass = false;
             // An empty element produced no Characters event before, so it is still not counted.
             if (loginName.length() > 0) {
                 countOfNum.merge(loginName.toString(), 1, Integer::sum);
             }
         } else if (isStartTagPass && event.isCharacters()) {
             // The text of one element may arrive as several Characters events;
             // buffer it and count once at the end tag.
             Characters characters = event.asCharacters();
             loginName.append(characters.getData());
         }
     }

     /**
      * Consumes all remaining events of {@code events} and returns the number of
      * occurrences of each login name. The reader is not closed.
      *
      * @param events the event reader to drain
      * @return a map from login name to occurrence count
      * @throws XMLStreamException if the document is not well-formed
      */
     public static Map<String, Integer> countLoginNames(XMLEventReader events)
             throws XMLStreamException {
         NumCountHandlerStax staxBasedCounter = new NumCountHandlerStax();
         // nextEvent() is used instead of the Iterator methods so that parse errors
         // surface as XMLStreamException.
         while (events.hasNext()) {
             staxBasedCounter.groupAndProcess(events.nextEvent());
         }
         return staxBasedCounter.countOfNum;
     }

     /**
      * Convenience overload of {@link #countLoginNames(XMLEventReader)} for XML held in a string.
      *
      * @param xml the XML document text
      * @return a map from login name to occurrence count
      * @throws XMLStreamException if the document is not well-formed
      */
     public static Map<String, Integer> countLoginNames(String xml) throws XMLStreamException {
         XMLEventReader events =
                 XMLInputFactory.newInstance().createXMLEventReader(new StringReader(xml));
         try {
             return countLoginNames(events);
         } finally {
             events.close();
         }
     }

     /**
      * Prints {@code name(count) } for every login name found in the input file.
      *
      * <p>For the sample input posted in the question, the original answer reports the
      * output {@code WVU(2) G_LO(4) FCHAN95(7) SWONG00(6)}.
      *
      * @param args optional: {@code args[0]} is the path of the XML file to parse;
      *             {@code input.xml} is used when it is omitted
      */
     public static void main(String[] args) {
         String path = args.length > 0 ? args[0] : DEFAULT_INPUT;
         try (InputStream in = new FileInputStream(path)) {
             XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
             XMLEventReader xmlEventReader = xmlInputFactory.createXMLEventReader(in);
             try {
                 countLoginNames(xmlEventReader)
                         .forEach((k, v) -> System.out.print(k + "(" + v + ") "));
             } finally {
                 // XMLEventReader is not AutoCloseable; release it explicitly.
                 xmlEventReader.close();
             }
         } catch (IOException | XMLStreamException e) {
             e.printStackTrace();
         }
     }
 }

Using an XSLT 3.0 processor like Saxon 9.7 EE you could do it in a declarative way with a stylesheet using an accumulator:

             

Output for the sample you have posted is map{"SWONG00":1,"FCHAN95":6,"WVU":1,"G_LO":1} .

This is an example based on XPath and VTD-XML. Unlike DOM, VTD-XML will not blow up on big XML files. XPath makes the code logic easy to understand and simple to maintain: if you want to count something different, just pass in a different XPath query.

 import java.util.HashMap; import com.ximpleware.*;. public class stats { public static void main(String[] s)throws VTDException{ VTDGen vg = new VTDGen(); if (!vg.parseFile("d:\\xml\\dump.xml", false)){ System.out.println("parsing error"); return; } VTDNav vn = vg.getNav(); AutoPilot ap = new AutoPilot(vn); HashMap  hit = new HashMap(); ap.selectXPath("/FileDump/Conversation/Message/User/LoginName/text()"); int i=0; while((i=ap.evalXPath())!=-1){ String s1 = vn.toNormalizedString(i); if (hit.containsKey(s1)){ Integer it = hit.get(s1); hit.put(s1, new Integer(it.intValue()+1)); }else{ hit.put(s1, new Integer(1)); } } System.out.println(hit.toString()); } }