java中的html截断器

是否有任何实用程序(或示例源代码)在Java中截断HTML(用于预览)? 我想在服务器上而不是在客户端上进行截断。

我正在使用HTMLUnit来解析HTML。

更新:
我希望能够预览HTML,因此截断器将保持HTML结构,同时在所需的输出长度之后剥离元素。

我认为您需要编写自己的XML解析器来完成此任务。 拉出body节点,添加节点,直到二进制长度<某个固定大小,然后重建文档。 如果HTMLUnit不创建语义XHTML,我建议使用tagsoup 。

如果您需要XML解析器/处理程序,我建议使用XOM 。

我写了另一个Java版本的truncateHTML。 此函数将字符串截断为指定的字符数,同时保留完整的单词和HTML标记。

 public static String truncateHTML(String text, int length, String suffix) { // if the plain text is shorter than the maximum length, return the whole text if (text.replaceAll("<.*?>", "").length() <= length) { return text; } String result = ""; boolean trimmed = false; if (suffix == null) { suffix = "..."; } /* * This pattern creates tokens, where each line starts with the tag. * For example, "One, Two, Three" produces the following: * One, * Two * , Three */ Pattern tagPattern = Pattern.compile("(<.+?>)?([^<>]*)"); /* * Checks for an empty tag, for example img, br, etc. */ Pattern emptyTagPattern = Pattern.compile("^<\\s*(img|br|input|hr|area|base|basefont|col|frame|isindex|link|meta|param).*>$"); /* * Modified the pattern to also include H1-H6 tags * Checks for closing tags, allowing leading and ending space inside the brackets */ Pattern closingTagPattern = Pattern.compile("^<\\s*/\\s*([a-zA-Z]+[1-6]?)\\s*>$"); /* * Modified the pattern to also include H1-H6 tags * Checks for opening tags, allowing leading and ending space inside the brackets */ Pattern openingTagPattern = Pattern.compile("^<\\s*([a-zA-Z]+[1-6]?).*?>$"); /* * Find   > ... 
*/ Pattern entityPattern = Pattern.compile("(&[0-9a-z]{2,8};|&#[0-9]{1,7};|[0-9a-f]{1,6};)"); // splits all html-tags to scanable lines Matcher tagMatcher = tagPattern.matcher(text); int numTags = tagMatcher.groupCount(); int totalLength = suffix.length(); List openTags = new ArrayList(); boolean proposingChop = false; while (tagMatcher.find()) { String tagText = tagMatcher.group(1); String plainText = tagMatcher.group(2); if (proposingChop && tagText != null && tagText.length() != 0 && plainText != null && plainText.length() != 0) { trimmed = true; break; } // if there is any html-tag in this line, handle it and add it (uncounted) to the output if (tagText != null && tagText.length() > 0) { boolean foundMatch = false; // if it's an "empty element" with or without xhtml-conform closing slash Matcher matcher = emptyTagPattern.matcher(tagText); if (matcher.find()) { foundMatch = true; // do nothing } // closing tag? if (!foundMatch) { matcher = closingTagPattern.matcher(tagText); if (matcher.find()) { foundMatch = true; // delete tag from openTags list String tagName = matcher.group(1); openTags.remove(tagName.toLowerCase()); } } // opening tag? 
if (!foundMatch) { matcher = openingTagPattern.matcher(tagText); if (matcher.find()) { // add tag to the beginning of openTags list String tagName = matcher.group(1); openTags.add(0, tagName.toLowerCase()); } } // add html-tag to result result += tagText; } // calculate the length of the plain text part of the line; handle entities (eg  ) as one character int contentLength = plainText.replaceAll("&[0-9a-z]{2,8};|&#[0-9]{1,7};|[0-9a-f]{1,6};", " ").length(); if (totalLength + contentLength > length) { // the number of characters which are left int numCharsRemaining = length - totalLength; int entitiesLength = 0; Matcher entityMatcher = entityPattern.matcher(plainText); while (entityMatcher.find()) { String entity = entityMatcher.group(1); if (numCharsRemaining > 0) { numCharsRemaining--; entitiesLength += entity.length(); } else { // no more characters left break; } } // keep us from chopping words in half int proposedChopPosition = numCharsRemaining + entitiesLength; int endOfWordPosition = plainText.indexOf(" ", proposedChopPosition-1); if (endOfWordPosition == -1) { endOfWordPosition = plainText.length(); } int endOfWordOffset = endOfWordPosition - proposedChopPosition; if (endOfWordOffset > 6) { // chop the word if it's extra long endOfWordOffset = 0; } proposedChopPosition = numCharsRemaining + entitiesLength + endOfWordOffset; if (plainText.length() >= proposedChopPosition) { result += plainText.substring(0, proposedChopPosition); proposingChop = true; if (proposedChopPosition < plainText.length()) { trimmed = true; break; // maximum length is reached, so get off the loop } } else { result += plainText; } } else { result += plainText; totalLength += contentLength; } // if the maximum length is reached, get off the loop if(totalLength >= length) { trimmed = true; break; } } for (String openTag : openTags) { result += ""; } if (trimmed) { result += suffix; } return result; } 

这里有一个PHP函数: http://snippets.dzone.com/posts/show/7125

我已经为其初始版本写了一个粗略(quick and dirty)的Java移植版,但评论中后续给出的改进版本也值得考虑(尤其是按整个单词截断的版本):

 /**
  * Cuts HTML after {@code l} visible characters and closes the elements
  * that are still open at the cut.
  *
  * <p>Markup is not counted towards {@code l}. Closing tags for the open
  * elements are appended innermost first, followed by an ellipsis
  * (U+2026) when visible text was actually dropped. Void elements
  * ({@code <br>}, {@code <img>}, ...), self-closing tags, comments and
  * doctype declarations are never closed. A closing tag that does not match
  * the innermost open element is ignored. Uses only the standard library.
  *
  * @param s the HTML to truncate
  * @param l the number of visible characters to keep; negative values are treated as 0
  * @return the truncated markup
  * @throws NullPointerException if {@code s} is {@code null}
  */
 public static String truncateHtml(String s, int l) {
     // One tag followed by the text up to the next tag.
     Pattern p = Pattern.compile("<[^>]+>([^<]*)");
     // Optional slash, then the element name; does not match comments or doctype.
     Pattern namePattern = Pattern.compile("^<(/?)\\s*([A-Za-z][A-Za-z0-9:-]*)");
     Pattern voidPattern = Pattern.compile(
             "(?i)area|base|basefont|br|col|embed|frame|hr|img|input|isindex|link|meta|param|source|track|wbr");

     int i = 0; // number of markup characters seen before the cut
     List<String> tags = new ArrayList<String>(); // open elements, outermost first
     Matcher m = p.matcher(s);
     while (m.find()) {
         // Stop at the first tag that starts at or beyond the visible limit.
         if (m.start(0) - i >= l) {
             break;
         }
         String markup = s.substring(m.start(0), m.start(1));
         i += markup.length();

         Matcher nm = namePattern.matcher(markup);
         if (!nm.find()) {
             continue; // comment, doctype, processing instruction, ...
         }
         String name = nm.group(2);
         if (!nm.group(1).isEmpty()) {
             // Closing tag: pop only when it matches the innermost open element.
             if (!tags.isEmpty() && tags.get(tags.size() - 1).equalsIgnoreCase(name)) {
                 tags.remove(tags.size() - 1);
             }
         } else if (!markup.endsWith("/>") && !voidPattern.matcher(name).matches()) {
             tags.add(name);
         }
     }

     // long arithmetic avoids overflow for very large l; clamp protects against negative l.
     int cut = (int) Math.max(0L, Math.min((long) s.length(), (long) l + i));
     StringBuilder out = new StringBuilder(s.substring(0, cut));
     for (int k = tags.size() - 1; k >= 0; k--) {
         out.append("</").append(tags.get(k)).append('>');
     }
     // Add the ellipsis only if visible text remains after the cut.
     if (!s.substring(cut).replaceAll("<[^>]+>", "").isEmpty()) {
         out.append('\u2026');
     }
     return out.toString();
 }

注意: StringUtils.join()需要Apache Commons Lang。

我可以为你提供一个我编写的Python脚本: http://www.ellipsix.net/ext-tmp/summarize.txt 。 不幸的是,我没有Java版本,但您可以自己把它翻译成Java,并根据需要进行修改。 它并不复杂,只是我为自己的网站临时拼凑出来的东西,但我已经用了一年多,通常运行得很好。

如果你想要一些健壮的东西,XML(或SGML)解析器几乎肯定比我做的更好。

我找到了这个博客: dencat:用Java截断HTML

它包含Python的Django模板函数truncate_html_words的Java移植版本:

 /**
  * Truncates the text content of an HTML fragment while leaving every tag in
  * place, so the markup stays balanced.
  */
 public class SimpleHtmlTruncator {

     /** A single HTML tag; quoted attribute values may contain {@code >}. */
     private static final String HTML_TAG_PATTERN = "<(\"[^\"]*\"|'[^']*'|[^'\">])*>";

     /**
      * Tokenizer: a tag (group 1), a run of text, or a stray {@code <} that
      * does not start a tag. Every input character belongs to exactly one
      * token, so punctuation and non-ASCII text are preserved.
      */
     private static final Pattern TOKEN_PATTERN =
             Pattern.compile("(" + HTML_TAG_PATTERN + ")|[^<]+|<");

     /**
      * Keeps at most {@code max_length} characters of text content and all
      * tags of the trimmed input.
      *
      * <p>Tags are copied to the output unconditionally, including those
      * after the cut, and are not counted. Text after the limit is dropped.
      *
      * @param text       the HTML to truncate; leading and trailing whitespace is trimmed first
      * @param max_length maximum number of text characters to keep
      * @return the trimmed input unchanged if {@code max_length} exceeds its
      *         total length (markup included), an empty string if
      *         {@code max_length} is negative, otherwise the truncated markup
      * @throws NullPointerException if {@code text} is {@code null}
      */
     public static String truncateHtmlWords(String text, int max_length) {
         String input = text.trim();
         if (max_length > input.length()) {
             return input;
         }
         if (max_length < 0) {
             return "";
         }

         StringBuilder output = new StringBuilder();
         int remaining = max_length; // text characters still allowed
         Matcher tokens = TOKEN_PATTERN.matcher(input);
         while (tokens.find()) {
             String tag = tokens.group(1);
             if (tag != null) {
                 output.append(tag);
             } else if (remaining > 0) {
                 String run = tokens.group();
                 int take = Math.min(run.length(), remaining);
                 output.append(run, 0, take);
                 remaining -= take;
             }
         }
         return output.toString();
     }

     public static void main(String[] args) {
         // Illustrative sample; prints <p>abc</p><p>d</p><p></p>
         String text = SimpleHtmlTruncator.truncateHtmlWords("<p>abc</p><p>defghij</p><p>ghi</p>", 4);
         System.out.println(text);
     }
 }