XPath是专业的xml结构化文档查询语言,语法功能强大,本文不涉及xpath语法教程。
jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据,但是选取某个元素时还是没有xpath那么简单直接,而且xpath带了很多选择库。
然而遗憾的是,jsoup并不支持xpath,于是博主就写了一个让jsoup支持xpath的工具类,希望能帮助到有需要的朋友!
工具类
package com.ry.mytools.util;

import com.sun.org.apache.xerces.internal.dom.ElementImpl;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.parser.Parser;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.*;

import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

/**
 * XPath helper for jsoup.
 *
 * <p>jsoup has no XPath support of its own, so every query converts the jsoup tree into a
 * W3C DOM (see {@link #fromJsoup}), evaluates the expression with the JDK XPath engine and,
 * where a jsoup object is requested, serializes the matched W3C node and parses it back with
 * jsoup. Elements returned by this class are therefore detached copies, not nodes of the
 * original jsoup document.
 *
 * <p>The shared XPath, TransformerFactory and DocumentBuilderFactory instances are only used
 * while holding {@code LOCK}.
 *
 * @author liuhh
 */
@SuppressWarnings("restriction")
public class JsoupParserUtil {

    protected final static DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();

    private final static Logger log = LoggerFactory.getLogger(JsoupParserUtil.class);

    private final static XPath xPath = XPathFactory.newInstance().newXPath();

    protected static TransformerFactory tf = TransformerFactory.newInstance();

    private static final Lock LOCK = new ReentrantLock();

    /**
     * Counts the nodes selected by an XPath expression.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression that yields a node-set
     * @return number of matched nodes; 0 when nothing matches or the evaluation fails
     */
    public static int getEleChildNum(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.NODESET);
            if (res instanceof NodeList) {
                return ((NodeList) res).getLength();
            }
        } catch (Exception e) {
            log.error("根据xpath:{},获取子节点个数出现错误,错误原因:{}", xpath, e.getMessage(), e);
        }
        return 0;
    }

    /**
     * Evaluates an XPath expression as a boolean, i.e. checks whether it selects anything.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression
     * @return the boolean result of the expression; {@code false} on any failure
     */
    public static boolean exists(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.BOOLEAN);
            return res instanceof Boolean && (Boolean) res;
        } catch (Exception e) {
            log.error("检查xpath:{},是否存在时出现错误,!{}", xpath, e.getMessage(), e);
        }
        return false;
    }

    /**
     * Returns the first node selected by the XPath expression as a Xerces {@code ElementImpl}.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression
     * @return the matched element, or {@code null} when nothing matches, the match is not an
     *         {@code ElementImpl}, or the evaluation fails
     */
    public static ElementImpl getW3cElementImpl(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.NODE);
            if (res instanceof ElementImpl) {
                return (ElementImpl) res;
            }
        } catch (Exception e) {
            log.error("根据xpath:{},得到w3c的Element对象出现错误,原因:{}", xpath, e.getMessage(), e);
        }
        return null;
    }

    /**
     * Returns the first element selected by the XPath expression as a jsoup element.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression
     * @return a detached jsoup copy of the matched element, or {@code null} when nothing
     *         matches, the match is not an element, or the evaluation fails
     */
    public static org.jsoup.nodes.Element getJsoupElement(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.NODE);
            // Test against the W3C interface rather than the JDK-internal ElementImpl so the
            // lookup also works when another DOM implementation backs the document.
            if (res instanceof Element) {
                return toJsoupElement((Element) res);
            }
        } catch (Exception e) {
            log.error("根据xpath:{},得到jsoup的Element对象出现错误,原因:{}", xpath, e.getMessage(), e);
        }
        return null;
    }

    /**
     * Returns every element selected by the XPath expression as jsoup elements.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression that yields a node-set
     * @return detached jsoup copies of the matched elements (non-element matches and elements
     *         that could not be converted are skipped), or {@code null} when nothing matches
     *         or the evaluation fails
     */
    public static Elements getJsoupElements(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            NodeList nodeList = getNodeList(ele, xpath);
            if (nodeList == null || nodeList.getLength() == 0) {
                return null;
            }
            Elements elements = new Elements();
            for (int i = 0, len = nodeList.getLength(); i < len; i++) {
                Node node = nodeList.item(i);
                if (!(node instanceof Element)) {
                    continue;
                }
                org.jsoup.nodes.Element converted = toJsoupElement(node);
                // A failed conversion yields null; never hand null entries to callers.
                if (converted != null) {
                    elements.add(converted);
                }
            }
            return elements;
        } catch (Exception e) {
            log.error("根据xpath:{},得到jsoup的Element对象出现错误,原因:{}", xpath, e.getMessage(), e);
        }
        return null;
    }

    /**
     * Evaluates the XPath expression against a jsoup element and returns the W3C node list.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression that yields a node-set
     * @return the matched nodes, or {@code null} when the evaluation fails
     */
    public static NodeList getNodeList(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.NODESET);
            if (res instanceof NodeList) {
                return (NodeList) res;
            }
        } catch (Exception e) {
            log.error(e.getMessage(), e);
        }
        return null;
    }

    /**
     * Returns the string selected by the XPath expression (typically an attribute or text node).
     *
     * <p>When exactly one node matches, the XPath string value of the expression is returned.
     * Otherwise the node values of all matches are concatenated, with every {@code "\r\n"}
     * replaced by {@code "."}.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression that yields a node-set
     * @return the resulting string, or {@code null} when nothing matches or the evaluation fails
     */
    public static String getXpathString(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            // Convert the jsoup tree once and reuse it for both evaluations below.
            Node root = fromJsoup(ele);
            Object matched = parse(root, xpath, XPathConstants.NODESET);
            NodeList nodeList = matched instanceof NodeList ? (NodeList) matched : null;
            int count = nodeList == null ? 0 : nodeList.getLength();
            if (count == 1) {
                Object res = parse(root, xpath, XPathConstants.STRING);
                return res == null ? null : res.toString();
            }
            List<String> values = nodeValues(nodeList);
            if (values == null || values.isEmpty()) {
                return null;
            }
            StringBuilder joined = new StringBuilder();
            for (String text : values) {
                if (text != null) {
                    joined.append(text.replace("\r\n", "."));
                }
            }
            return joined.toString();
        } catch (Exception e) {
            log.error("根据xpath:{}查询字符串时出现错误:{}", xpath, e.getMessage(), e);
        }
        return null;
    }

    /**
     * Returns the node value of every node selected by the XPath expression.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression that yields a node-set
     * @return one entry per matched node (an entry may be {@code null}), or {@code null} when
     *         nothing matches or the evaluation fails
     */
    public static List<String> getXpathListString(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.NODESET);
            if (res instanceof NodeList) {
                return nodeValues((NodeList) res);
            }
        } catch (Exception e) {
            log.error("根据xpath:{}查询字符串列表时出现错误:{}", xpath, e.getMessage(), e);
        }
        return null;
    }

    /**
     * Converts the jsoup element to a W3C DOM and evaluates the XPath expression against it.
     *
     * @param doc      jsoup element (or document) to query
     * @param xPathStr XPath expression
     * @param qName    desired result type, one of the {@link XPathConstants} values
     * @return the evaluation result, or {@code null} on invalid arguments or evaluation failure
     */
    public static Object parse(final org.jsoup.nodes.Element doc, final String xPathStr, final QName qName) {
        Node node = fromJsoup(doc);
        return parse(node, xPathStr, qName);
    }

    /**
     * Evaluates the XPath expression against a W3C node.
     *
     * @param doc      context node
     * @param xPathStr XPath expression
     * @param qName    desired result type, one of the {@link XPathConstants} values
     * @return the evaluation result, or {@code null} when an argument is missing or the
     *         evaluation fails (the failure is logged, not thrown)
     */
    public static Object parse(final Node doc, final String xPathStr, final QName qName) {
        if (doc == null) {
            log.warn("解析文档为null!");
            return null;
        }
        if (StringUtils.isBlank(xPathStr)) {
            log.warn("解析的Xpath路径为空!");
            return null;
        }
        if (null == qName) {
            log.warn("解析类型为null!");
            return null;
        }
        // Acquire the lock before entering the try so that finally never unlocks a lock
        // this thread does not hold.
        LOCK.lock();
        try {
            return xPath.evaluate(xPathStr, doc, qName);
        } catch (Exception e) {
            log.warn("解析Xpath:{},出现错误,解析类型:{},错误原因:{}!", xPathStr, qName, e.getMessage());
            return null;
        } finally {
            LOCK.unlock();
        }
    }

    /**
     * Converts a Xerces {@code ElementImpl} into a jsoup element.
     *
     * @param elementImpl W3C element to convert
     * @return a detached jsoup copy of the element, or {@code null} when the conversion fails
     */
    public static org.jsoup.nodes.Element getJsoupEle(final ElementImpl elementImpl) {
        return toJsoupElement(elementImpl);
    }

    /**
     * Serializes a W3C node and parses the result back into a jsoup element.
     *
     * <p>The serialized form is XML, so it is parsed with jsoup's XML parser. The HTML parser
     * would drop tags that are only valid in a specific context (for example a bare
     * {@code td}), which made the lookup of such elements fail.
     */
    private static org.jsoup.nodes.Element toJsoupElement(final Node node) {
        if (node == null) {
            return null;
        }
        try {
            String value = getW3cDocString(node);
            org.jsoup.nodes.Document document = Jsoup.parse(value, "", Parser.xmlParser());
            // children() holds element children only, so the XML declaration is skipped.
            Elements roots = document.children();
            return roots.isEmpty() ? null : roots.first();
        } catch (Exception e) {
            log.error("根据ElementImpl得到Jsoup的Element出现错误,错误原因:{}", e.getMessage(), e);
            return null;
        }
    }

    /** Collects the node value of every entry; returns null for a null or empty list. */
    private static List<String> nodeValues(final NodeList nodeList) {
        if (nodeList == null) {
            return null;
        }
        int length = nodeList.getLength();
        if (length <= 0) {
            return null;
        }
        List<String> values = new ArrayList<>(length);
        for (int i = 0; i < length; i++) {
            Node node = nodeList.item(i);
            values.add(null == node ? null : node.getNodeValue());
        }
        return values;
    }

    /**
     * Converts a W3C document into a jsoup document by serializing and re-parsing it.
     *
     * @param doc W3C document
     * @return the parsed jsoup document
     * @throws Exception when serialization fails
     */
    public static org.jsoup.nodes.Document fromW3C(final Document doc) throws Exception {
        String string = getW3cDocString(doc);
        return Jsoup.parse(string);
    }

    /**
     * Converts a jsoup element (or document) into a W3C document.
     *
     * <p>For a jsoup document the first element child becomes the W3C root; for any other
     * element the element itself does.
     *
     * @param in jsoup element or document
     * @return the W3C document (empty when a jsoup document has no element children), or
     *         {@code null} when {@code in} is null or no DOM builder can be created
     */
    public static Node fromJsoup(final org.jsoup.nodes.Element in) {
        if (null == in) {
            return null;
        }
        Document out;
        try {
            out = newW3cDocument();
        } catch (ParserConfigurationException e) {
            log.error("创建w3c的Document失败,错误原因:{}", e.getMessage(), e);
            return null;
        }
        org.jsoup.nodes.Element root = in;
        if (in instanceof org.jsoup.nodes.Document) {
            // Look at element children: a document may hold only non-element nodes, in which
            // case child(0) would throw.
            Elements children = in.children();
            if (children.isEmpty()) {
                return out;
            }
            root = children.first();
        }
        root.traverse(new W3CBuilder(out));
        return out;
    }

    /** Creates an empty W3C document; the shared factory is only used under LOCK. */
    private static Document newW3cDocument() throws ParserConfigurationException {
        LOCK.lock();
        try {
            DocumentBuilder builder = factory.newDocumentBuilder();
            return builder.newDocument();
        } finally {
            LOCK.unlock();
        }
    }

    /**
     * Serializes a W3C node to a string using a default (identity) transformer.
     *
     * @param doc node to serialize
     * @return the serialized markup
     * @throws IllegalStateException when the transformation fails
     * @throws Exception             declared for backward compatibility
     */
    public static String getW3cDocString(final Node doc) throws Exception {
        try (StringWriter writer = new StringWriter()) {
            DOMSource domSource = new DOMSource(doc);
            StreamResult result = new StreamResult(writer);
            LOCK.lock();
            try {
                Transformer transformer = tf.newTransformer();
                transformer.transform(domSource, result);
                return writer.toString();
            } finally {
                LOCK.unlock();
            }
        } catch (TransformerException e) {
            throw new IllegalStateException(e);
        }
    }

    /**
     * Copies the attributes of a jsoup node onto a W3C element.
     *
     * <p>Attributes the W3C element rejects (for example names that are not legal XML names)
     * are skipped, so that a single odd attribute does not abort the whole conversion.
     *
     * @param source jsoup node to read attributes from
     * @param el     W3C element to write attributes to
     */
    public static void copyAttributes(final org.jsoup.nodes.Node source, final Element el) {
        for (Attribute attribute : source.attributes()) {
            try {
                el.setAttribute(attribute.getKey(), attribute.getValue());
            } catch (DOMException e) {
                log.debug("跳过无法写入w3c的属性:{},原因:{}", attribute.getKey(), e.getMessage());
            }
        }
    }
}

/**
 * jsoup node visitor that mirrors the visited jsoup tree into a W3C document.
 */
class W3CBuilder implements NodeVisitor {

    private final Document doc;

    /** W3C element that currently receives children; null until the first element is seen. */
    private Element dest;

    public W3CBuilder(Document doc) {
        this.doc = doc;
    }

    @Override
    public void head(final org.jsoup.nodes.Node source, int depth) {
        if (source instanceof org.jsoup.nodes.Element) {
            org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;
            Element el = doc.createElement(sourceEl.tagName());
            JsoupParserUtil.copyAttributes(sourceEl, el);
            if (dest == null) {
                // First element visited becomes the document root.
                doc.appendChild(el);
            } else {
                dest.appendChild(el);
            }
            dest = el;
        } else if (source instanceof org.jsoup.nodes.TextNode) {
            org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
            appendToCurrent(doc.createTextNode(sourceText.getWholeText()));
        } else if (source instanceof org.jsoup.nodes.Comment) {
            org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
            appendToCurrent(doc.createComment(sourceComment.getData()));
        } else if (source instanceof org.jsoup.nodes.DataNode) {
            org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
            appendToCurrent(doc.createTextNode(sourceData.getWholeData()));
        }
        // Other node types (doctype, XML declaration, ...) are intentionally not mirrored.
    }

    @Override
    public void tail(final org.jsoup.nodes.Node source, int depth) {
        // Leaving an element: step back up, unless we are already at the root element.
        if (source instanceof org.jsoup.nodes.Element
                && dest != null
                && dest.getParentNode() instanceof Element) {
            dest = (Element) dest.getParentNode();
        }
    }

    /** Appends a non-element node to the current element; ignored when there is none yet. */
    private void appendToCurrent(final Node child) {
        if (dest != null) {
            dest.appendChild(child);
        }
    }
}
测试
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | import java.io.IOException; import java.net.URL; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; public class JsoupParserUtilsTest { public static void main(String[] args) throws Exception, IOException { String url = "http://mil.news.sina.com.cn/china/2016-09-29/doc-ifxwmamy9955666.shtml"; Document doc = Jsoup.parse(new URL(url), 10000); String titleXpath = "//*[@id='main_title']/text()"; String timeXpath = "//*[@id='page-tools']/span/span[position() = 1]"; System.out.println(JsoupParserUtils.exists(doc, "/html/body/div[position>1000000]")); System.out.println(JsoupParserUtils.getXpathString(doc, titleXpath)); Element element = JsoupParserUtils.getJsoupElement(doc, timeXpath); System.out.println(element.text()); System.out.println(element.attr("class")); } } |
————————————————
让你的Jsoup支持Xpath