From 69e3511a6c7bd0d125907a7346c251f5b218d9b9 Mon Sep 17 00:00:00 2001
From: yhongowo <10617231+yhongowo@user.noreply.gitee.com>
Date: Sat, 19 Mar 2022 09:56:51 +0000
Subject: [PATCH] =?UTF-8?q?update=20src/main/java/fun/ticsmyc/crawler/Tool?=
 =?UTF-8?q?s.java.=20=E4=BD=BF=E7=94=A8Htmlunit=EF=BC=8C=E8=8E=B7=E5=8F=96?=
 =?UTF-8?q?=E5=8E=9F=E7=BD=91=E7=AB=99=E4=B8=AD=E4=BB=A5JavaScript?=
 =?UTF-8?q?=E5=8A=A0=E8=BD=BD=E7=9A=84=E6=95=B0=E6=8D=AE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/main/java/fun/ticsmyc/crawler/Tools.java | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/src/main/java/fun/ticsmyc/crawler/Tools.java b/src/main/java/fun/ticsmyc/crawler/Tools.java
index c6f8195..b824f48 100644
--- a/src/main/java/fun/ticsmyc/crawler/Tools.java
+++ b/src/main/java/fun/ticsmyc/crawler/Tools.java
@@ -1,5 +1,7 @@
 package fun.ticsmyc.crawler;
 
+import com.gargoylesoftware.htmlunit.WebClient;
+import com.gargoylesoftware.htmlunit.html.HtmlPage;
 import org.apache.log4j.Logger;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
@@ -44,13 +46,27 @@ public class Tools {
     }
 
     /**
-     * 通过Jsoup获取整个html页面
-     * @param url
-     * @return
+     * Loads the page with HtmlUnit so that its JavaScript runs, then parses the
+     * rendered markup with Jsoup and stores the result in {@code page}.
+     *
+     * @param url address of the page to load
      */
     public static void getPageByJSoup(String url) {
-        try {
-            page = Jsoup.connect(url).get();
+        // WebClient is AutoCloseable; try-with-resources releases its windows and
+        // background JavaScript threads even when loading fails.
+        try (WebClient browser = new WebClient()) {
+            // CSS is not needed for scraping; JavaScript is, and script errors on
+            // the target site must not abort the load.
+            browser.getOptions().setCssEnabled(false);
+            browser.getOptions().setJavaScriptEnabled(true);
+            browser.getOptions().setThrowExceptionOnScriptError(false);
+
+            HtmlPage htmlPage = browser.getPage(url);
+            // Give background JavaScript up to 600 ms to finish populating the DOM.
+            browser.waitForBackgroundJavaScript(600);
+
+            // Serialize the rendered DOM to XML and hand it to Jsoup.
+            page = Jsoup.parse(htmlPage.asXml());
         } catch (IOException e) {
-            logger.error("jsoup获取页面失败");
+            logger.error("jsoup获取页面失败", e);
         }
-- 
Gitee