1 Star 0 Fork 2

码大侠/红袖网小说爬虫(最简单版)

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
Main.java 3.36 KB
一键复制 编辑 原始数据 按行查看 历史
誓天不相FU0 提交于 2018-10-23 13:15 . 1.0版本
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal crawler for a novel hosted on hongxiu.com.
 *
 * <p>{@link #main(String[])} downloads the book's catalog page, extracts every chapter link
 * with a regular expression, and for each chapter appends a "title + link" entry to
 * {@code 1.html} and writes the chapter text to {@code <title>.txt}. All files are created in
 * the current working directory and are written as UTF-8, the same encoding used to decode
 * the downloaded pages.
 */
public class Main {

    /** Catalog page of the book to download. */
    private static final String CATALOG_URL =
            "https://www.hongxiu.com/book/11809422404543803#Catalog";

    /** Only links containing this prefix are treated as chapters. */
    private static final String CHAPTER_URL_PREFIX = "https://www.hongxiu.com/chapter/";

    /**
     * Number of leading characters of the captured {@code data-cid} text that form the link.
     * NOTE(review): presumably the length of "//www.hongxiu.com/chapter/<bookId>/<chapterId>"
     * with 17-digit ids; the lazy capture can run past the attribute value, hence the cut.
     */
    private static final int CHAPTER_LINK_LENGTH = 61;

    /** Charset used both for decoding pages and for writing output files. */
    private static final String CHARSET = "utf-8";

    /** Connect/read timeout so a stalled server cannot hang the crawler forever. */
    private static final int TIMEOUT_MILLIS = 30000;

    /** Matches a catalog anchor; group 2 is the data-cid text, group 3 the anchor body. */
    private static final Pattern CHAPTER_LINK =
            Pattern.compile("<(a.*? data-cid=\"(.*?)\")>(.*?)</a>", Pattern.DOTALL);

    /** Matches the chapter text container; group 1 is its inner HTML. */
    private static final Pattern CHAPTER_BODY =
            Pattern.compile(
                    "<div class=\"read-content j_readContent\">(.*?)</div>", Pattern.DOTALL);

    /** Characters that are path separators, illegal on common file systems, or control chars. */
    private static final Pattern UNSAFE_FILE_NAME_CHARS =
            Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]");

    /**
     * Downloads the catalog page and processes every chapter link found in it.
     *
     * <p>A failure while handling one chapter is reported on stderr and does not stop the
     * remaining chapters from being processed.
     *
     * @param args ignored
     * @throws Exception if the catalog page itself cannot be downloaded
     */
    public static void main(String[] args) throws Exception {
        String content = fetchPage(CATALOG_URL);
        Matcher matcher = CHAPTER_LINK.matcher(content);
        while (matcher.find()) {
            String s = matcher.group(2);
            // Clamp the cut so an unexpectedly short capture is skipped by the prefix check
            // below instead of aborting the whole crawl with StringIndexOutOfBoundsException.
            String href = "https:" + s.substring(0, Math.min(CHAPTER_LINK_LENGTH, s.length()));
            String tittle = matcher.group(3);
            if (href.contains(CHAPTER_URL_PREFIX)) {
                try {
                    // Record the title and link in the index file.
                    downHtm(href, tittle);
                    // Download the chapter text into its own file.
                    downTxt(href, tittle);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Downloads one chapter page and appends its text to {@code <tittle>.txt}.
     *
     * <p>Every match of the chapter container is appended after removing the literal
     * {@code <p>} tags and the {@code <%-cInfo.content %>} template placeholder. Characters in
     * the title that cannot safely appear in a file name are replaced with {@code _}.
     *
     * @param href   absolute URL of the chapter page
     * @param tittle chapter title, used as the base name of the output file
     * @throws Exception if the page cannot be downloaded or the file cannot be written
     */
    public static void downTxt(String href, String tittle) throws Exception {
        String content = fetchPage(href);
        File target = new File(toFileName(tittle) + ".txt");
        Matcher matcher = CHAPTER_BODY.matcher(content);
        while (matcher.find()) {
            String text = matcher.group(1).replace("<p>", "").replace("<%-cInfo.content %>", "");
            appendToFile(target, text);
        }
    }

    /**
     * Appends one "title link" entry, surrounded by CRLF line breaks, to {@code 1.html}.
     *
     * @param href   chapter URL to record
     * @param tittle chapter title to record
     * @throws Exception if the index file cannot be written
     */
    public static void downHtm(String href, String tittle) throws Exception {
        appendToFile(new File("1.html"), "\r\n" + tittle + " " + href + "\r\n");
    }

    /** Downloads a page as UTF-8 text, joining its lines without separators. */
    private static String fetchPage(String address) throws IOException {
        URLConnection conn = new URL(address).openConnection();
        conn.setConnectTimeout(TIMEOUT_MILLIS);
        conn.setReadTimeout(TIMEOUT_MILLIS);
        StringBuilder page = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream) even on failure.
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(conn.getInputStream(), CHARSET))) {
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line);
            }
        }
        return page.toString();
    }

    /** Appends text to a file as UTF-8, always closing the file handle. */
    private static void appendToFile(File file, String text) throws IOException {
        try (Writer writer = new OutputStreamWriter(new FileOutputStream(file, true), CHARSET)) {
            writer.write(text);
        }
    }

    /** Replaces characters that are unsafe in a file name (the title is untrusted input). */
    private static String toFileName(String tittle) {
        return UNSAFE_FILE_NAME_CHARS.matcher(String.valueOf(tittle)).replaceAll("_");
    }
}
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Java
1
https://gitee.com/madaxia8/javaget.git
git@gitee.com:madaxia8/javaget.git
madaxia8
javaget
红袖网小说爬虫(最简单版)
master

搜索帮助