初学java爬虫,用jsoup爬晋江免费章节

128 阅读3分钟
  • 小说实体
public class Novel {
	String title;		//书名
	String author;		//作者
	List<Chapter> chapters;//章节
	String novelId;	//小说id
	String url;		//小说url

	public Novel(String novelId) {//构造函数
		this.novelId = novelId;
		this.url = NovelUtils.NOVEL_HOME_URL+novelId;
	}
	//getter、setter方法省略
}
  • 章节实体
public class Chapter {
	String chapterNum;		//章节号
	String chapterTitle;		//章节名
	String url;	//章节地址
	boolean isVip;	//是否vip
	String content;	//小说内容
	
	//getter、setter方法省略
}
  • 获取小说和章节工具类
public class NovelUtils {
	
	public final static String DOMAIN = "http://my.jjwxc.net";
	public final static String NOVEL_HOME_URL = "http://www.jjwxc.net/onebook.php?novelid=";
	public final static String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586";
	private final static String FILE_ENCODING = "UTF-8";	//文件编码
	
	public static Novel getNovelInfo(Novel novel) {
		//小说首页dom
		Document doc = null;
		try {
			doc = Jsoup.connect(novel.getUrl())
					.timeout(5000)
					.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8")//.header()设置头信息
					.header("Accept-Encoding", "gzip, deflate")
					.header("Accept-Language", "zh-CN,zh;q=0.9")
					.header("Cache-Control", "max-age=0")
					.header("Host", "www.jjwxc.net")
					.ignoreContentType(true)
					.ignoreHttpErrors(true)
					.userAgent(USER_AGENT)
					.get();
		} catch (IOException e1) {
			System.out.println("小说首页获取失败");
			e1.printStackTrace();
		}
		//小说名和作者名
		novel.setTitle(doc.getElementById("oneboolt").getElementsByTag("tbody").get(0).child(0).getElementsByTag("h1").get(0).text().trim());
		novel.setAuthor(doc.getElementById("oneboolt").getElementsByTag("tbody").get(0).child(0).getElementsByTag("h2").get(0).getElementsByTag("a").text().trim());
		System.out.println(novel.getTitle()+" - "+novel.getAuthor());
		//章节table列表
		List<Element> chapterTrList = doc.getElementById("oneboolt").getElementsByTag("tbody").get(0).getElementsByTag("tr");
		//待set进novel的章节列表
		List<Chapter> chapterList = new ArrayList<Chapter>();
		//获取每章节信息
		for (Element chapterTr : chapterTrList) {
			//如果不是章节列
			if (!chapterTr.hasAttr("itemprop")) {
				continue;
			}
			Chapter chapter = new Chapter();
			chapter.setVip(chapterTr.getElementsByTag("td").size() == 5);//一行只有5列则为vip章节
			if (chapter.isVip()) {//不获取v章内容
				break;
			}
			chapter.setChapterNum(chapterTr.child(0).text().trim());
			chapter.setChapterTitle(chapterTr.child(1).text().trim());
			//锁章没有url
			try {
				chapter.setUrl(chapterTr.child(1).getElementsByTag("a").get(0).attr("href"));
			} catch(IndexOutOfBoundsException e) {
				chapter.setContent("章节已锁");
			}
			chapter.setContent(getChapterContent(chapter, novel.getUrl()));
			chapterList.add(chapter);
		}
		novel.setChapters(chapterList);
		return novel;
	}
	
	public static String getChapterContent(Chapter chapter, String novelUrl) {
		System.out.println("正在获取第"+chapter.getChapterNum()+"章 "+chapter.getChapterTitle());
		//如果是锁章,返回
		if (chapter.getUrl() == null) {
			return chapter.getContent();
		}
		//获取文章dom
		Document doc = null;
		try {
			doc = Jsoup.connect(chapter.getUrl())
					.timeout(5000)
					.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3")
					.header("Accept-Encoding", "gzip, deflate")
					.header("Accept-Language", "zh-CN,zh;q=0.9")
					.header("Cache-Control", "max-age=0")
					.header("Connection", "keep-alive")
					.header("Host", "www.jjwxc.net")
					.header("Referer", novelUrl)
					.ignoreContentType(true)
					.ignoreHttpErrors(true)
					.userAgent(USER_AGENT)
					.get();
		} catch (IOException e) {
			System.out.println("章节获取失败");
			e.printStackTrace();
		}
		//为了保留换行
		String content = Jsoup.clean(new String(doc.getElementsByClass("noveltext").get(0).html()), "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false));
		//去头去尾
		int beginIndex = content.indexOf("查看收藏列表") + "查看收藏列表".length();
		int endIndex = content.lastIndexOf("插入书签");
		int authorIndex = content.lastIndexOf("插入书签") + "插入书签".length();
		if (beginIndex != -1 && endIndex != -1 && authorIndex != -1) {
			content = content.substring(beginIndex, endIndex).trim() + content.substring(authorIndex).trim();
		}
		return content;
	}
	
	public static void outputFile(Novel novel) {
		System.out.println("输出文件");
		//文件名:作品名 - 作者.txt
		String fileName = novel.getTitle() + " - " + novel.getAuthor() + ".txt";
		File file = new File(fileName);
		//文件不存在则创建文件
		if (!file.exists()) {
			try {
				file.createNewFile();
			} catch (IOException e1) {
				e1.printStackTrace();
			}
		}
		//输出到文件
		try {
			BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName), FILE_ENCODING));
			List<Chapter> chapterList = novel.getChapters();
			for (Chapter chapter : chapterList) {
				System.out.println("正在输出第"+chapter.getChapterNum()+"章 "+chapter.getChapterTitle());
				bw.write("第"+chapter.getChapterNum()+"章");
				if (chapter.getContent() != null) {
					bw.write(chapter.getContent());
				}
				bw.newLine();
			}
			bw.flush();
			bw.close();
		} catch(Exception e) {
			System.out.println("输出失败");
			e.printStackTrace();
		}
		System.out.println("输出完成");
	}
}
public class App {
	
	private static Scanner sc = new Scanner(System.in);
	
	public static void main(String[] args) {
		//输入小说id
		System.out.println("请输入小说id");
		String novelId = sc.next();
		
		//小说实体
		Novel novel = new Novel(novelId);
		novel = NovelUtils.getNovelInfo(novel);
		NovelUtils.outputFile(novel);
	}

}

附上设置代理、登陆和获取cookies的代码:

  • 代理
	private static String ip;	//ip
	private static int port;	//端口
	private static Proxy proxy;	//代理
	public static void getProxy() {
		try {
			Document doc = Jsoup.connect(proxyApi).get();//proxyApi为提取代理的api。这里我用的是蘑菇代理
			String[] proxyArr = doc.text().split(":");
			ip = proxyArr[0];
			port = Integer.parseInt(proxyArr[1]);
			System.out.println(ip+":"+port);
			proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ip, port));
		} catch (IOException e) {
			System.out.println("代理获取失败");
			e.printStackTrace();
		}
	}
  • 使用代理登陆和获取cookies
	private static Map<String, String> cookies;
	public static void login(String loginName, String password) {
		Response response = null;
		try {
			response = Jsoup.connect("http://my.jjwxc.net/login.php")
					.proxy(proxy)//设置代理
					.followRedirects(false)
					.userAgent(USER_AGENT)//设置useragent
					.data("loginname",loginName)//.data()传参
					.data("loginpassword",password)
					.header("Accept", "*/*")//设置头信息
					.header("Encoding", "gzip, deflate")
					.header("Accept-Language", "zh-CN,zh;q=0.9")
					.header("Host", "my.jjwxc.net")
					.header("Referer", "http://www.jjwxc.net/")
					.ignoreContentType(true)
					.ignoreHttpErrors(true)
					.execute();
		} catch (IOException e) {
			System.out.println("登陆失败");
			e.printStackTrace();
		}
		cookies = response.cookies();//获取cookies
	}