package com.xy.xmweb.Controller;
/**
* Utility class for fetching and scraping web pages.
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.xy.entity.INewsData;
public class JsoupFirstExtract {
/**
 * Ad-hoc driver: fetches the news-list page, prints the links found in its first table cell,
 * then fetches another page of the list and prints the entries returned by getIntfaceData.
 *
 * NOTE(review): this method is textually damaged in this copy of the file. The line that opens
 * the for loop inside the second try block stops after {@code for(int i=0;i} and continues
 * directly with {@code list = getIntfaceData(...)}. The rest of the loop, the end of the try
 * block with its catch clause, and the declaration of {@code list} are missing, so the method
 * does not compile as shown. Restore the lost lines from version control before changing
 * anything here.
 *
 * @param args command-line arguments; not read by the code visible here
 */
public static void main(String[] args) {
//parseHtml();
//parseBody();
//parseUrl();
System.out.println("=========================================");
System.out.println("=========================================");
System.out.println("=========================================");
System.out.println("=========================================");
//navigation();
//extractElement();
// navigation();
try {
// Download the raw page text through clawer2 instead of letting jsoup fetch it.
String httpCount = JsoupFirstExtract.clawer2("http://www1.xy.com/myoffice/news.do?optType=1&pageNo=1&pageSize=10");
// Parse the downloaded content with jsoup.
// The page's elements can then be handled like the nodes of an HTML document object.
Document doc = Jsoup.parse(httpCount, "http://www1.xy.com/");
Element body = doc.body();
// Only the first td of the page is inspected.
// NOTE(review): first() yields null when the page has no td, which would fail on the next line.
Element span = body.select("td").first();
Elements links = span.getElementsByTag("a");
for (Element element : links) {
// absUrl resolves the href against the base URI passed to Jsoup.parse above.
String linkAbsHref = element.absUrl("href");
String linkText = element.text();
System.out.println("linkAbsHref=:"+linkAbsHref);
System.out.println(""+linkText+"");
}
} catch (Exception e) {
e.printStackTrace();
}
int pageSize = 10;
try {
//http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize=10
Document doc = Jsoup.connect("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize="+pageSize).timeout(10000).get();
Elements as = doc.select("a[href]");
System.out.println(as.size());
// Never iterate past the number of links actually present on the page.
if(pageSize > as.size()){
pageSize = as.size();
}
// for (Element a : as) {
// System.out.println(a.attr("href") + "###" + a.html());
// }
// Table cells that carry no title attribute; how they are used is part of the missing text.
Elements tds = doc.select("td:not([title])");
// for (Element td : tds) {
// System.out.println(td.html());
// }
// NOTE(review): the next line is where the text was lost (see the method comment).
for(int i=0;i list = getIntfaceData("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize=",10);
if (list != null && list.size() > 0) {
for (int i = 0; i < list.size(); i++) {
INewsData newsData = list.get(i);
System.out.println("=============newDate----getAhref-----:"+newsData.getAhref());
System.out.println("=============newDate----getDatetime-----:"+newsData.getDatetime());
System.out.println("=============newDate----getTitle-----:"+newsData.getTitle());
}
}
}
/**
 * Fetches {@code url + pageSize} with jsoup (timeout argument 10000, which jsoup reads as
 * milliseconds) and selects the page's links and its untitled table cells.
 *
 * NOTE(review): the text of this method is damaged from its for loop onward. The loop header
 * stops after {@code for(int i=0;i} and is followed directly by the tail of a different method
 * that parses a literal HTML string and prints its head, body and title (presumably parseHtml,
 * which main refers to in a commented-out call). The code that fills and returns the list, the
 * end of the try block, and the start of that other method are all missing, so this does not
 * compile as shown. Restore the lost lines from version control before changing anything here.
 *
 * @param url request URL prefix; the page size is appended to it as text
 * @param pageSize requested number of entries; lowered to the number of {@code a[href]}
 *        elements found when the page has fewer
 * @return presumably the scraped entries (main reads them as INewsData); the statements that
 *         populate and return the list are not visible here
 */
public static List getIntfaceData(String url, int pageSize) {
// NOTE(review): raw List here; the generic type arguments appear to have been lost with the rest.
List list = new ArrayList();
try {
//Document docconect = Jsoup.connect("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize="+pageSize).timeout(10000).get();
Document doc = Jsoup.connect(url+pageSize).timeout(10000).get();
// Document doc = Jsoup.parse(docconect.toString(),"http://www1.xy.com/");
Elements as = doc.select("a[href]");
//System.out.println("======count====="+as.size());
// Never iterate past the number of links actually present on the page.
if(pageSize > as.size()){
pageSize = as.size();
}
// Table cells that carry no title attribute; how they are used is part of the missing text.
Elements tds = doc.select("td:not([title])");
for(int i=0;iParsed HTML into a doc.
";
Document doc = Jsoup.parse(html);
System.out.println(doc);
System.out.println("Print the html head --------------------");
System.out.println(doc.head());
System.out.println("Print the html body --------------------");
System.out.println(doc.body());
System.out.println("Print the html title --------------------");
System.out.println(doc.title());
}
public static void parseBody() {
String html = "Lorem ipsum.
";
Document doc = Jsoup.parseBodyFragment(html);
Element body = doc.body();
System.out.println("Print the body --------------------");
System.out.println(body);
}
/**
 * Fetches the first page of the news list with jsoup and prints the parsed document to stdout.
 * An IOException raised by the fetch is not propagated; its stack trace is printed instead.
 */
public static void parseUrl() {
    final String newsListUrl = "http://www1.xy.com/myoffice/news.do?optType=1&pageNo=1&pageSize=10";
    Document page;
    try {
        page = Jsoup.connect(newsListUrl).get();
    } catch (IOException fetchFailure) {
        fetchFailure.printStackTrace();
        return;
    }
    System.out.println("Print the Url --------------------");
    System.out.println(page);
}
public static void navigation() {
String html="First parse"
+ "Parsed HTML into a doc.
";
Document doc = Jsoup.parse(html, "http://192.168.3.84/gamestore/index.html");
Element content = doc.getElementById("content");
Elements links = content.getElementsByTag("a");
for (Element link : links) {
String linkHref = link.attr("href");
String linkAbsHref = link.absUrl("href");
String linkText = link.text();
System.out.println(linkHref);
System.out.println(linkAbsHref);
System.out.println(linkText);
}
}
public static void extractElement() {
String html = "An example link.
";
Document doc = Jsoup.parse(html);
Element link = doc.select("a").first();
String text = doc.body().text(); // "An example link"
String linkHref = link.attr("href"); // "http://example.com/"
String linkText = link.text(); // "example""
String linkOuterH = link.outerHtml();
// "example"
String linkInnerH = link.html(); // "example"
System.out.println(text);
System.out.println(linkHref);
System.out.println(linkText);
System.out.println(linkOuterH);
System.out.println(linkInnerH);
}
/**
 * Downloads the body of a single URL without following HTTP redirects.
 *
 * <p>Pages that redirect in a chain can make the fetch fail with "Server redirected too many
 * times", because each response points at yet another page. When only the content served at
 * this exact URL is wanted, redirects are switched off for the connection and whatever the
 * server returns for the URL itself is read.
 *
 * @param myurl absolute URL to fetch
 * @return the response body decoded as UTF-8, with every line terminated by {@code \r\n};
 *         an empty string when the response has no body
 * @throws Exception if the URL is malformed, the connection cannot be opened, or reading the
 *         response fails
 */
public static String clawer2(String myurl) throws Exception {
    URL urlmy = new URL(myurl);
    HttpURLConnection con = (HttpURLConnection) urlmy.openConnection();
    // Only this connection is configured. The earlier static setFollowRedirects(true) call,
    // made through the instance, changed the JVM-wide default for every other connection and
    // contradicted the purpose of this method, so it has been dropped.
    con.setInstanceFollowRedirects(false);
    try {
        con.connect();
        BufferedReader br = new BufferedReader(new InputStreamReader(con.getInputStream(), "UTF-8"));
        try {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                // Line endings are normalised to CRLF, as before.
                sb.append(line).append("\r\n");
            }
            return sb.toString();
        } finally {
            // Closing the reader also closes the underlying response stream.
            br.close();
        }
    } finally {
        con.disconnect();
    }
}
}