// jsoup page-scraping test (jsoup页面抓取测试)

package com.xy.xmweb.Controller;
/**
 * Utility class for fetching and scraping web pages.
 */

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.xy.entity.INewsData;


public class JsoupFirstExtract {

	/**
	 * Manual smoke test: prints separator lines, fetches a news list page through
	 * {@link #clawer2(String)} and prints the links found in its first {@code td},
	 * then fetches a second page directly with jsoup and prints entries returned by
	 * {@link #getIntfaceData(String, int)}.
	 *
	 * NOTE(review): this method is damaged in the source as published (see the note
	 * at the second {@code for} loop below) and does not compile as-is.
	 *
	 * @param args command-line arguments; not read by this method
	 */
	public static void main(String[] args) {
		//parseHtml();		
		//parseBody();
		//parseUrl();
		System.out.println("=========================================");
		System.out.println("=========================================");
		System.out.println("=========================================");
		System.out.println("=========================================");
		//navigation();		
		//extractElement();
//		navigation();
		
		try {
			// Download the raw page text without following redirects.
			String httpCount = JsoupFirstExtract.clawer2("http://www1.xy.com/myoffice/news.do?optType=1&pageNo=1&pageSize=10");
			
			// Use jsoup to parse the downloaded content and work with the page
			// elements the same way one would with an HTML document object.
			// The second argument is the base URI used to resolve relative links.
			Document doc = Jsoup.parse(httpCount, "http://www1.xy.com/");
			Element body = doc.body();
			// NOTE(review): first() presumably yields null when the page has no <td>;
			// the next line would then throw and be reported by the catch below - confirm.
			Element span = body.select("td").first();
			Elements links = span.getElementsByTag("a");
			for (Element element : links) {
				// absUrl resolves the href against the base URI given to Jsoup.parse.
				String linkAbsHref = element.absUrl("href");
				String linkText = element.text();
				System.out.println("linkAbsHref=:"+linkAbsHref);
				System.out.println(""+linkText+"");
			}
			
		} catch (Exception e) {
			e.printStackTrace();
		}
		
		int pageSize = 10;

		try {			
			//http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize=10
			// Fetch the page directly with jsoup, using a 10 second timeout.
			Document doc = Jsoup.connect("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize="+pageSize).timeout(10000).get(); 
			Elements as = doc.select("a[href]");
			System.out.println(as.size());
			// Never iterate past the number of links actually present on the page.
			if(pageSize > as.size()){
				pageSize = as.size();
			}
//			for (Element a : as) {
//				System.out.println(a.attr("href") + "###" + a.html()); 
//			}
			// All table cells that carry no title attribute.
			Elements tds = doc.select("td:not([title])");
//			for (Element td : tds) {
//				System.out.println(td.html()); 
//			}
			// NOTE(review): the next line is corrupted. Everything between "i" and " list"
			// appears to have been lost (presumably an HTML stripper removed the text from
			// a '<' up to a later '>'). The loop condition and body, the catch clause of this
			// try, and the declaration of "list" are not visible - restore from the original
			// source before use. The typed list.get(i) below suggests "list" was declared
			// with INewsData as its element type - confirm.
			for(int i=0;i list = getIntfaceData("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize=",10);
		// Print every entry returned by getIntfaceData.
		if (list != null && list.size() > 0) {
			for (int i = 0; i < list.size(); i++) {
				INewsData newsData = list.get(i);
				System.out.println("=============newDate----getAhref-----:"+newsData.getAhref());
				System.out.println("=============newDate----getDatetime-----:"+newsData.getDatetime());
				System.out.println("=============newDate----getTitle-----:"+newsData.getTitle());
			}
		}
		
		}
	
	
	/**
	 * Fetches {@code url + pageSize} with jsoup (10 second timeout), selects the
	 * anchors that have an href and the table cells without a title attribute, and
	 * collects results into a list.
	 *
	 * NOTE(review): this method is damaged in the source as published. The loop that
	 * fills the list, the end of the try block and the return statement are missing,
	 * and the method runs directly into the tail of what looks like a separate
	 * HTML-parsing demo (presumably the parseHtml() referenced from main). It does
	 * not compile as-is; restore from the original source.
	 *
	 * @param url      request URL up to and including "pageSize="; the page size is appended
	 * @param pageSize requested number of entries; lowered to the number of links found
	 * @return the list created at the top of the method (the return statement itself is
	 *         not visible - confirm against the original)
	 */
	public static List getIntfaceData(String url, int pageSize) {
		
		List list = new ArrayList();
		try {
			//Document docconect = Jsoup.connect("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize="+pageSize).timeout(10000).get();
			Document doc = Jsoup.connect(url+pageSize).timeout(10000).get();
//			Document doc = Jsoup.parse(docconect.toString(),"http://www1.xy.com/");
			Elements as = doc.select("a[href]");
			//System.out.println("======count====="+as.size());
			// Never iterate past the number of links actually present on the page.
			if(pageSize > as.size()){
				pageSize = as.size();
			}
			// All table cells that carry no title attribute.
			Elements tds = doc.select("td:not([title])");
			// NOTE(review): the next line is corrupted. The text after "i" up to the middle
			// of an HTML string literal appears to have been lost (presumably removed by an
			// HTML stripper). The lines that follow parse a local "html" string and print
			// the document, its head, body and title; they seem to belong to another method.
			for(int i=0;iParsed HTML into a doc.";
		Document doc = Jsoup.parse(html);		
		System.out.println(doc);
		System.out.println("Print the html head --------------------");
		System.out.println(doc.head());
		System.out.println("Print the html body --------------------");
		System.out.println(doc.body());
		System.out.println("Print the html title --------------------");
		System.out.println(doc.title());
	}
	
	/**
	 * Parses a hard-coded body fragment with jsoup and prints the resulting
	 * body element to standard output.
	 */
	public static void parseBody() {
		final String fragment = "Lorem ipsum.";
		// Only the body of the parsed fragment is of interest here.
		final Element parsedBody = Jsoup.parseBodyFragment(fragment).body();

		System.out.println("Print the body --------------------");
		System.out.println(parsedBody);
	}
	
	/**
	 * Fetches the hard-coded news.do page (pageNo=1, pageSize=10) with jsoup and
	 * prints the parsed document to standard output.
	 * An {@link IOException} during the fetch is reported with its stack trace and
	 * nothing else is printed.
	 */
	public static void parseUrl() {
		final String pageUrl = "http://www1.xy.com/myoffice/news.do?optType=1&pageNo=1&pageSize=10";

		final Document fetched;
		try {
			fetched = Jsoup.connect(pageUrl).get();
		} catch (IOException fetchFailure) {
			fetchFailure.printStackTrace();
			return;
		}

		System.out.println("Print the Url --------------------");
		System.out.println(fetched);
	}
	
	public static void navigation() {
		String html="First parse"
				  + "Parsed HTML into a doc.
"
				  + "hahaha"
				  + "bababa"
				  +"";
		Document doc = Jsoup.parse(html, "http://192.168.3.84/gamestore/index.html");
		
		Element content = doc.getElementById("content");
		Elements links = content.getElementsByTag("a");
		for (Element link : links) {
		  String linkHref = link.attr("href");
		  String linkAbsHref = link.absUrl("href");
		  String linkText = link.text();
		  
		  System.out.println(linkHref);
		  System.out.println(linkAbsHref);
		  System.out.println(linkText);
		}

	}
	
	/**
	 * Demonstrates extracting attributes, text and HTML from an element: parses a
	 * one-paragraph snippet containing a link and prints the body text, the link's
	 * href, its text, its outer HTML and its inner HTML.
	 */
	public static void extractElement() {
		// The tags of this literal had been stripped, so select("a").first() returned
		// null and the calls on "link" threw a NullPointerException. The markup is
		// restored to match the expected values noted next to each statement.
		String html = "<p>An <a href='http://example.com/'><b>example</b></a> link.</p>";
		Document doc = Jsoup.parse(html);
		Element link = doc.select("a").first();
		if (link == null) {
			// Defensive: nothing to extract when the document has no anchor.
			System.out.println("No link found.");
			return;
		}

		String text = doc.body().text(); // "An example link."
		String linkHref = link.attr("href"); // "http://example.com/"
		String linkText = link.text(); // "example"

		String linkOuterH = link.outerHtml();
		    // "<a href="http://example.com/"><b>example</b></a>"
		String linkInnerH = link.html(); // "<b>example</b>"

		System.out.println(text);
		System.out.println(linkHref);
		System.out.println(linkText);
		System.out.println(linkOuterH);
		System.out.println(linkInnerH);
	}
	
	/**
	 * Downloads the response body of {@code myurl} as UTF-8 text without following
	 * HTTP redirects.
	 * <p>
	 * Pages that redirect in a chain can fail with "Server redirected too many
	 * times". When only the content served at this exact URL is wanted, redirect
	 * following is switched off for this connection so the first response is read
	 * as-is.
	 * <p>
	 * Every line of the response is terminated with {@code "\r\n"} in the result,
	 * whatever line ending the server used. No connect or read timeout is set.
	 *
	 * @param myurl absolute http/https URL to fetch
	 * @return the response body, one {@code "\r\n"}-terminated line per input line
	 * @throws Exception if the URL is malformed, is not an HTTP(S) URL, or the
	 *                   connection or read fails
	 */
	public static String clawer2(String myurl) throws Exception {
		URL target = new URL(myurl);
		HttpURLConnection con = (HttpURLConnection) target.openConnection();
		// Disable redirects for this connection only. The JVM-wide static
		// HttpURLConnection.setFollowRedirects is deliberately not touched, since
		// changing it would affect every other connection in the process.
		con.setInstanceFollowRedirects(false);

		BufferedReader reader = null;
		try {
			con.connect();
			reader = new BufferedReader(new InputStreamReader(con.getInputStream(), "UTF-8"));

			StringBuilder page = new StringBuilder();
			String line;
			while ((line = reader.readLine()) != null) {
				page.append(line).append("\r\n");
			}
			return page.toString();
		} finally {
			// Release the stream and the connection on success and on failure alike.
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException ignored) {
					// The body has already been read (or a real error is propagating);
					// a failure while closing must not mask either outcome.
				}
			}
			con.disconnect();
		}
	}

}