需求
1,寻找目标网站规律特点
2,反爬的处理。\
目的
将网站中的每一个专辑内的图片都下载下来。
代码
import java.io.File;
public slass SpiderWeb{
private static String fileParent="美女图片";
public static void main(String[] args){
//1,先创建总文件夹
File file= new File(fileParent);
if(!file.exists()){
file.mkdir();
}
//2,分析页码情况
for(int i=1;i<=5;i++){
String pagePath="http://www.win4000.com/meinvtage4_+i+".html";
//3,去下载当前页中所有的专辑
downPage(pagePath);
}
}
//下载当前页中的专辑
private static void downPage(String pagePath){
//4,分析当前页中专辑的信息。
URL url=new URL(pagePath);
Document document=Jsoup.parse(url,100000);
Element div= document.getElementsByAttributeValue("class","tab_box").get(1);
Element lis=div.getElementsByTag("li");
//5,获取每一个专辑的名称和URL
for(Element li:lis){
String picsPath = li.getElementsByTag("a").get(0).attr("herf");
String picsName = li.getElementsByTag("img").get(0).attr("title");
downPictures(picsName,picsPath);
}
}
//创建专辑的文件夹并分析专辑中图片的内容
private static void downPictures(String picsName,String picsPath){
File picsDir = new File(fileParent,picsName);
if(!picsDir.exists()){
picsDir.mkdir();
}
//6,分析专辑中图片总数
URL url=new URL(picsPath);
Document document = Jsoup.parse(url,100000);
int total =Integer.parseInt( document.getElementsByTag("em").get(0).html());
//7,遍历总个数,分析每一个图片的URL
for(int i=1;i<=total;i++){
StringBuilder sb = new StringBuilder( picsPath);
sb.insert(34,"_"+i);
String picPath = sb.toString();
//图片所在的目录,图片的名称编号,图片所在的网页
new Thread(new DownLoadPictureTask(picDir,i+".jpg",picsPath)).start();
}
}
}
/**
 * Downloads a single picture: parses the page that shows the picture, reads the
 * image address from the element whose class is "pic-large", and copies the image
 * bytes into a file. Errors are reported on standard error; they never propagate
 * out of {@link #run()}.
 */
class DownLoadPictureTask implements Runnable {
    /** Timeout, in milliseconds, for fetching the page and for the image connection. */
    private static final int TIMEOUT_MILLIS = 100000;

    /** Size of the copy buffer in bytes. */
    private static final int BUFFER_SIZE = 8192;

    /** Request headers sent with the image request so that it looks like a browser request. */
    private static final String ACCEPT = "image/webp,image/*,*/*;q=0.8";
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            + "(KHTML, like Gecko) Chrome/120.0 Safari/537.36";

    private final File picsDir;
    private final String picsName;
    private final String picsPath;

    /**
     * @param picsDir  folder the picture is written to
     * @param picsName file name of the picture inside {@code picsDir}
     * @param picsPath URL of the web page that shows the picture
     */
    public DownLoadPictureTask(File picsDir, String picsName, String picsPath) {
        this.picsDir = picsDir;
        this.picsName = picsName;
        this.picsPath = picsPath;
    }

    @Override
    public void run() {
        HttpURLConnection conn = null;
        try {
            // Target file of this picture.
            File file = new File(picsDir, picsName);
            // Resolve the real image address from the page that shows the picture.
            Document document = Jsoup.parse(new URL(picsPath), TIMEOUT_MILLIS);
            Elements large = document.getElementsByAttributeValue("class", "pic-large");
            if (large.isEmpty()) {
                System.err.println("No picture element found on: " + picsPath);
                return;
            }
            String imgPath = large.get(0).attr("url");
            if (imgPath.isEmpty()) {
                System.err.println("Picture element has no image address on: " + picsPath);
                return;
            }
            // Open an HTTP connection to the image itself.
            conn = (HttpURLConnection) new URL(imgPath).openConnection();
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            conn.setReadTimeout(TIMEOUT_MILLIS);
            // Imitate a browser; empty header values would not do that.
            conn.setRequestProperty("Accept", ACCEPT);
            conn.setRequestProperty("User-Agent", USER_AGENT);
            conn.connect();
            // try-with-resources closes both streams even when the copy fails;
            // closing the output stream also flushes it.
            try (BufferedInputStream bis = new BufferedInputStream(conn.getInputStream());
                 BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file))) {
                byte[] buf = new byte[BUFFER_SIZE];
                int len;
                while ((len = bis.read(buf)) != -1) {
                    bos.write(buf, 0, len);
                }
            }
        } catch (Exception e) {
            // Thread boundary: report the failure instead of killing the thread silently.
            System.err.println("Failed to download picture from: " + picsPath);
            e.printStackTrace();
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }
}