Fetching WeChat Avatars from 我要个性网 (woyaogexing.com)


Update: switched to a Java thread pool to speed up the crawl (code link). Since the pages in this example are crawled independently of one another, the program was straightforward to parallelize; a Python sketch of the same idea follows below.

--2019.12.10
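
The update above describes a thread-pool version. Here is a minimal Python sketch of the same idea, assuming Python 3 and the crawl(url, website) function defined below (the linked code itself uses Java, and crawl_parallel is an illustrative name, not part of the original):

# Sketch only: crawl index pages concurrently, since each page is
# independent of the others. crawl is the function defined below.
# Note: crawl's global record counter would need a lock in practice.
from concurrent.futures import ThreadPoolExecutor

def crawl_parallel(page_urls, website, max_workers=8):
    with ThreadPoolExecutor(max_workers=max_workers) as pool:  # waits for all tasks on exit
        for url in page_urls:
            pool.submit(crawl, url, website)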

===============================================

The point here is not the crawler itself, but the learning process.

# -*- coding:utf-8 -*-
import urllib2, urllib, time
from bs4 import BeautifulSoup
import sys, os
reload(sys)
sys.setdefaultencoding('utf-8')  # set the default encoding for output

def crawl(url, website = ""):
    img_dir = "我要个性网"
    if not os.path.isdir(img_dir):
        os.mkdir(img_dir)
    # add a User-Agent header to mimic a browser
    headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/64.0.3282.119 Chrome/64.0.3282.119 Safari/537.36'}
    req = urllib2.Request(url, headers=headers)  # build the request object
    page = urllib2.urlopen(req, timeout=20)  # timeout in case the URL is unreachable or slow to respond
    contents = page.read()  # fetch the page source (readline would fetch a single line)
    #print contents
    soup = BeautifulSoup(contents, 'html.parser')
    alinks = soup.find_all('a', {'class':'imgTitle'})
    global record
    for alink in alinks:
        # if record < 655:  # resume point: set this after a disconnect or stall
        #     record += 1
        #     continue
        dirpath = img_dir + '/' + str(record).zfill(3) + '_' + alink.text
        print dirpath
        if '/' in alink.text:
            deal_error(dirpath + '\n')
            dirpath = img_dir + '/' + str(record).zfill(3) + '_' + alink.text.replace('/', 'or')
        if not os.path.isdir(dirpath):
            os.mkdir(dirpath)
        suburl = website + alink.get('href')
        #print suburl
        subreq = urllib2.Request(suburl, headers=headers)
        subpage = urllib2.urlopen(subreq, timeout=20)
        subcontents = subpage.read()
        # if record == 1:
        #     print subcontents
        subsoup = BeautifulSoup(subcontents, 'html.parser')
        imgs = subsoup.find_all('img', {'class':'lazy'})
        cur = 0
        for img in imgs:
            cur += 1
            link = img.get('src')
            #print link
            filename = dirpath + '/%02d.jpg'%cur
            print filename
            try:
                urllib.urlretrieve(link, filename)  # download and save into the album directory
            except:
                deal_error(filename + "\n" + link + "\n")
        record += 1

def deal_error(string):
    fout = open("log_error.txt", "at")
    fout.write(string)
    fout.close()

record = 1
url = 'http://www.woyaogexing.com/touxiang/weixin/index.html' 
website = 'http://www.woyaogexing.com'
crawl(url, website)
pageNum = 1
while (True): 
    pageNum += 1
    print "请求第==================================================%d===================页" % pageNum
    url = 'http://www.woyaogexing.com/touxiang/weixin/index_%d.html' % pageNum
    crawl(url, website)

#Problems encountered: Connection reset by peer,
#Temporary failure in name resolution;
#eventually a 404 NOT FOUND (past the last index page) aborts the program.
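
These transient failures suggest retrying with backoff and treating the final 404 as a normal stop rather than a crash. A hedged sketch using requests (fetch_page is an illustrative helper, not part of the original script):

# Sketch: retry transient network errors; return None on the final 404
# so the caller can end the page loop cleanly instead of crashing.
import time, requests

def fetch_page(url, retries=3):
    for attempt in range(retries):
        try:
            r = requests.get(url, timeout=20)
            if r.status_code == 404:  # past the last index page: stop cleanly
                return None
            r.raise_for_status()
            return r.content
        except requests.RequestException:
            time.sleep(2 ** attempt)  # back off before retrying
    return None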

Python 3 code:

# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import requests, os, threading, re

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)\
    AppleWebKit/537.36 (KHTML, like Gecko)\
    Chrome/69.0.3497.100 Safari/537.36'}

def loge(msg):
    with open('error_log.txt', 'at+') as fout:
        try:
            fout.write(msg)
        except:
            fout.write('Warning: encoding error')

def save_img(url, path):
    # fetch first, then write, so a failed request does not leave an empty file;
    # headers must be passed as a keyword argument, not positionally
    response = requests.get(url, headers=headers).content
    with open(path, 'wb') as fout:
        fout.write(response)

def spider(url, website=''):
    path = os.path.join(os.getcwd(), 'images')
    if not os.path.exists(path):
        os.mkdir(path)
    response = requests.get(url, headers=headers).content
    soup = BeautifulSoup(response, 'html.parser')
    divs = soup.select('.txList')
    next_page = soup.find('div', {'class':'page'})
    for div in divs:
        try:
            title = re.sub('[\/:*?"<>|\n.]', '_', div.a.get('title'))
            dir_name = os.path.join(path, title)
            if not os.path.exists(dir_name):
                os.mkdir(dir_name)
        except:
            loge('Error: ' + str(div))
            continue
        response = requests.get(website + str(div.a.get('href')), headers=headers).content
        soup = BeautifulSoup(response, 'html.parser')
        lis = soup.select('.tx-img')
        for li in lis:
            img_url = 'http:' + li.a.get('href')
            file_path = os.path.join(dir_name, img_url.split('/')[-1])
            thread = threading.Thread(target=save_img, args=(img_url, file_path))
            thread.start()
        print(os.getpid(), url)
    if next_page:
        next_url = website + str(next_page.findAll('a')[-1].get('href'))
        thread = threading.Thread(target=spider, args=(next_url, website))
        thread.start()

def main():
    website = 'https://www.woyaogexing.com'
    url = 'https://www.woyaogexing.com/touxiang/weixin/'
    # the page markup changes after index_40; see the guard sketch below
    spider(url, website)


if __name__ == '__main__':
    main()
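
Given the note in main() that the markup changes after index_40, one hedged way to stop the recursive spider is to cap the page number parsed from the next-page URL (both the helper and the cutoff of 40 are assumptions based on that comment):

import re

# Sketch: follow next_url only while its index stays within the range
# that still uses the old markup.
def should_follow(next_url, limit=40):
    m = re.search(r'index_(\d+)\.html', next_url)
    return m is None or int(m.group(1)) <= limit

# Usage inside spider(), before spawning the next-page thread:
#     if next_page and should_follow(next_url):
#         threading.Thread(target=spider, args=(next_url, website)).start()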