Update: a Java thread pool was used to speed up the crawling process (code link). Since crawling the individual pages in this example does not affect the others, the program could simply be converted to run in parallel.
--2019.12.10
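The linked code is in Java; purely as an illustration of the same idea in Python, here is a minimal sketch using concurrent.futures.ThreadPoolExecutor. The function name, pool size, and page range are assumptions for illustration, not part of the original code.

# Minimal thread-pool sketch (a Python analogue of the Java approach mentioned above).
# fetch_page, the pool size, and the page range are illustrative assumptions.
from concurrent.futures import ThreadPoolExecutor
import requests

def fetch_page(page_num):
    url = 'https://www.woyaogexing.com/touxiang/weixin/index_%d.html' % page_num
    resp = requests.get(url, timeout=20)
    print(page_num, resp.status_code)   # real parsing/downloading would go here
    return resp.status_code

with ThreadPoolExecutor(max_workers=8) as pool:       # pool size chosen arbitrarily
    codes = list(pool.map(fetch_page, range(2, 11)))  # pages 2-10, for illustration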
===============================================
The emphasis is not on the crawler itself, but on the learning process.
# -*- coding:utf-8 -*-
import urllib2, urllib, time
from bs4 import BeautifulSoup
import sys, os

reload(sys)
sys.setdefaultencoding('utf-8')  # set the default encoding so Chinese text prints correctly


def crawl(url, website=""):
    img_dir = "我要个性网"
    if not os.path.isdir(img_dir):
        os.mkdir(img_dir)
    # add a User-Agent header to mimic a browser
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/64.0.3282.119 Chrome/64.0.3282.119 Safari/537.36'}
    req = urllib2.Request(url, headers=headers)  # build the request object
    page = urllib2.urlopen(req, timeout=20)      # timeout guards against unreachable or slow URLs
    contents = page.read()                       # fetch the whole page source (readline() would fetch one line)
    # print contents
    soup = BeautifulSoup(contents, 'html.parser')
    alinks = soup.find_all('a', {'class': 'imgTitle'})
    global record
    for alink in alinks:
        # if record < 655:  # resume point after a disconnect or hang
        #     record += 1
        #     continue
        dirpath = img_dir + '/' + str(record).zfill(3) + '_' + alink.text
        print dirpath
        if '/' in alink.text:
            deal_error(dirpath + '\n')
            dirpath = img_dir + '/' + str(record).zfill(3) + '_' + alink.text.replace('/', 'or')
        if not os.path.isdir(dirpath):
            os.mkdir(dirpath)
        suburl = website + alink.get('href')
        # print suburl
        subreq = urllib2.Request(suburl, headers=headers)
        subpage = urllib2.urlopen(subreq, timeout=20)
        subcontents = subpage.read()
        # if record == 1:
        #     print subcontents
        subsoup = BeautifulSoup(subcontents, 'html.parser')
        imgs = subsoup.find_all('img', {'class': 'lazy'})
        cur = 0
        for img in imgs:
            cur += 1
            link = img.get('src')
            # print link
            filename = dirpath + '/%02d.jpg' % cur
            print filename
            try:
                urllib.urlretrieve(link, filename)  # download and save into the album folder
            except:
                deal_error(filename + "\n" + link + "\n")
        record += 1


def deal_error(string):
    fout = open("log_error.txt", "at")
    fout.write(string)
    fout.close()


record = 1
url = 'http://www.woyaogexing.com/touxiang/weixin/index.html'
website = 'http://www.woyaogexing.com'
crawl(url, website)

pageNum = 1
while True:
    pageNum += 1
    print "Requesting page ==================================================%d===================" % pageNum
    url = 'http://www.woyaogexing.com/touxiang/weixin/index_%d.html' % pageNum
    crawl(url, website)

# Problems encountered while running:
#   Connection reset by peer
#   Temporary failure in name resolution
#   The loop eventually hits a 404 NOT FOUND and the program terminates with an exception
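The comments above note that the paging loop eventually runs past the last listing page, hits a 404, and the script dies with an exception. A small change to the loop lets it stop cleanly instead. The sketch below reuses crawl(), website, and deal_error() from the script above; the decision to log and continue on transient network errors is an assumption, not part of the original code.

import socket

pageNum = 1
while True:
    pageNum += 1
    url = 'http://www.woyaogexing.com/touxiang/weixin/index_%d.html' % pageNum
    try:
        crawl(url, website)
    except urllib2.HTTPError as e:
        if e.code == 404:                           # ran past the last listing page
            print "No more pages, stopping at page %d" % pageNum
            break
        deal_error(url + '\n' + str(e) + '\n')      # other HTTP errors: log and keep going
    except (urllib2.URLError, socket.error) as e:
        deal_error(url + '\n' + str(e) + '\n')      # connection reset / DNS failure: log and keep going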
Python 3 code:
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import requests, os, threading, re

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/69.0.3497.100 Safari/537.36'}


def loge(msg):
    with open('error_log.txt', 'at+') as fout:
        try:
            fout.write(msg)
        except:
            fout.write('Warning: encoding error')


def save_img(url, path):
    with open(path, 'wb') as fout:
        # headers must be passed as a keyword argument, otherwise requests treats it as params
        response = requests.get(url, headers=headers).content
        fout.write(response)


def spider(url, website=''):
    path = os.path.join(os.getcwd(), 'images')
    if not os.path.exists(path):
        os.mkdir(path)
    response = requests.get(url, headers=headers).content
    soup = BeautifulSoup(response, 'html.parser')
    divs = soup.select('.txList')
    next_page = soup.find('div', {'class': 'page'})
    for div in divs:
        try:
            # strip characters that are illegal in file names
            title = re.sub(r'[/:*?"<>|\n.]', '_', div.a.get('title'))
            dir_name = os.path.join(path, title)
            if not os.path.exists(dir_name):
                os.mkdir(dir_name)
        except:
            loge('Error: ' + str(div))
            continue
        response = requests.get(website + str(div.a.get('href')), headers=headers).content
        soup = BeautifulSoup(response, 'html.parser')
        lis = soup.select('.tx-img')
        for li in lis:
            img_url = 'http:' + li.a.get('href')
            file_path = os.path.join(dir_name, img_url.split('/')[-1])
            # download each image in its own thread
            thread = threading.Thread(target=save_img, args=(img_url, file_path))
            thread.start()
    print(os.getpid(), url)
    if next_page:
        # follow the "next page" link in a new thread
        next_url = website + str(next_page.findAll('a')[-1].get('href'))
        thread = threading.Thread(target=spider, args=(next_url, website))
        thread.start()


def main():
    website = 'https://www.woyaogexing.com'
    url = 'https://www.woyaogexing.com/touxiang/weixin/'
    # the page structure changes after index_40
    spider(url, website)


if __name__ == '__main__':
    main()
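The comment in main() says the page layout changes after index_40, but spider() keeps spawning a new thread for every "next page" link it finds. One way to respect that limit is sketched below: instead of starting the thread directly in the `if next_page:` branch, spider() would call a small helper that parses the page number out of the next URL and stops past page 40. The helper name, the regex, and MAX_PAGE are assumptions based on the index_%d.html naming scheme.

MAX_PAGE = 40   # layout changes after index_40 (see the comment in main)

def follow_next(next_url, website):
    # spider() would call this instead of starting the thread itself
    match = re.search(r'index_(\d+)\.html', next_url)
    page_num = int(match.group(1)) if match else 1
    if page_num > MAX_PAGE:
        return                                      # stop before the layout change
    threading.Thread(target=spider, args=(next_url, website)).start()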