Python Study Notes (4): Scraping Website Data (Static and Dynamic)
1. Key Points
- `threading`: standard-library module providing Python 3's high-level threading API
- `requests`: third-party library for making HTTP requests (`pip install requests`)
- `os`: standard-library module for file and path operations
- `openpyxl`: open-source third-party library for exporting Excel files, installed manually with `pip install openpyxl`
- `BeautifulSoup`: HTML parsing and extraction library from the `bs4` package (`pip install beautifulsoup4`)
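A quick way to confirm the third-party packages are installed before running anything (a minimal check; none of the scripts below depend on it):

```python
# Standard library: always available
import threading, os

# Third-party: these imports fail until you pip install
# requests / openpyxl / beautifulsoup4
import requests
import openpyxl
from bs4 import BeautifulSoup

print(requests.__version__, openpyxl.__version__)
```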
2. Using the Basic Functions
2.1 Scraping Static Website Data (requires familiarity with HTML tag and element selectors)
- After opening the site, use the browser's inspect-element tool and look at the Elements panel
- Find the part of the page you want to scrape and note down the outermost tag or class name of that content (see the short sketch after this list)
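A minimal sketch of that selection step on an inline HTML snippet (the tag and class names mirror the blog page scraped below, but the HTML itself is made up for illustration):

```python
from bs4 import BeautifulSoup

# stand-in for the HTML you noted in the Elements panel
html = '''
<header class="article-header">
  <a class="article-title" href="/blog/post-1/">First Post</a>
</header>
'''

soup = BeautifulSoup(html, 'html.parser')
# find_all filters by tag name plus class, just like the script below
for header in soup.find_all('header', class_='article-header'):
    link = header.find('a', class_='article-title')
    print(link.string, link.get('href'))
```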
```python
import requests, sys
from bs4 import BeautifulSoup


class downloader(object):
    def __init__(self):
        self.server = 'http://bodboy.gitee.io/'       # site root, used to build absolute URLs
        self.target = 'http://bodboy.gitee.io/blog/'  # blog index page to scrape
        self.names = []  # article titles
        self.urls = []   # absolute article URLs
        self.nums = 0    # number of articles found

    def getUrls(self):
        """Fetch the index page and collect each article's title and URL."""
        req = requests.get(url=self.target)
        req.encoding = 'utf-8'
        html = req.text
        div = BeautifulSoup(html, 'html.parser')  # name the parser explicitly
        # each article lives in a <header class="article-header"> block
        div_list = div.find_all('header', class_='article-header')
        self.nums = len(div_list)
        for each in div_list:
            list_url = each.find_all('a', class_='article-title')
            url = list_url[0]
            # hrefs on the page are relative, so prepend the site root
            self.urls.append(self.server + url.get('href'))
            self.names.append(url.string)

    def write(self, name, path, text):
        """Append one record to the output file."""
        with open(path, 'a', encoding='utf-8') as f:
            f.writelines(text)
            f.write('\n\n')


if __name__ == "__main__":
    dl = downloader()
    dl.getUrls()
    print("开始下载文件....")
    for i in range(dl.nums):
        dl.write(dl.names[i], '博客文件.txt', dl.names[i] + '\r' + dl.urls[i])
        # i / dl.nums is a 0-1 fraction, so scale it to a percentage
        sys.stdout.write('已下载:%.3f%%' % (i / dl.nums * 100) + '\r')
        sys.stdout.flush()
    print('文件下载完成')
```
- Run the file
2.2 Scraping Dynamic Website Data (Scraping API Data)
- In essence this just means calling an API, taking the data it returns, and presenting it however you want.
For example, the script below pulls data from one of a site's API endpoints, exports the chosen fields, and downloads the pictures.
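Before the full script, a minimal sketch of calling a JSON API with requests (the endpoint and params are the ones used below; the exact response layout is an assumption inferred from how the script reads it):

```python
import requests

# endpoint and query parameters reused from the script below
url = 'http://api.newibao.com/web/essay/publicEssayList'
resp = requests.get(url, params={'page': 1, 'size': 2}, timeout=10)

data = resp.json()   # parse the JSON body into a dict
print(data.keys())   # inspect the top-level keys first
# the script below expects the records under data['data']['list']
```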
```python
import threading
import requests
import os
from openpyxl import Workbook


class capturePc():
    def __init__(self):
        self.base_url = 'http://api.newibao.com/web/essay/publicEssayList'
        self.params = {
            'page': 1,
            'size': 64
        }
        # pretend to be a normal browser so the API accepts the request
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate'
        }

    def get_list(self):
        """Call the API and return the record list, or None when there is no data."""
        req = requests.get(url=self.base_url, headers=self.headers, params=self.params)
        data = req.json()
        try:
            if data['data']['list']:
                return data['data']['list']
        except (KeyError, TypeError):
            print("no data found")
        return None

    def get_img_list(self):
        """Collect every picture URL from the records."""
        res = self.get_list()
        imgUrls = []
        if res:
            for e in res:
                if e['picUrl']:
                    imgUrls += e['picUrl']  # picUrl holds a list of URLs
        print("图片列表为:", imgUrls)
        return imgUrls

    def get_excel_data(self):
        """Pick out the fields that become one spreadsheet row per record."""
        res = self.get_list()
        excel_list = []
        if res:
            for e in res:
                cell = [e['addTime'], e['brief'], e['columnName'], e['details'], e['name'], e['updateTime']]
                excel_list.append(cell)
        print(excel_list)
        return excel_list

    def download_img(self, name):
        if not os.path.exists(name):
            os.mkdir(name)
            print("文件夹{}创建成功".format(name))
        imgList = self.get_img_list()
        names = 0
        if imgList:
            for i in imgList:
                names += 1
                # one thread per image so the downloads run concurrently
                threading.Thread(target=self.download, args=(names, i, name)).start()

    def download(self, name, image_url, path):
        print('开始下载:', name)
        content = requests.get(image_url).content
        path = '%s/%s.jpg' % (path, name)
        with open(path, 'wb') as f:
            f.write(content)
        print('下载完成', name)

    def export_excel(self):
        wb = Workbook()
        data_fileName = '蓝海图文数据.xlsx'
        ws = wb.active
        header = ['创建时间', '标题', '分类', '详情', '名称', '更新时间']
        # write the header into row 1 (openpyxl columns are 1-based)
        for col, title in enumerate(header, start=1):
            ws.cell(row=1, column=col, value=title)
        listIndex = self.get_excel_data()
        if listIndex:
            for row_data in listIndex:
                ws.append(row_data)  # appends below the last filled row
        wb.save(filename=data_fileName)
        print("写入成功")


if __name__ == "__main__":
    a = capturePc()
    a.export_excel()
    # a.download_img('imgs')  # also download the pictures into ./imgs
```
- Run results
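One possible refinement: `download_img` starts an unbounded thread per image, while a `ThreadPoolExecutor` caps concurrency. A sketch reusing the `download(name, image_url, path)` method from the class above (the worker count and folder name are arbitrary choices):

```python
import os
from concurrent.futures import ThreadPoolExecutor

def download_all(pc, folder='imgs'):
    # pc is a capturePc instance; reuse its image list and download method
    os.makedirs(folder, exist_ok=True)
    img_urls = pc.get_img_list()
    with ThreadPoolExecutor(max_workers=8) as pool:  # at most 8 downloads at once
        for idx, url in enumerate(img_urls, start=1):
            pool.submit(pc.download, idx, url, folder)
    # the with-block waits here until every submitted download finishes
```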