1. 准备
www.python.org/downloads/w… 下载最新的Python安装包
安装时记得勾选“添加到环境变量(PATH)”的选项
2. 测试Python是否安装成功
python -V
3. 安装PyCharm
安装一个汉化插件,然后新建一个 Python 项目,从 main.py 开始写
4. 开干
事情是这么个事情:需要一个 Excel,包括所有烟的名称、类型、粗细、单价、条价、卖点口味,最好再带上每一种烟的图片。我一想,这种类有点多啊!那就开干,用 Python 来爬数据:www.cnxiangyan.com/jiage/list_…
5. 写了一天,终于跑出来了
import requests
from bs4 import BeautifulSoup # 将复杂HTML文档转换成一个复杂的树形结构
import time # 时间相关
import os # 操作系统标准库os
import re # 正则相关依赖
import xlsxwriter # excel相关依赖
import sys
import io
import urllib.request # 接受URL请求的相关模块
from PIL import Image
# Some sites only answer requests that carry browser-like header fields,
# so every list-page request is sent with this fixed set.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Host': 'www.cnxiangyan.com',
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/13.0.5 Safari/605.1.15'
    ),
}
def fire(first_page=1, last_page=113, delay=5, timeout=30):
    """Crawl the paginated brand list and hand every page to the parser.

    The list pages follow the pattern
    ``http://www.cnxiangyan.com/brand/list_<n>.html`` with n = 1, 2, 3, ...
    When the script was written the site had 113 pages (1466 entries),
    which is why ``last_page`` defaults to 113.

    Args:
        first_page: Number of the first list page to fetch (inclusive).
        last_page: Number of the last list page to fetch (inclusive).
        delay: Seconds to sleep after each page, to throttle the crawl.
        timeout: Seconds to wait for the server before the request is
            aborted; without it a stalled connection would block forever.
    """
    for page in range(first_page, last_page + 1):
        print("开始爬取第 {} 页".format(page))
        url = 'http://www.cnxiangyan.com/brand/list_{}.html'.format(page)
        res = requests.get(url, headers=headers, timeout=timeout).text
        # Parsing also writes the rows of this page into the worksheet.
        get_poster_url(res, page)
        time.sleep(delay)
def _clean_text(text):
    """Drop newlines and carriage returns, then trim surrounding whitespace."""
    return text.replace('\n', '').replace('\r', '').strip()


def get_poster_url(res, i):
    """Parse one list page and write each entry to the worksheet.

    Every ``<li>`` of the first ``<ul>`` in the page is treated as one
    entry. For each entry the name, type, tar content, unit price, carton
    price and image ``src`` are extracted and passed to ``writeExcel``.

    Args:
        res: HTML text of the list page.
        i: 1-based page number; used to compute the worksheet row offset.

    Returns:
        A list with one ``{'plist': <img src>}`` dict per entry, in page
        order (the shape ``download_picture`` iterates over).
    """
    # NOTE(review): the row offset assumes exactly 13 entries on every
    # preceding page -- confirm against the site.
    rows_per_page = 13

    content = BeautifulSoup(res, "html.parser")
    entries = content.find_all('ul')[0].find_all('li')
    cursor = rows_per_page * max(i - 1, 0)
    picture_list = []
    for entry in entries:
        cursor += 1
        # Look each span group up once instead of once per field.
        info_spans = entry.find_all('span', attrs={'class': 'show_nx'})
        price_spans = entry.find_all('span', attrs={'class': 'show_jg'})

        types = _clean_text(info_spans[0].text)   # type
        taste = _clean_text(info_spans[1].text)   # tar content
        name = _clean_text(entry.find('a').text)  # name
        price = price_spans[0].text               # unit price
        total = price_spans[1].text               # carton price
        plist = entry.find('img')['src']

        writeExcel(row=cursor, pic=plist, name=name, types=types,
                   price=price, taste=taste, total=total)
        # Record the image so the returned list is actually populated.
        picture_list.append({'plist': plist})
    return picture_list
def download_picture(pic_l):
    """Download every listed image into the local ``picture`` directory.

    Args:
        pic_l: Iterable of dicts whose ``'plist'`` value is an image
            ``src``. Per the original note the scraped value looks like
            ``//img1.cnxiangyan.com/2022/0106/61d69bd1eafc6.png?x-image-process=style/sy``,
            so ``http:`` is prepended and everything from ``?`` on is cut.

    Each image is stored under the last path segment of its URL, e.g.
    ``picture/61d69bd1eafc6.png``. Responses with an error status and
    URLs without a file name are skipped.
    """
    os.makedirs('picture', exist_ok=True)
    for item in pic_l:
        # Strip the query string; only the bare image URL is needed.
        url = "http:{}".format(item['plist']).split('?', 1)[0]
        file_name = os.path.basename(url)
        if not file_name:
            continue
        pic = requests.get(url, timeout=30)
        if not pic.ok:
            # Do not save an error page under an image file name.
            print('图片下载失败: {} ({})'.format(url, pic.status_code))
            continue
        with open(os.path.join('picture', file_name), 'wb') as f:
            f.write(pic.content)
def writeExcel(row=0, pic='', name='', types='', price='', taste='', total=''):
    """Write the header row or one data row into the module-level worksheet.

    Uses the module-level ``workbook`` and ``worksheet`` objects.

    Args:
        row: 0-based worksheet row. Row 0 writes the column headers and
            sets the column widths; any other row writes one entry.
        pic: Image path appended to the image host; ``''`` means no image.
        name: Entry name (column B).
        types: Entry type (column C).
        price: Unit price (column E).
        taste: Tar content (column D).
        total: Carton price (column F).

    A failed image download or an unreadable image is reported and
    skipped, so one bad image does not abort the whole run; the text
    columns of that row are still written.
    """
    # Centre cell content horizontally and vertically.
    str_format = workbook.add_format({'align': 'center', 'valign': 'vcenter'})

    if row == 0:
        worksheet.set_column('A:A', 30)  # image column
        worksheet.set_column('B:F', 20)
        titles = ('图片', '名称', '类型', '焦油量', '单价', '条价')
        for col, title in enumerate(titles):
            worksheet.write(row, col, title, str_format)
        return

    fixed_size = 100
    worksheet.set_row(row, fixed_size)  # row height matches the image height cap

    for col, value in enumerate((name, types, taste, price, total), start=1):
        worksheet.write(row, col, value, str_format)

    if pic == "":
        return

    url = 'https://cnxiangyan-upload.xiazai63.com{}'.format(pic)
    try:
        # Close the HTTP response deterministically and never wait forever.
        with urllib.request.urlopen(url, timeout=30) as response:
            image_data = io.BytesIO(response.read())
        height = Image.open(image_data).size[1]
    except OSError as err:
        # URLError, socket timeouts and PIL's unreadable-image error are
        # all OSError subclasses.
        print('第 {} 行图片获取失败,已跳过: {}'.format(row, err))
        return

    # Shrink images taller than the row; never enlarge smaller ones.
    scale = fixed_size / height if height > fixed_size else 1
    worksheet.insert_image(
        'A{}'.format(row + 1),
        url,
        {'image_data': image_data, 'x_scale': scale, 'y_scale': scale},
    )
if __name__ == '__main__':
    # Command-line arguments are captured but not used by the script yet.
    job_city_school = sys.argv[1:]
    try:
        # ``workbook`` and ``worksheet`` stay module-level names because
        # writeExcel reads them as globals.
        workbook = xlsxwriter.Workbook('tian.xlsx')
        worksheet = workbook.add_worksheet()
        try:
            writeExcel(row=0)  # header row first
            fire()
        finally:
            # The file is only written on close(); close even when the
            # crawl fails so the rows collected so far are kept.
            workbook.close()
        print('所有条目写入完成..')
    except ValueError:
        print('输入文本有误 ')