记录一下

81 阅读3分钟

1. 准备

www.python.org/downloads/w… 下载最新的Python安装包

记得勾选环境变量 安装.png

2. 测试Python是否安装成功

python -V

3. 安装PyCharm

安装一个汉化插件,然后新建一个 Python 项目,从 main.py 开始写

4. 开干

事情是这么个事情:需要一份 Excel,包含所有香烟的名称、类型、粗细、单价、条价、卖点口味,最好再带上每一种烟的图片。我一想,种类有点多啊!那就开干,用 Python 来爬数据:www.cnxiangyan.com/jiage/list_…

5. 写了一天,终于跑出来了

import requests
from bs4 import BeautifulSoup  # 将复杂HTML文档转换成一个复杂的树形结构
import time  # 时间相关
import os  # 操作系统标准库os
import re  # 正则相关依赖
import xlsxwriter  # excel相关依赖
import sys
import io
import urllib.request  # 接受URL请求的相关模块
from PIL import Image

# Some sites only answer requests that carry browser-like header fields,
# so these headers are sent with every list-page request.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Host': 'www.cnxiangyan.com',
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/13.0.5 Safari/605.1.15'
    ),
}


# Fetch the paginated list pages and hand each one to the row parser.
def fire(first_page=1, last_page=113, delay=5, timeout=30):
    """Crawl the brand list pages and write their rows into the worksheet.

    The list is paginated as ``list_1.html``, ``list_2.html``, ... (113 pages,
    1466 entries at the time the script was written). Each page is downloaded
    with the module-level ``headers`` and passed to ``get_poster_url``, which
    writes the rows.

    Args:
        first_page: Number of the first page to fetch (inclusive).
        last_page: Number of the last page to fetch (inclusive).
        delay: Seconds to pause between two consecutive page requests.
        timeout: Seconds to wait for the server before giving up on a page;
            without it a single stalled connection would hang the whole crawl.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched
            (including a timeout).
    """
    for page in range(first_page, last_page + 1):
        print("开始爬取第 {} 页".format(page))
        url = 'http://www.cnxiangyan.com/brand/list_{}.html'.format(page)
        response = requests.get(url, headers=headers, timeout=timeout)
        # The parser writes the rows itself; its return value is not needed here.
        get_poster_url(response.text, page)
        # Throttle between pages only; there is nothing to wait for after the last one.
        if page < last_page:
            time.sleep(delay)


# Strip line breaks and surrounding whitespace from a scraped text node.
def _clean_text(text):
    return text.replace('\n', '').replace('\r', '').strip()


# Parse one list page, write its rows to the worksheet and collect image URLs.
def get_poster_url(res, i, per_page=13):
    """Extract every product on a list page and write it into the worksheet.

    Args:
        res: HTML text of the list page.
        i: 1-based page number; used to compute the worksheet row so that
            rows continue across pages (row 0 is the header row).
        per_page: Number of rows reserved per page when computing the row
            offset.

    Returns:
        A list with one ``{'plist': <img src>}`` dict per product, the shape
        ``download_picture`` iterates over. (Previously the list was never
        filled, so the function always returned an empty list.)

    Raises:
        IndexError: If the page has no ``<ul>`` or an item lacks the expected
            ``show_nx`` / ``show_jg`` spans.
    """
    content = BeautifulSoup(res, "html.parser")  # parse the whole page
    items = content.find_all('ul')[0].find_all('li')
    # Row of the last entry on the previous page; pages below 2 start at 0.
    cursor = per_page * max(i - 1, 0)
    picture_list = []
    for item in items:
        cursor += 1
        # Look each span group up once instead of once per field.
        info_spans = item.find_all('span', attrs={'class': 'show_nx'})
        price_spans = item.find_all('span', attrs={'class': 'show_jg'})
        taste = _clean_text(info_spans[1].text)  # tar content
        name = _clean_text(item.find('a').text)  # product name
        types = _clean_text(info_spans[0].text)  # product type
        price = price_spans[0].text  # price per pack
        total = price_spans[1].text  # price per carton
        plist = item.find('img')['src']
        writeExcel(row=cursor, pic=plist, name=name, types=types, price=price, taste=taste,
                   total=total)
        picture_list.append({'plist': plist})
    return picture_list


def download_picture(pic_l, timeout=30):
    """Download every picture in ``pic_l`` into the local ``picture`` folder.

    Args:
        pic_l: Iterable of dicts whose ``'plist'`` value is a protocol-relative
            image URL such as
            ``//img1.cnxiangyan.com/2022/0106/61d69bd1eafc6.png?x-image-process=style/sy``.
        timeout: Seconds to wait for each image request.

    The query string is dropped and the last path segment is used as the file
    name. Previously the image was requested but never written to disk.

    NOTE(review): writeExcel builds image URLs by prefixing a fixed https host
    to the same ``src`` value, while this function prefixes only ``http:`` —
    confirm which form the site currently serves.
    """
    os.makedirs('picture', exist_ok=True)
    for item in pic_l:
        # Prepend the scheme, then cut everything from the first '?' onwards.
        url = "http:{}".format(item['plist']).split('?', 1)[0]
        file_name = url.rsplit('/', 1)[-1]
        if not file_name:
            # URL ends with '/', so there is no usable file name.
            continue
        response = requests.get(url, timeout=timeout)
        # os.path.join keeps the path portable (the old code hard-coded '\\').
        with open(os.path.join('picture', file_name), 'wb') as f:
            f.write(response.content)


# Write one row (header or product) into the module-level worksheet.
def writeExcel(row=0, pic='', name='', types='', price='', taste='', total=''):
    """Write the header row or one product row into the worksheet.

    Uses the module-level ``workbook`` and ``worksheet`` objects.

    Args:
        row: 0-based worksheet row. Row 0 writes the column titles and sets
            the column widths; any other row writes one product.
        pic: Image path appended to the upload host to build the image URL;
            an empty string means no image.
        name: Product name (column B).
        types: Product type (column C).
        price: Price per pack (column E).
        taste: Tar content (column D).
        total: Price per carton (column F).

    A picture that cannot be downloaded or decoded is reported and skipped so
    that one bad image does not abort the whole export; the text columns are
    written either way.
    """
    str_format = workbook.add_format({
        'align': 'center',  # horizontally centred
        'valign': 'vcenter',  # vertically centred
    })

    if row == 0:
        worksheet.set_column('A:A', 30)  # wider column for the pictures
        worksheet.set_column('B:F', 20)
        titles = ('图片', '名称', '类型', '焦油量', '单价', '条价')
        for col, title in enumerate(titles):
            worksheet.write(row, col, title, str_format)
        return

    fixed_size = 100
    worksheet.set_row(row, fixed_size)  # row height matches the scaled picture
    if pic != "":
        url = 'https://cnxiangyan-upload.xiazai63.com{}'.format(pic)
        try:
            # Bounded wait, and the response is closed as soon as it is read.
            with urllib.request.urlopen(url, timeout=30) as response:
                image_data = io.BytesIO(response.read())
            height = Image.open(image_data).size[1]
        except OSError as err:
            # Network failures and unreadable image data both derive from OSError.
            print('图片获取失败,已跳过: {} ({})'.format(url, err))
        else:
            # Shrink pictures taller than the row; never enlarge small ones.
            scale = fixed_size / height if height > fixed_size else 1
            worksheet.insert_image('A{}'.format(row + 1), url,
                                   {'image_data': image_data, 'x_scale': scale, 'y_scale': scale})
    worksheet.write(row, 1, name, str_format)
    worksheet.write(row, 2, types, str_format)
    worksheet.write(row, 3, taste, str_format)
    worksheet.write(row, 4, price, str_format)
    worksheet.write(row, 5, total, str_format)


# Script entry point. Everything sits under the guard so that importing this
# module no longer starts the crawl.
if __name__ == '__main__':
    # Command-line arguments are captured but not used by the code below.
    job_city_school = sys.argv[1:]
    try:
        # The context manager closes (and thereby saves) the workbook even when
        # the crawl fails part-way, so rows collected so far are not lost.
        # `workbook` and `worksheet` stay module-level names because writeExcel
        # reads them as globals.
        with xlsxwriter.Workbook('tian.xlsx') as workbook:
            worksheet = workbook.add_worksheet()
            writeExcel(row=0)  # write the header row first
            fire()
        print('所有条目写入完成..')
    except ValueError:
        print('输入文本有误 ')

6. 纪念一下成果 哈哈哈哈哈

pythone.png

excel.png