# python爬虫大众点评 — Python crawler for Dazhong Dianping reviews
# (originally published as a blog post: 535 views, 5 min read)
# coding=utf-8
# -*- coding: utf-8 -*-
# 大众点评评论
# http://www.dianping.com/shop/HatnCwbSYxOxus7c/review_all
import requests
import random
import re
import csv
from fake_useragent import UserAgent
from lxml import etree
import time
import xlrd







ua = UserAgent(verify_ssl=False)
ua = ua.random
print(ua)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53',
    'Referer': 'http://www.dianping.com/shop/HatnCwbSYxOxus7c/review_all',
    'Cookie': 'fspop=test; _lxsdk_cuid=18108b6f907c8-0fada7b4e64b84-4c647e53-e1000-18108b6f907c8; _lxsdk=18108b6f907c8-0fada7b4e64b84-4c647e53-e1000-18108b6f907c8; _hc.v=3393bbac-f15d-65a7-5ed9-dcd7d4d70e91.1653708618; s_ViewType=10; dplet=de725ab40f27508d89d46e5b958c61fb; dper=af853f51104eba198a196bd5c82dc4d19abe67253a67ceffa6ba297ee135c0e2025636327ad7bca469ca1abedaa1510315ab39041177dc59d8f21f91a5527d142efcedc604f77eb805f4362ac038a2463305afc303004aadd85e0ec20e464ca3; ua=dpuser_4113673911; ctu=ec326d55805904185d4cd84daf56afa79f631c389f1dea42a40a64e1a0760c6c; cy=342; cye=macau; aburl=1; Hm_lvt_dbeeb675516927da776beeb1d9802bd4=1653711281; ll=7fd06e815b796be3df069dec7836c3df; _lx_utm=utm_source%3Dbing%26utm_medium%3Dorganic; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1653722024,1653726865,1653728377,1653782003; WEBDFPID=v817zy8037vu53yxz9405u15u968593v8189w8z9vww979587zv57934-1653868622951-1653782220372MWQAGQSfd79fef3d01d5e9aadc18ccd4d0c95079562; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1653782370; _lxsdk_s=1810d16b9ad-3bc-f91-c30%7C%7C1143',
}
ip_list = [{'http': 'http://112.80.248.75:80'}]
ip = ip_list.pop(random.randint(0, len(ip_list) - 1))

print(f"{'正在使用的ip'}{ip}")



# 读取列表信息
worksheet = xlrd.open_workbook('澳门数据.xls')
sheet_names= worksheet.sheet_names()
cols=[]
for sheet_name in sheet_names:
    sheet = worksheet.sheet_by_name(sheet_name)
    rows = sheet.nrows # 获取行数
    cols = sheet.ncols # 获取列数,尽管没用到
    all_content = []
    cols = sheet.col_values(1) # 获取第二列内容, 数据格式为此数据的原有格式(原:字符串,读取:字符串;  原:浮点数, 读取:浮点数)
    # print(cols)打印列信息
    cols.remove('name')

for i in cols:
    # print(i)
    # 循环列表景点获取pinglun
    site_name = i
    # 断点测试
    print(site_name)
    url = f'https://www.dianping.com/search/keyword/342/0_{site_name}'
    # 断点测试
    print(url)
    nameres = requests.get(url,headers=headers,proxies=ip_list)
    a = re.findall(f'<h4>{site_name}</h4>', nameres.text, re.S)
    if a:
        print(f"找到{site_name}正在爬取数据==========================================")
    else:
        print(f'没有找到{site_name}正在跳过==========================================')
        continue
    namehtml = etree.HTML(nameres.text)
    nameurl = namehtml.xpath('//*[@id="shop-all-list"]/ul/li/div[1]/a/@href')[0]
    print(nameurl)
    shop = str(nameurl).split('/')[-1].replace("'",'').replace(']','')
    # 断点测试
    print(shop)

    time.sleep(3)

    # 全部评价
    page = 1
    index = True
    url =f'http://www.dianping.com/shop/{shop}/review_all/p{page}'
    # 断点测试
    print(url)
    # page = requests.get(url=url,headers=headers,proxies=ip_list)
    # page = etree.HTML(page.text)
    # page = int(page.xpath('//*[@id="review-list"]/div[2]/div[3]/div[3]/div[3]/div/a[9]/text()')[0])
    # page = int(page.xpath('//*[@id="review-list"]/div[2]/div[3]/div[3]/div[3]/div/a[6]')
    # print("共有"+str(page)+"页")

    while(index):
        time.sleep(5)
        url = f'http://www.dianping.com/shop/{shop}/review_all/p{page}'
        res = requests.get(url=url, headers=headers, proxies=ip_list)
        print(f"{ip}{'可以使用'}")
        print(f"{'返回状态码:'}{res}")

        cssjiami = etree.HTML(res.text).xpath(
            '//*[@id="review-list"]/div[2]/div[3]/div[3]/div[2]/ul/li/div/div[contains(@class,"review-words")]/svgmtsi/@class')
        # print(cssjiami)
        # print(len(cssjiami))

        res = res.text
        a = re.findall('<p class="not-found-words">抱歉!页面无法访问......</p>', res, re.S)
        b = re.findall("<div class='logo' id='logo'>验证中心</div>", res, re.S)
        c = re.findall("点评和打分都将是其他网友的参考依据,并影响该商户评价。", res, re.S)
        d = re.findall("下一页", res, re.S)
        if a:
            print('IP被封,死心了吧')
            temp = input("请输入内容回车继续")
            if temp == 'text':
                print(res)
        elif b:
            print('去浏览器进行手工验证')
            temp = input()
            continue
        else:
            # with open(str(range(1,6)) + '.html', 'w', encoding='utf-8') as file:
            #     file.write(res)
            print(url + '已经下载完毕')
        if d:
            page+=1
        else:
            index = False

        csshref = 'http://s3plus.meituan.net/v1/' + res.split('href="//s3plus.meituan.net/v1/')[1].split('.css">')[0] + '.css'
        print(f'css链接:   {csshref}')
        cssText = requests.get(csshref).text
        svgHref = 'http://' + cssText.split('url(//')[2].split('.svg')[0] + '.svg'
        print(f'svg链接:   {svgHref}')
        svgres = requests.get(svgHref).text
        mimicon = []
        for i in cssjiami:
            cssxzuobiao = re.compile(f"{i}{{background:-(.*?).0px")
            cssyzuobiao = re.compile(f"{i}{{background:.*?px -(.*?).0px;}}")
            x = int(''.join(cssxzuobiao.findall(cssText)))
            y = int(''.join(cssyzuobiao.findall(cssText))) + 23
            # print(x,y)
            svgzz = re.compile(f'<text x="0" y="{y}">(.*?)</text>')
            svgcon = svgzz.findall(svgres)
            svgcon = str(svgcon).replace("'", '').replace('[', '').replace(']', '')
            mimicon.append(svgcon[int(x / 14):int(x / 14 + 1)])
        # print(mimicon)
        # print(len(mimicon))

        # print(cssText)

        heightDic = {}
        ex = '<path id="(.*?)" d="M0 (.*?) H600"/>'
        for hei in re.compile(ex).findall(svgres):
            heightDic[hei[0]] = hei[1]
        # print(f'heightDic: {str(heightDic)[:500]}')
        wordDic = {}
        ex = '<textPath xlink:href="#(.*?)" textLength="(.*?)">(.*?)</textPath>'
        for row in re.compile(ex).findall(svgres):
            for word in row[2]:
                wordDic[((row[2].index(word) + 1) * -14 + 14, int(heightDic[row[0]]) * -1 + 23)] = word
        # print(f'wordDic: {str(wordDic)[:500]}')
        cssDic = {}
        ex = '.(.*?){background:(.*?).0px (.*?).0px;}'
        for css in re.compile(ex).findall(cssText):
            cssDic[css[0]] = (int(css[1]), int(css[2]))
        # print(f'cssDic: {str(cssDic)[:500]}')
        decryptDic = {'<svgmtsi class="' + i + '"></svgmtsi>': wordDic.get(cssDic[i], '?') for i in cssDic}
        # print(f'decryptDic: {str(decryptDic)[:500]}')
        for key in decryptDic:
            res = res.replace(key, decryptDic[key])
        print('-' * 100)

        tree = etree.HTML(res)
        names = []
        dates = []
        scores = []
        comments = ''
        for li in tree.xpath('//div[@class="reviews-items"]/ul/li'):
            name = li.xpath('.//a[@class="name"]/text()')[0].strip()
            date = li.xpath('.//span[@class="time"]/text()')[0].strip()
            try:
                score = '.'.join(li.xpath('.//div[@class="review-rank"]/span[1]/@class')[0].split()[1][-2:])
            except :
                try:
                    score = '.'.join(li.xpath('//*[@id="review-list"]/div[2]/div[3]/div[3]/div[2]/ul/li[1]/div/div[2]/span[2]/span[1]/text()')[0])
                except:
                    score='null'
            comment = ''.join(li.xpath('.//div[contains(@class,"review-words")]/text()')).replace('\n', '').strip()
            names.append(name)
            dates.append(date)
            scores.append(score)
            comments += 'wys' + comment
        print('-' * 100)

        for i in mimicon:
            comments = comments.replace('?', i, 1).replace('[', '').replace('{', '').replace('}', '')
        comments = comments.split('wys')
        comments.pop(0)
        # print(text)

        print(names)
        print(dates)
        print(scores)
        print(comments)

        csv_file = open(f'C:/Users/29836/Desktop/大众点评/data/{site_name}.csv', 'a+', newline='', encoding='GB18030')
        # 调用open()函数打开csv文件,传入参数:文件名“demo.csv”、写入模式“w”、newline=''、encoding='gbk'
        writer = csv.writer(csv_file)
        # 用csv.writer()函数创建一个writer对象。
        writer.writerow(['名字', '时间', '评分', '评论'])
        # 调用writer对象的writerow()方法,可以在csv文件里写入一行文字 “电影”和“豆瓣评分”。
        for i in range(0, len(names)):
            writer.writerow([names[i], dates[i], scores[i], comments[i]])
        # 在csv文件里写入一行文字 “熊出没之夺宝熊兵”和“10.0”
        csv_file.close()
        # 关闭文件

        time.sleep(3)
    print("文件名"+site_name+'---------------------------------------------------------------')