import requests
import random
import re
import csv
from fake_useragent import UserAgent
from lxml import etree
import time
import xlrd
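# Scrape Dianping reviews for every site listed in an Excel sheet: search for
# each site, resolve its shop id from the results page, then page through
# /review_all while decoding the CSS/SVG font obfuscation that hides the
# review text.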
# Pick a random User-Agent for this run. Note: recent versions of
# fake_useragent no longer accept verify_ssl; drop the argument there.
ua = UserAgent(verify_ssl=False)
ua = ua.random
print(ua)
headers = {
    'User-Agent': ua,  # reuse the random UA generated above
    'Referer': 'http://www.dianping.com/shop/HatnCwbSYxOxus7c/review_all',
    'Cookie': 'fspop=test',  # truncated in the source; paste a full cookie string here
}
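# NOTE: review pages generally require a valid logged-in Cookie; with a stale
# one the script will keep landing on the verification page handled below.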
# Minimal proxy pool; requests expects a dict like {'http': 'http://host:port'}.
ip_list = [{'http': 'http://112.80.248.75:80'}]
ip = ip_list.pop(random.randint(0, len(ip_list) - 1))
print(f"Using proxy: {ip}")
# The workbook's second column (index 1) holds the site names to search for.
worksheet = xlrd.open_workbook('澳门数据.xls')
sheet_names = worksheet.sheet_names()
cols = []
for sheet_name in sheet_names:
    sheet = worksheet.sheet_by_name(sheet_name)
    cols = sheet.col_values(1)
cols.remove('name')  # drop the header cell
for site_name in cols:
    print(site_name)
    url = f'https://www.dianping.com/search/keyword/342/0_{site_name}'
    print(url)
    nameres = requests.get(url, headers=headers, proxies=ip)
    # the search page wraps each hit's name in an <h4> tag
    if re.findall(f'<h4>{site_name}</h4>', nameres.text, re.S):
        print(f"Found {site_name}, scraping ==========================================")
    else:
        print(f"{site_name} not found, skipping ==========================================")
        continue
    namehtml = etree.HTML(nameres.text)
    nameurl = namehtml.xpath('//*[@id="shop-all-list"]/ul/li/div[1]/a/@href')[0]
    print(nameurl)
    shop = nameurl.split('/')[-1]  # shop id is the last path segment
    print(shop)
    time.sleep(3)
    page = 1
    index = True
    url = f'http://www.dianping.com/shop/{shop}/review_all/p{page}'
    print(url)
    while index:
        time.sleep(5)
        url = f'http://www.dianping.com/shop/{shop}/review_all/p{page}'
        res = requests.get(url=url, headers=headers, proxies=ip)
        print(f"Proxy {ip} is working")
        print(f"Status: {res}")  # prints the Response repr, e.g. <Response [200]>
        # collect the CSS classes of every obfuscated glyph on this page
        cssjiami = etree.HTML(res.text).xpath(
            '//*[@id="review-list"]/div[2]/div[3]/div[3]/div[2]/ul/li/div/div[contains(@class,"review-words")]/svgmtsi/@class')
        res = res.text
        # page-state markers (the literals must stay in Chinese to match the page)
        a = re.findall('<p class="not-found-words">抱歉!页面无法访问......</p>', res, re.S)  # "page unavailable": IP blocked
        b = re.findall("<div class='logo' id='logo'>验证中心</div>", res, re.S)  # "verification center": captcha wall
        c = re.findall("点评和打分都将是其他网友的参考依据,并影响该商户评价。", res, re.S)  # marker of a normal review page (unused)
        d = re.findall("下一页", res, re.S)  # a "next page" link exists
        if a:
            print('IP blocked, give it up')
            temp = input("Press Enter to continue")
            if temp == 'text':
                print(res)
            continue  # retry this page instead of parsing an error page
        elif b:
            print('Complete the manual verification in a browser')
            input()
            continue
        else:
            print(url + ' downloaded')
            if d:
                page += 1
            else:
                index = False
        # the review page links one stylesheet on s3plus.meituan.net that holds
        # the glyph offsets, and that stylesheet in turn links the SVG sprite
        csshref = 'http://s3plus.meituan.net/v1/' + res.split('href="//s3plus.meituan.net/v1/')[1].split('.css">')[0] + '.css'
        print(f'css link: {csshref}')
        cssText = requests.get(csshref).text
        svgHref = 'http://' + cssText.split('url(//')[2].split('.svg')[0] + '.svg'
        print(f'svg link: {svgHref}')
        svgres = requests.get(svgHref).text
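        # Sprite layout assumed here: every glyph is 14px wide, so the x offset
        # divided by 14 is the character's index within its <text> row, and the
        # y offset (plus a 23px baseline shift) selects the row itself.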
        mimicon = []  # decoded glyphs, in the order they appear on the page
        for i in cssjiami:
            # pull the x/y background offsets for this class out of the stylesheet
            cssxzuobiao = re.compile(f"{i}{{background:-(.*?).0px")
            cssyzuobiao = re.compile(f"{i}{{background:.*?px -(.*?).0px;}}")
            x = int(''.join(cssxzuobiao.findall(cssText)))
            y = int(''.join(cssyzuobiao.findall(cssText))) + 23
            svgzz = re.compile(f'<text x="0" y="{y}">(.*?)</text>')
            svgcon = svgzz.findall(svgres)
            svgcon = str(svgcon).replace("'", '').replace('[', '').replace(']', '')
            mimicon.append(svgcon[x // 14:x // 14 + 1])
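        # Some sprites use a second layout: <path> elements define horizontal
        # baselines and <textPath> elements attach a row of characters to each
        # baseline. Build lookup tables for that variant as well.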
        heightDic = {}  # path id -> baseline height
        ex = '<path id="(.*?)" d="M0 (.*?) H600"/>'
        for hei in re.compile(ex).findall(svgres):
            heightDic[hei[0]] = hei[1]
        wordDic = {}  # (x, y) sprite offset -> character
        ex = '<textPath xlink:href="#(.*?)" textLength="(.*?)">(.*?)</textPath>'
        for row in re.compile(ex).findall(svgres):
            # enumerate() rather than str.index() so repeated characters in a
            # row keep their own positions
            for idx, word in enumerate(row[2]):
                wordDic[(idx * -14, int(heightDic[row[0]]) * -1 + 23)] = word
        cssDic = {}  # class name -> (x, y) background offset
        ex = '.(.*?){background:(.*?).0px (.*?).0px;}'
        for css in re.compile(ex).findall(cssText):
            cssDic[css[0]] = (int(css[1]), int(css[2]))
        # substitute every <svgmtsi> placeholder in the page source with its
        # decoded character; unknown offsets fall back to '?'
        decryptDic = {'<svgmtsi class="' + i + '"></svgmtsi>': wordDic.get(cssDic[i], '?') for i in cssDic}
        for key in decryptDic:
            res = res.replace(key, decryptDic[key])
        print('-' * 100)
        tree = etree.HTML(res)
        names = []
        dates = []
        scores = []
        comments = ''
        for li in tree.xpath('//div[@class="reviews-items"]/ul/li'):
            name = li.xpath('.//a[@class="name"]/text()')[0].strip()
            date = li.xpath('.//span[@class="time"]/text()')[0].strip()
            try:
                # star rating is encoded in the class name, e.g. "sml-str40" -> "4.0"
                score = '.'.join(li.xpath('.//div[@class="review-rank"]/span[1]/@class')[0].split()[1][-2:])
            except IndexError:
                try:
                    score = '.'.join(li.xpath('//*[@id="review-list"]/div[2]/div[3]/div[3]/div[2]/ul/li[1]/div/div[2]/span[2]/span[1]/text()')[0])
                except IndexError:
                    score = 'null'
            comment = ''.join(li.xpath('.//div[contains(@class,"review-words")]/text()')).replace('\n', '').strip()
            names.append(name)
            dates.append(date)
            scores.append(score)
            comments += 'wys' + comment  # 'wys' is an arbitrary delimiter between reviews
        print('-' * 100)
        # the decryptDic pass above left '?' for glyphs only the sprite knows;
        # fill them, in page order, from mimicon
        for i in mimicon:
            comments = comments.replace('?', i, 1).replace('[', '').replace('{', '').replace('}', '')
        comments = comments.split('wys')
        comments.pop(0)  # drop the empty chunk before the first delimiter
        print(names)
        print(dates)
        print(scores)
        print(comments)
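        # One CSV per site, opened in append mode, so the header row repeats at
        # the top of every page's batch; dedupe it downstream if that matters.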
        csv_file = open(f'C:/Users/29836/Desktop/大众点评/data/{site_name}.csv', 'a+', newline='', encoding='GB18030')
        writer = csv.writer(csv_file)
        writer.writerow(['名字', '时间', '评分', '评论'])  # name, time, score, comment
        for i in range(len(names)):
            writer.writerow([names[i], dates[i], scores[i], comments[i]])
        csv_file.close()
        time.sleep(3)
        print("File: " + site_name + ' ---------------------------------------------------------------')