import pandas as pd
from bs4 import BeautifulSoup
import re
import requests
import json
def download_page(url, para=None, timeout=30):
    """Fetch *url* with a browser-like User-Agent and return the decoded body.

    Args:
        url: Address to request.
        para: Optional query parameters passed to ``requests.get`` as
            ``params``; any falsy value means "no parameters".
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error. Without a timeout a stalled server would block
            the caller indefinitely.

    Returns:
        The response text when the status code is 200. Otherwise prints a
        failure message and returns ``None``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59'
    }
    # ``params=None`` is what requests uses by default, so one call covers
    # both the "with parameters" and "without parameters" cases.
    response = requests.get(url, params=para or None, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print("failed to download the page")
        return None
    # Let requests detect the charset from the body instead of trusting the
    # (possibly missing) charset in the HTTP headers.
    response.encoding = response.apparent_encoding
    return response.text
def xueqiu(Start,End):
    """Collect search results from pages Start..End (inclusive) of the xueqiu status search.

    Start and End are converted with int(), so numeric strings are accepted.
    For every entry in the 'list' field of each JSON response, the article
    page is fetched through get_text().

    Returns a list of dicts with the keys 'time', 'target' and 'text'.
    """
    comments_list = []
    # NOTE(review): the two lines holding "Cookie" and 'User-Agent' below are
    # cut off (unterminated string literals, no closing brace) in the text
    # under review; the full values must be restored before this can run.
    # NOTE(review): "Refer" is presumably meant to be the HTTP "Referer"
    # header -- confirm and rename.
    headers = {"Refer":"https://xueqiu.com/k?q=%E9%99%90%E5%94%AE%E8%82%A1%E8%A7%A3%E7%A6%81",
    "Host": "xueqiu.com",
    "Cookie":"acw_tc=2760820216245210669791692ead10af083a4c98f7b2541837fa80a96e6849
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0
    for i in range(int(Start),int(End)+1):
        # The two adjacent string literals are joined by the parser; the page
        # number is appended as the last query parameter.
        url='https://xueqiu.com/query/v1/search/status.json?sortId=3&q=%E9%99%90%E5%94%AE%E8%82%A1%E8%A7%A3%E7%A6%81&count=10''&page='+ str(i)
        # verify=False turns off TLS certificate verification for this request.
        response = requests.get(url, headers=headers, verify=False, timeout=30)
        content = response.text
        # The status code is not checked; a non-JSON body makes json.loads raise.
        result = json.loads(content)
        comments = result['list']
        # NOTE(review): this loop reuses the name `i` of the page loop above.
        # The page loop still advances correctly because range() rebinds `i`
        # on each iteration, but a distinct name would be clearer.
        for i in range (0,len(comments)):
            comment={}
            comment['time']=comments[i]['timeBefore']
            comment['target']=comments[i]['target']
            # 'target' is appended to the site root, so it is presumably a
            # site-relative path -- verify against the API response.
            comment['text'] = get_text("https://xueqiu.com"+comments[i]['target'])
            comments_list.append(comment)
    return comments_list
def get_text(url):
    """Return the text of the first article-detail ``<div>`` found at *url*.

    The page is fetched with ``download_page`` and parsed with the standard
    library's ``html.parser`` so results do not depend on which optional
    parsers happen to be installed.

    Returns:
        The text content of the first ``<div>`` whose class matches
        ``article__bd__detail``. Returns ``""`` when the page could not be
        downloaded or contains no such element, so that one bad article does
        not abort the caller's crawl.
    """
    html = download_page(url)
    if not html:
        # download_page returns None for any non-200 response.
        return ""
    soup = BeautifulSoup(html, "html.parser")
    pattern = re.compile("article__bd__detail.*?")
    # find() yields the first match, i.e. what find_all(...)[0] would give,
    # but returns None instead of raising when nothing matches.
    detail = soup.find("div", {'class': pattern})
    if detail is None:
        return ""
    return detail.get_text()
def output_csv(datalist, path="comments_data.csv"):
    """Append crawled comments to a CSV file.

    Args:
        datalist: Sequence of dicts with the keys ``'time'``, ``'target'``
            and ``'text'``.
        path: Destination file; defaults to ``comments_data.csv`` in the
            current working directory.

    The ``Date, URL, Content`` header row is written only when the file is
    new or empty, so repeated runs append data rows without inserting
    duplicate header rows in the middle of the file.
    """
    import csv
    import os

    print(type(datalist), len(datalist))
    needs_header = not os.path.isfile(path) or os.path.getsize(path) == 0
    # The context manager closes the file even if a row raises (for example
    # when a dict is missing a key).
    with open(path, 'a', newline='', encoding='utf-8-sig') as csv_file:
        writer = csv.writer(csv_file)
        if needs_header:
            writer.writerow(['Date', 'URL', 'Content'])
        writer.writerows(
            [data['time'], "https://xueqiu.com" + data['target'], data['text']]
            for data in datalist
        )
if __name__ == "__main__":
    # Ask for the first and last page to crawl (both inclusive), run the
    # crawl, then hand the collected rows straight to the CSV writer.
    first_page = input('请输入开始爬取的页数:')
    last_page = input('请输入结束爬取的页数:')
    output_csv(xueqiu(first_page, last_page))