Crawling the hot search list
import json
import time
import requests
import sqlalchemy
import pandas as pd
path = 'weibo.sqlite3'
engine = sqlalchemy.create_engine(f'sqlite:///{path}')
conn = engine.connect()
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
}
def get_data():
    """Fetch the current Weibo hot search list.

    Returns:
        list[dict]: {title: headline, url: search URL, num: heat value, hot: hot-search tag}
    """
    data = []
    response = requests.get("https://weibo.com/ajax/side/hotSearch", headers=headers)
    data_json = response.json()['data']['realtime']
    # Map category descriptions to their one-character tags
    jyzy = {
        '电影': '影',
        '剧集': '剧',
        '综艺': '综',
        '音乐': '音'
    }
    for data_item in data_json:
        hot = ''
        # Skip promoted (ad) entries
        if 'is_ad' in data_item:
            continue
        if 'flag_desc' in data_item:
            hot = jyzy.get(data_item['flag_desc'], '')
        if 'is_boom' in data_item:
            hot = '爆'
        if 'is_hot' in data_item:
            hot = '热'
        if 'is_fei' in data_item:
            hot = '沸'
        if 'is_new' in data_item:
            hot = '新'
        dic = {
            'title': data_item['note'],
            'url': 'https://s.weibo.com/weibo?q=%23' + data_item['word'] + '%23',
            'num': data_item['num'],
            'hot': hot
        }
        data.append(dic)
    return data
if __name__ == '__main__':
    data = get_data()
    data = pd.DataFrame(data=data)
    # Titles that have already been stored, so only new ones are appended
    sql = 'select distinct title from weibo_resou'
    crawled_resou = pd.read_sql(sql, conn)['title'].values.tolist()
    data = data[~data['title'].isin(crawled_resou)]
    print(data)
    data.to_sql('weibo_resou', conn, if_exists='append', index=False, chunksize=100)
    conn.close()
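Note that the very first run will fail at pd.read_sql, because weibo_resou does not exist until something has been written to it (or until it is created elsewhere, e.g. by Django migrations, given that the next script uses BASE_DIR = 'mysite'). A minimal sketch of one way to guard against that, keeping the table and column names used above:

inspector = sqlalchemy.inspect(engine)
if 'weibo_resou' in inspector.get_table_names():
    crawled_resou = pd.read_sql('select distinct title from weibo_resou', conn)['title'].values.tolist()
else:
    # first run: nothing has been crawled yet
    crawled_resou = []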
Crawling Weibo posts
You need to be logged in to Weibo and use your own cookie.
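A small aside: rather than pasting the cookie string into the source (as the cookie variable below does), you can read it from an environment variable. The name WEIBO_COOKIE is just an illustration, not something the original script defines.

import os

cookie = os.environ.get('WEIBO_COOKIE', '')
if not cookie:
    raise RuntimeError('set the WEIBO_COOKIE environment variable with your logged-in Weibo cookie')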
import time
import requests
import sqlalchemy
from bs4 import BeautifulSoup
import pandas as pd
import pickle as pkl
import torch
BASE_DIR = 'mysite'
path = 'weibo.sqlite3'
engine = sqlalchemy.create_engine(f'sqlite:///{path}')
conn = engine.connect()
# TODO: fill in your own cookie
cookie = '''
'''
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
    'Cookie': cookie.strip(),
    'Referer': 'https://s.weibo.com/weibo?q=%23%E5%B0%8F%E4%BC%99%E6%B5%8F%E8%A7%88%E9%9D%9E%E6%B3%95%E6%B6%89%E9%BB%84APP%E4%BA%94%E5%A4%A9%E8%A2%AB%E9%AA%97142%E4%B8%87%23&Refer=top&page=4',
}
# Load the pretrained sentiment model and its vocabulary
with open(BASE_DIR + '/notebooks/save_models/data_bundle.pkl', 'rb') as fp:
    data_bundle = pkl.load(fp)
from fastNLP.io.model_io import ModelLoader
model = ModelLoader.load_pytorch_model(BASE_DIR + '/notebooks/save_models/cnn.senti.pkl')
char_vocab = data_bundle.get_vocab('chars')
def predict(sentence):
    """Predict the sentiment class index of a sentence."""
    # Map each character to its vocabulary index and run the model
    idx = [char_vocab.to_index(s) for s in sentence]
    idx = torch.tensor([idx])
    return model.predict(idx)['pred'][0].item()
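As a quick sanity check, predict can be called directly on any string; the sentence below is arbitrary, and what the returned class index means depends on how the model in cnn.senti.pkl was trained.

label = predict('这部电影真好看')  # arbitrary test sentence
print('predicted class index:', label)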
def crawl_content(url, resou_id):
    """Crawl one result page of a hot search and store the posts."""
    try:
        response = requests.get(url, headers=headers)
    except Exception:
        print(url, 'request failed')
        return False
    if response.status_code != 200:
        print('status code:', response.status_code)
        return False
    soup = BeautifulSoup(response.text, 'html5lib')
    data = []
    for div in soup.select('div[action-type="feed_list_item"] div.card'):
        try:
            content = div.select_one('div.content p.txt').get_text().strip().replace(' ', '').replace('\n', '')
            print(content)
            label = predict(content)
            data.append([content, label])
        except Exception:
            continue
    data = pd.DataFrame(data=data)
    if data.shape[0] > 0:
        data.columns = ['content', 'label']
        data['resou_id'] = resou_id
        data.to_sql('weibo_weibo', conn, if_exists='append', chunksize=100, index=False)
        print('inserted')
    else:
        print('no data')
    print(url, 'end')
    # Is there a next page?
    if soup.select_one('div.m-page a.next') is None:
        return False
    return True
# Start crawling
if __name__ == '__main__':
    # Only hot searches whose posts have not been crawled yet
    sql = '''
    select
        id,
        title,
        url
    from
        weibo_resou
    where
        id not in (
            select distinct resou_id from weibo_weibo
        )
    order by
        id desc
    '''
    df = pd.read_sql(sql, conn)
    for _, row in df.iterrows():
        _id = row['id']
        url = row['url']
        # Crawl up to 20 pages per hot search
        for page_index in range(1, 21):
            _url = url + f'&page={page_index}'
            print(_url, 'start')
            has_next_page = crawl_content(_url, _id)
            time.sleep(2)
            # No more pages
            if has_next_page is False:
                break
        time.sleep(3)
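Both scripts assume that weibo_resou and weibo_weibo already exist with an integer id primary key (to_sql alone would not create the id column the queries rely on; in the original setup the tables were presumably created elsewhere, e.g. by Django models under mysite). A minimal sketch of a schema consistent with the columns read and written above; the column types are guesses:

with engine.begin() as c:
    c.execute(sqlalchemy.text('''
        create table if not exists weibo_resou (
            id integer primary key autoincrement,
            title text, url text, num integer, hot text
        )
    '''))
    c.execute(sqlalchemy.text('''
        create table if not exists weibo_weibo (
            id integer primary key autoincrement,
            content text, label integer, resou_id integer
        )
    '''))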
Crawling Weibo user profiles
You need to be logged in to Weibo and use your own cookie.
# Note: this is a method of a Scrapy spider; it needs `import re` at the top of the module.
def parse_user(self, response):
    html = response.text
    # id of the user who published the post
    user_id = response.meta['user_id']
    # the user record collected so far
    user = response.meta['user']
    # Regex for the user's province ("IP属地"); the pattern expects JSON-escaped HTML where '/' is written as '\/'
    pattern = r'.*IP属地:(.*?)<\\/span>'
    try:
        province = re.search(pattern, html).group(1).strip().replace('\\t', '').strip()
    except Exception:
        province = None
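parse_user expects user_id and user to arrive through the request meta, and the request itself has to carry your Weibo cookie (see the reference below). A hedged sketch of how such a request might be issued from another method of the same spider; the profile URL, placeholder values, and self.cookies attribute are illustrative, not taken from the original code.

import scrapy

# another method of the same spider
def parse_weibo(self, response):
    user_id = '1234567890'   # placeholder: in the real spider this comes from the post
    user = {'id': user_id}   # placeholder: fields collected so far
    yield scrapy.Request(
        url=f'https://weibo.com/u/{user_id}',   # illustrative profile URL
        callback=self.parse_user,
        meta={'user_id': user_id, 'user': user},
        cookies=self.cookies,                   # assumed: your cookie string parsed into a dict on the spider
    )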
References
- Using cookies in Scrapy. zhuanlan.zhihu.com/p/337212121