Download: www.pan38.com/dow/share.p… Extraction password: 1281
This Weibo scraping tool has three main functions: 1. collect the comments under a post; 2. collect a blogger's follower data; 3. collect a blogger's posts. The code uses the requests library for HTTP requests and fake-useragent to generate random User-Agent headers; the data comes back as JSON from Weibo's Ajax endpoints. You must fill in a valid Weibo Cookie before running it.
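The script targets Python 3 and needs two third-party packages (time, random, and csv are in the standard library). Assuming a normal pip setup, they can be installed with:

pip install requests fake-useragent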
import csv
import random
import time

import requests
from fake_useragent import UserAgent
class WeiboCrawler:
    def __init__(self):
        self.session = requests.Session()
        self.ua = UserAgent()
        self.headers = {
            'User-Agent': self.ua.random,
            'Referer': 'https://weibo.com/',
            'Cookie': ''  # replace with a valid Weibo cookie
        }
        self.base_url = 'https://weibo.com/ajax/'
        self.comment_count = 0
        self.follower_count = 0
        self.post_count = 0
    def get_user_id(self, screen_name):
        # profile/info is queried by nickname here; the original passed the
        # nickname as uid=, which this endpoint expects to be a numeric id.
        # screen_name= is an assumption about the Ajax API's parameter name.
        url = f"{self.base_url}profile/info?screen_name={screen_name}"
        try:
            response = self.session.get(url, headers=self.headers)
            if response.status_code == 200:
                data = response.json()
                return data['data']['user']['id']
            return None
        except Exception as e:
            print(f"Failed to get user id: {e}")
            return None
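    # Quick sanity check (hypothetical nickname), after pasting in a Cookie:
    #   crawler = WeiboCrawler()
    #   print(crawler.get_user_id('example_blogger'))  # numeric uid, or None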
    def get_comments(self, weibo_id, max_count=1000):
        comments = []
        url = f"{self.base_url}statuses/buildComments"
        params = {
            'is_reload': 1,
            'id': weibo_id,
            'is_show_bulletin': 2,
            'is_mix': 0,
            'count': 20,
            'uid': 0
        }
        while len(comments) < max_count:
            try:
                response = self.session.get(url, headers=self.headers, params=params)
                if response.status_code != 200:
                    break
                data = response.json()
                if not data['data']:
                    break
                comments.extend(data['data'])
                self.comment_count += len(data['data'])
                print(f"Collected {self.comment_count} comments")
                # max_id is the cursor for the next page; 0 means last page
                if not data.get('max_id'):
                    break
                params['max_id'] = data['max_id']
                time.sleep(random.uniform(1, 3))  # random delay to avoid rate limits
            except Exception as e:
                print(f"Failed to get comments: {e}")
                break
        return comments[:max_count]
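    # Standalone example (the weibo id below is a made-up placeholder):
    #   comments = crawler.get_comments('4901234567890123', max_count=100)
    #   for c in comments[:5]:
    #       print(c.get('text_raw', c.get('text', '')))  # assumes text_raw/text fields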
    def get_followers(self, user_id, max_count=1000):
        followers = []
        url = f"{self.base_url}friendships/friends"
        params = {
            'uid': user_id,
            # assumption: without relate=fans this endpoint returns the
            # accounts the user follows, not their followers
            'relate': 'fans',
            'page': 1,
            'count': 20
        }
        while len(followers) < max_count:
            try:
                response = self.session.get(url, headers=self.headers, params=params)
                if response.status_code != 200:
                    break
                data = response.json()
                if not data['users']:
                    break
                followers.extend(data['users'])
                params['page'] += 1
                self.follower_count += len(data['users'])
                print(f"Collected {self.follower_count} followers")
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                print(f"Failed to get followers: {e}")
                break
        return followers[:max_count]
    def get_user_posts(self, user_id, max_count=100):
        posts = []
        # profile/myhot returns the user's hot posts; statuses/mymblog is
        # (as far as I know) the full timeline, if you need every post
        url = f"{self.base_url}profile/myhot"
        params = {
            'uid': user_id,
            'page': 1,
            'feature': 0
        }
        while len(posts) < max_count:
            try:
                response = self.session.get(url, headers=self.headers, params=params)
                if response.status_code != 200:
                    break
                data = response.json()
                if not data['data']['list']:
                    break
                posts.extend(data['data']['list'])
                params['page'] += 1
                self.post_count += len(data['data']['list'])
                print(f"Collected {self.post_count} posts")
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                print(f"Failed to get posts: {e}")
                break
        return posts[:max_count]
    def save_to_csv(self, data, filename):
        if not data:
            return
        # Use the first record's keys as the CSV header; ignore any extra
        # keys in later records so DictWriter does not raise ValueError
        keys = data[0].keys()
        with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=keys, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(data)
        print(f"Data saved to {filename}")
if __name__ == '__main__':
    crawler = WeiboCrawler()
    # Example usage
    screen_name = input("Enter the Weibo blogger's nickname: ")
    user_id = crawler.get_user_id(screen_name)
    if user_id:
        print(f"Collecting data for {screen_name}...")
        # Collect follower data
        followers = crawler.get_followers(user_id, max_count=500)
        crawler.save_to_csv(followers, f'{screen_name}_followers.csv')
        # Collect posts
        posts = crawler.get_user_posts(user_id, max_count=200)
        crawler.save_to_csv(posts, f'{screen_name}_posts.csv')
        # Collect comments (on the first post)
        if posts:
            weibo_id = posts[0]['id']
            comments = crawler.get_comments(weibo_id, max_count=300)
            crawler.save_to_csv(comments, f'{screen_name}_comments.csv')
        print("Data collection finished!")
    else:
        print("Could not get the user id; check the nickname and whether the Cookie is valid")