Weibo comment scraping tool, Weibo blogger follower scraper, extracting Weibo post content [Python]


Download: www.pan38.com/dow/share.p… Extraction code: 1281

This Weibo scraping tool covers three main functions: 1. collecting comments on a post, 2. collecting a blogger's follower data, and 3. collecting a blogger's posts. The code uses the requests library for HTTP requests, BeautifulSoup for HTML parsing, and fake-useragent to generate a random User-Agent. Before use, replace the Cookie field with a valid Weibo Cookie.
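Before running it, install the dependencies with pip install requests beautifulsoup4 fake-useragent. Hardcoding the Cookie in the source file is easy to leak; a minimal sketch of reading it from an environment variable instead (the variable name WEIBO_COOKIE is my own convention, not part of the original tool):

import os

def load_cookie():
    # Read the Weibo cookie from the WEIBO_COOKIE environment variable
    # (an assumed name) so it never has to live in the source file.
    cookie = os.environ.get('WEIBO_COOKIE', '')
    if not cookie:
        raise RuntimeError('Set WEIBO_COOKIE to a valid Weibo cookie string')
    return cookie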

import requests
import json
import time
import random
import csv
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from urllib.parse import urlencode

class WeiboCrawler:
    def __init__(self):
        self.session = requests.Session()
        self.ua = UserAgent()
        self.headers = {
            'User-Agent': self.ua.random,
            'Referer': 'https://weibo.com/',
            'Cookie': ''  # replace with a valid Weibo cookie
        }
        self.base_url = 'https://weibo.com/ajax/'
        self.comment_count = 0
        self.follower_count = 0
        self.post_count = 0

    def get_user_id(self, screen_name):
        # Resolve the numeric user ID from the profile-info endpoint
        # (the nickname is passed via the uid query parameter).
        url = f"{self.base_url}profile/info?uid={screen_name}"
        try:
            response = self.session.get(url, headers=self.headers)
            if response.status_code == 200:
                data = response.json()
                return data['data']['user']['id']
            return None
        except Exception as e:
            print(f"Failed to get user ID: {e}")
            return None

    def get_comments(self, weibo_id, max_count=1000):
        # Page through a post's comments via the buildComments endpoint,
        # following the max_id cursor returned by each response.
        comments = []
        url = f"{self.base_url}statuses/buildComments"
        params = {
            'is_reload': 1,
            'id': weibo_id,
            'is_show_bulletin': 2,
            'is_mix': 0,
            'count': 20,
            'uid': 0
        }

        while len(comments) < max_count:
            try:
                response = self.session.get(url, headers=self.headers, params=params)
                if response.status_code == 200:
                    data = response.json()
                    if not data['data']:
                        break
                    comments.extend(data['data'])
                    params['max_id'] = data['max_id']
                    self.comment_count += len(data['data'])
                    print(f"Collected {self.comment_count} comments")
                    time.sleep(random.uniform(1, 3))  # random delay to avoid rate limiting
                else:
                    break
            except Exception as e:
                print(f"Failed to get comments: {e}")
                break
        return comments
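Each element returned by get_comments is a full nested JSON object, which makes for unwieldy CSV columns. A small sketch of flattening a few common fields before saving (the field names text_raw, created_at, and user.screen_name are assumptions about the response shape, not guaranteed by the code above):

def simplify_comments(comments):
    # Reduce raw comment objects to a few flat columns for CSV export.
    rows = []
    for c in comments:
        rows.append({
            'id': c.get('id'),
            'text': c.get('text_raw', ''),          # assumed field name
            'created_at': c.get('created_at', ''),  # assumed field name
            'user': (c.get('user') or {}).get('screen_name', ''),
        })
    return rows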

    def get_followers(self, user_id, max_count=1000):
        # Page through the user's follower list, 20 accounts per page.
        followers = []
        url = f"{self.base_url}friendships/friends"
        params = {
            'uid': user_id,
            'page': 1,
            'count': 20
        }

        while len(followers) < max_count:
            try:
                response = self.session.get(url, headers=self.headers, params=params)
                if response.status_code == 200:
                    data = response.json()
                    if not data['users']:
                        break
                    followers.extend(data['users'])
                    params['page'] += 1
                    self.follower_count += len(data['users'])
                    print(f"Collected {self.follower_count} followers")
                    time.sleep(random.uniform(1, 3))
                else:
                    break
            except Exception as e:
                print(f"Failed to get followers: {e}")
                break
        return followers
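Because get_followers advances a plain page counter, the same account can show up twice if the follower list shifts between requests. A sketch of de-duplicating by user id before saving:

def dedupe_users(users):
    # Keep only the first occurrence of each user id.
    seen = set()
    unique = []
    for u in users:
        uid = u.get('id')
        if uid not in seen:
            seen.add(uid)
            unique.append(u)
    return unique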

    def get_user_posts(self, user_id, max_count=100):
        # Page through the user's posts via the profile hot-feed endpoint.
        posts = []
        url = f"{self.base_url}profile/myhot"
        params = {
            'uid': user_id,
            'page': 1,
            'feature': 0
        }

        while len(posts) < max_count:
            try:
                response = self.session.get(url, headers=self.headers, params=params)
                if response.status_code == 200:
                    data = response.json()
                    if not data['data']['list']:
                        break
                    posts.extend(data['data']['list'])
                    params['page'] += 1
                    self.post_count += len(data['data']['list'])
                    print(f"Collected {self.post_count} posts")
                    time.sleep(random.uniform(1, 3))
                else:
                    break
            except Exception as e:
                print(f"Failed to get posts: {e}")
                break
        return posts
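All three collectors stop at the first non-200 response or exception. If you want them to ride out transient failures instead, one option is a retry wrapper around session.get (a sketch, not part of the original class):

import time
import requests

def get_with_retry(session, url, headers, params=None, retries=3):
    # Retry a GET with exponentially growing delays (1s, 2s, 4s)
    # before giving up and returning None.
    for attempt in range(retries):
        try:
            response = session.get(url, headers=headers, params=params, timeout=10)
            if response.status_code == 200:
                return response
        except requests.RequestException:
            pass
        time.sleep(2 ** attempt)
    return None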

    def save_to_csv(self, data, filename):
        # Write a list of dicts to CSV, using the first record's keys as
        # the header. extrasaction='ignore' skips keys that later records
        # have but the first one lacks, instead of raising ValueError.
        if not data:
            return
        keys = data[0].keys()
        with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=keys, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(data)
        print(f"Data saved to {filename}")

if __name__ == '__main__':
    crawler = WeiboCrawler()

    # Example usage
    screen_name = input("Enter the Weibo blogger's screen name: ")
    user_id = crawler.get_user_id(screen_name)

    if user_id:
        print(f"Starting to collect data for {screen_name}...")

        # Collect follower data
        followers = crawler.get_followers(user_id, max_count=500)
        crawler.save_to_csv(followers, f'{screen_name}_followers.csv')

        # Collect posts
        posts = crawler.get_user_posts(user_id, max_count=200)
        crawler.save_to_csv(posts, f'{screen_name}_posts.csv')

        # Collect comments (on the first post)
        if posts:
            weibo_id = posts[0]['id']
            comments = crawler.get_comments(weibo_id, max_count=300)
            crawler.save_to_csv(comments, f'{screen_name}_comments.csv')

        print("Data collection finished!")
    else:
        print("Could not get the user ID; check that the screen name is correct and the Cookie is valid")