Xiaohongshu comment collection tool & Weibo comment-section data extractor with follower UIDs and user info [for learning purposes only]


Download: www.pan38.com/dow/share.p… (extraction code: 1133)

This project consists of four main modules: a Xiaohongshu crawler, a Weibo crawler, a data-processing module, and a main program. Before using it, replace the Cookie values in the code with your own and install the required dependency libraries. Please make sure you comply with the platforms' terms of service; this code is for learning and research purposes only.
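Both crawlers below expect a logged-in Cookie in their request headers. Purely as an illustrative sketch (the helper and file name are assumptions, not part of the original code), the Cookie string can be kept in a local text file and loaded at startup instead of being pasted into the source:

# Illustrative helper (assumption, not in the original project):
# read the Cookie string from a local file so it is not hard-coded.
def load_cookie(path='cookie.txt'):
    with open(path, 'r', encoding='utf-8') as f:
        return f.read().strip()

# e.g. when building the headers dict below:
# headers['Cookie'] = load_cookie('xiaohongshu_cookie.txt')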

import requests
import json
import time
import random
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import re
import pandas as pd

class XiaoHongShuCrawler:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://www.xiaohongshu.com/',
            'Cookie': 'your_cookie_here'  # replace with your own logged-in Cookie
        }
        self.session = requests.Session()
        self.comment_data = []
        self.user_data = []

    def get_note_id(self, url):
        # Extract the note ID from a Xiaohongshu note URL
        pattern = r'/note/([0-9a-f]+)'
        match = re.search(pattern, url)
        return match.group(1) if match else None

    def get_comments(self, note_id, max_count=100):
        # Page through the comment API until max_count comments are collected
        base_url = 'https://www.xiaohongshu.com/fe_api/burdock/weixin/v2/note/{}/comment'
        url = base_url.format(note_id)
        params = {
            'pageSize': 20,
            'page': 1,
            'topCommentId': '',
            'imageFormats': 'jpg,webp,avif'
        }

        count = 0
        while count < max_count:
            try:
                response = self.session.get(url, headers=self.headers, params=params)
                if response.status_code != 200:
                    break

                data = response.json()
                if not data.get('data', {}).get('comments'):
                    break

                for comment in data['data']['comments']:
                    user_info = {
                        'user_id': comment['user']['userId'],
                        'nickname': comment['user']['nickname'],
                        'avatar': comment['user']['images']
                    }
                    comment_info = {
                        'comment_id': comment['id'],
                        'content': comment['content'],
                        'likes': comment['likes'],
                        'time': comment['time'],
                        'note_id': note_id
                    }
                    self.comment_data.append(comment_info)
                    self.user_data.append(user_info)
                    count += 1

                    if count >= max_count:
                        break

                params['page'] += 1
                time.sleep(random.uniform(1, 3))  # random delay to reduce request frequency

            except Exception as e:
                print(f"Error: {e}")
                break

    def save_to_csv(self, filename_prefix='xiaohongshu'):
        # Write collected comments and user info to two CSV files
        comment_df = pd.DataFrame(self.comment_data)
        user_df = pd.DataFrame(self.user_data)

        comment_df.to_csv(f'{filename_prefix}_comments.csv', index=False, encoding='utf-8-sig')
        user_df.to_csv(f'{filename_prefix}_users.csv', index=False, encoding='utf-8-sig')

    def run(self, url, max_comments=100):
        note_id = self.get_note_id(url)
        if not note_id:
            print("Invalid Xiaohongshu URL")
            return

        print(f"Start crawling note: {note_id}")
        self.get_comments(note_id, max_comments)
        self.save_to_csv()
        print(f"Finished! Collected {len(self.comment_data)} comments and {len(self.user_data)} users")

if __name__ == '__main__':
    crawler = XiaoHongShuCrawler()
    url = input("Enter Xiaohongshu note URL: ")
    crawler.run(url, max_comments=200)
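The data-processing module mentioned in the intro is not shown in the article. Purely as an illustrative sketch (the function name is an assumption; column names follow the CSV output written above), a minimal cleaning step could deduplicate the exported files:

# Illustrative data-cleaning sketch (assumption, not the original module):
# drop duplicate rows from the exported CSV files and save cleaned copies.
import pandas as pd

def clean_csv(prefix='xiaohongshu'):
    comments = pd.read_csv(f'{prefix}_comments.csv').drop_duplicates(subset='comment_id')
    users = pd.read_csv(f'{prefix}_users.csv').drop_duplicates(subset='user_id')
    comments.to_csv(f'{prefix}_comments_clean.csv', index=False, encoding='utf-8-sig')
    users.to_csv(f'{prefix}_users_clean.csv', index=False, encoding='utf-8-sig')
    print(f"{len(comments)} unique comments, {len(users)} unique users")

The Weibo crawler below follows the same structure as the Xiaohongshu one.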

import requests
import json
import time
import random
import re
import pandas as pd
from urllib.parse import urlencode
from lxml import etree

class WeiboCrawler:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://weibo.com/',
            'Cookie': 'your_weibo_cookie_here'  # replace with your own logged-in Cookie
        }
        self.session = requests.Session()
        self.comment_data = []
        self.user_data = []

    def get_weibo_id(self, url):
        # Extract the post ID from a Weibo URL
        pattern = r'weibo\.com/\d+/(\w+)'
        match = re.search(pattern, url)
        return match.group(1) if match else None

    def get_comments(self, weibo_id, max_count=100):
        # Page through the comment API until max_count comments are collected
        base_url = 'https://weibo.com/ajax/statuses/buildComments'
        params = {
            'flow': 0,
            'is_reload': 1,
            'id': weibo_id,
            'is_show_bulletin': 2,
            'is_mix': 0,
            'count': 20,
            'uid': ''
        }

        count = 0
        page = 1
        while count < max_count:
            try:
                params['page'] = page
                response = self.session.get(base_url, headers=self.headers, params=params)
                if response.status_code != 200:
                    break

                data = response.json()
                if not data.get('data'):
                    break

                for comment in data['data']:
                    user_info = {
                        'user_id': comment['user']['id'],
                        'screen_name': comment['user']['screen_name'],
                        'profile_image_url': comment['user']['profile_image_url'],
                        'verified': comment['user']['verified'],
                        'verified_type': comment['user']['verified_type']
                    }
                    comment_info = {
                        'comment_id': comment['id'],
                        'text': comment['text'],
                        'created_at': comment['created_at'],
                        'like_count': comment['like_count'],
                        'weibo_id': weibo_id
                    }
                    self.comment_data.append(comment_info)
                    self.user_data.append(user_info)
                    count += 1

                    if count >= max_count:
                        break

                page += 1
                time.sleep(random.uniform(1, 3))  # random delay to reduce request frequency

            except Exception as e:
                print(f"Error: {e}")
                break

    def get_followers(self, user_id, max_count=50):
        # Page through a user's follow list and collect basic profile fields
        base_url = 'https://weibo.com/ajax/friendships/friends'
        params = {
            'uid': user_id,
            'page': 1,
            'count': 20
        }

        count = 0
        while count < max_count:
            try:
                response = self.session.get(base_url, headers=self.headers, params=params)
                if response.status_code != 200:
                    break

                data = response.json()
                if not data.get('users'):
                    break

                for user in data['users']:
                    follower_info = {
                        'user_id': user['id'],
                        'screen_name': user['screen_name'],
                        'profile_url': user['profile_url'],
                        'followers_count': user['followers_count'],
                        'friends_count': user['friends_count'],
                        'statuses_count': user['statuses_count']
                    }
                    self.user_data.append(follower_info)
                    count += 1

                    if count >= max_count:
                        break

                params['page'] += 1
                time.sleep(random.uniform(2, 4))

            except Exception as e:
                print(f"Error: {e}")
                break

    def save_to_csv(self, filename_prefix='weibo'):
        # Write collected comments and user info to two CSV files
        comment_df = pd.DataFrame(self.comment_data)
        user_df = pd.DataFrame(self.user_data)

        comment_df.to_csv(f'{filename_prefix}_comments.csv', index=False, encoding='utf-8-sig')
        user_df.to_csv(f'{filename_prefix}_users.csv', index=False, encoding='utf-8-sig')

    def run(self, url, max_comments=100, max_followers=0):
        weibo_id = self.get_weibo_id(url)
        if not weibo_id:
            print("Invalid Weibo URL")
            return

        print(f"Start crawling weibo: {weibo_id}")
        self.get_comments(weibo_id, max_comments)

        if max_followers > 0:
            user_id = input("Enter Weibo user ID to crawl followers (leave empty to skip): ")
            if user_id:
                self.get_followers(user_id, max_followers)

        self.save_to_csv()
        print(f"Finished! Collected {len(self.comment_data)} comments and {len(self.user_data)} users")

if __name__ == '__main__':
    crawler = WeiboCrawler()
    url = input("Enter Weibo URL: ")
    crawler.run(url, max_comments=200, max_followers=50)
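The main program mentioned in the intro is also not included in the article. A minimal sketch (the dispatch logic is my own assumption; it assumes both crawler classes are available in the same module or imported) could simply route a URL to the right crawler:

# Illustrative main program (assumption, not the original module):
# pick the crawler based on the domain in the URL.
def main():
    url = input("Enter a Xiaohongshu note URL or a Weibo post URL: ")
    if 'xiaohongshu.com' in url:
        XiaoHongShuCrawler().run(url, max_comments=200)
    elif 'weibo.com' in url:
        WeiboCrawler().run(url, max_comments=200, max_followers=0)
    else:
        print("Unsupported URL")

if __name__ == '__main__':
    main()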