Xiaohongshu scraping tool: extract comments, post details, and profile information, and collect user UID data [Python]


Download link: www.pan38.com/dow/share.p… Extraction code: 2812

This Xiaohongshu data collection tool covers user profile scraping, post detail scraping, and comment scraping. It uses requests and BeautifulSoup to parse pages and saves the results as CSV files. The code implements the full collection workflow, including exception handling and basic anti-scraping measures.
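Before the full implementation, here is a minimal, self-contained sketch of the kind of anti-scraping precaution the tool relies on: browser-like headers, a shared session, randomized delays, and a simple retry loop. The fetch_with_retry helper and its parameters are illustrative assumptions, not part of the tool below.

import random
import time

import requests

HEADERS = {
    # Desktop browser User-Agent; a valid Cookie from a logged-in session is assumed to be required in practice.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}

def fetch_with_retry(session, url, retries=3, **kwargs):
    """Hypothetical helper: retry a GET a few times, sleeping 1-3 s between attempts."""
    for attempt in range(retries):
        try:
            response = session.get(url, headers=HEADERS, timeout=10, **kwargs)
            if response.status_code == 200:
                return response
        except requests.RequestException:
            pass  # network error: fall through to the retry sleep
        time.sleep(random.uniform(1, 3))  # randomized delay to avoid a fixed request rhythm
    return None

The class below applies the same ideas inline: one requests.Session, browser-like headers, and random sleeps between consecutive requests.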

import requests
import json
import re
import time
import random
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs
import csv
import os

class XiaohongshuSpider:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://www.xiaohongshu.com/',
            'Cookie': 'your_cookie_here'  # replace with a valid cookie
        }
        self.session = requests.Session()
        self.base_url = 'https://www.xiaohongshu.com'
        self.comment_count = 0
        self.user_count = 0
        self.post_count = 0

    def get_user_info(self, user_id):
        """Fetch a user's profile page information."""
        url = f'{self.base_url}/user/profile/{user_id}'
        try:
            response = self.session.get(url, headers=self.headers)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract basic profile fields
                user_info = {
                    'user_id': user_id,
                    'nickname': soup.find('div', class_='user-name').text.strip() if soup.find('div', class_='user-name') else '',
                    'fans_count': soup.find('span', class_='fans').text.strip() if soup.find('span', class_='fans') else '',
                    'follow_count': soup.find('span', class_='follow').text.strip() if soup.find('span', class_='follow') else '',
                    'likes_count': soup.find('span', class_='likes').text.strip() if soup.find('span', class_='likes') else '',
                    'description': soup.find('div', class_='desc').text.strip() if soup.find('div', class_='desc') else ''
                }

                # Save the profile record
                self.save_user_info(user_info)
                self.user_count += 1
                print(f'Collected user: {user_info["nickname"]}')

                # Fetch the user's posts
                self.get_user_posts(user_id)

                return user_info
            else:
                print(f'Failed to fetch user info, status code: {response.status_code}')
                return None
        except Exception as e:
            print(f'Error while fetching user info: {e}')
            return None

    def get_user_posts(self, user_id, limit=20):
        """Fetch a user's list of posts."""
        url = f'{self.base_url}/user/profile/{user_id}/posts'
        params = {
            'page': 1,
            'pageSize': limit
        }

        try:
            response = self.session.get(url, headers=self.headers, params=params)
            if response.status_code == 200:
                data = response.json()
                if data.get('success'):
                    for post in data.get('data', {}).get('notes', []):
                        post_id = post.get('id')
                        self.get_post_detail(post_id)
                        time.sleep(random.uniform(1, 3))
                else:
                    print('Failed to fetch the post list:', data.get('message'))
            else:
                print(f'Failed to fetch the post list, status code: {response.status_code}')
        except Exception as e:
            print(f'Error while fetching the post list: {e}')

    def get_post_detail(self, post_id):
        """Fetch the details of a single post."""
        url = f'{self.base_url}/explore/{post_id}'
        try:
            response = self.session.get(url, headers=self.headers)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract post fields
                post_info = {
                    'post_id': post_id,
                    'title': soup.find('h1', class_='title').text.strip() if soup.find('h1', class_='title') else '',
                    'content': soup.find('div', class_='content').text.strip() if soup.find('div', class_='content') else '',
                    'likes': soup.find('span', class_='like-count').text.strip() if soup.find('span', class_='like-count') else '',
                    'collects': soup.find('span', class_='collect-count').text.strip() if soup.find('span', class_='collect-count') else '',
                    'comments': soup.find('span', class_='comment-count').text.strip() if soup.find('span', class_='comment-count') else '',
                    'publish_time': soup.find('span', class_='time').text.strip() if soup.find('span', class_='time') else '',
                    'author_id': self.extract_user_id_from_post(soup)
                }

                # Save the post record
                self.save_post_info(post_info)
                self.post_count += 1
                print(f'Collected post: {post_info["title"]}')

                # Fetch the post's comments
                self.get_post_comments(post_id)

                return post_info
            else:
                print(f'Failed to fetch post details, status code: {response.status_code}')
                return None
        except Exception as e:
            print(f'Error while fetching post details: {e}')
            return None

    def get_post_comments(self, post_id, limit=50):
        """Fetch a post's comments."""
        url = f'{self.base_url}/web_api/sns/v1/comment/page'
        params = {
            'note_id': post_id,
            'page_size': limit,
            'sort': 'time'
        }

        try:
            response = self.session.get(url, headers=self.headers, params=params)
            if response.status_code == 200:
                data = response.json()
                if data.get('success'):
                    for comment in data.get('data', {}).get('comments', []):
                        comment_info = {
                            'comment_id': comment.get('id'),
                            'post_id': post_id,
                            'user_id': comment.get('user', {}).get('userid'),
                            'nickname': comment.get('user', {}).get('nickname'),
                            'content': comment.get('content'),
                            'likes': comment.get('likes'),
                            'time': comment.get('time'),
                            'reply_count': comment.get('reply_count')
                        }

                        # Save the comment record
                        self.save_comment_info(comment_info)
                        self.comment_count += 1

                        # Also collect the commenter's profile (which in turn collects their posts)
                        if comment_info['user_id']:
                            self.get_user_info(comment_info['user_id'])
                            time.sleep(random.uniform(1, 3))
                else:
                    print('Failed to fetch comments:', data.get('message'))
            else:
                print(f'Failed to fetch comments, status code: {response.status_code}')
        except Exception as e:
            print(f'Error while fetching comments: {e}')

    def extract_user_id_from_post(self, soup):
        """Extract the author's user ID from a post page."""
        author_link = soup.find('a', class_='author')
        if author_link:
            href = author_link.get('href')
            if href:
                return href.split('/')[-1]
        return ''

    def save_user_info(self, user_info):
        """Append a user record to CSV."""
        file_path = 'user_info.csv'
        file_exists = os.path.isfile(file_path)

        with open(file_path, 'a', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=user_info.keys())
            if not file_exists:
                writer.writeheader()
            writer.writerow(user_info)

    def save_post_info(self, post_info):
        """Append a post record to CSV."""
        file_path = 'post_info.csv'
        file_exists = os.path.isfile(file_path)

        with open(file_path, 'a', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=post_info.keys())
            if not file_exists:
                writer.writeheader()
            writer.writerow(post_info)

    def save_comment_info(self, comment_info):
        """Append a comment record to CSV."""
        file_path = 'comment_info.csv'
        file_exists = os.path.isfile(file_path)

        with open(file_path, 'a', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=comment_info.keys())
            if not file_exists:
                writer.writeheader()
            writer.writerow(comment_info)

    def run(self, start_user_id):
        """Start the crawler from a seed user."""
        print('Xiaohongshu data collection tool started...')
        start_time = time.time()

        self.get_user_info(start_user_id)

        end_time = time.time()
        print(f'\nDone! Elapsed: {end_time - start_time:.2f}s')
        print(f'Totals: users={self.user_count}, posts={self.post_count}, comments={self.comment_count}')

if __name__ == '__main__':
    spider = XiaohongshuSpider()
    spider.run('5f0e3b3b0000000001000000')  # replace with the starting user ID you want to collect
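Since all three collectors append to flat CSV files, the output can be inspected afterwards. A minimal sketch, assuming pandas is installed and a run of the tool has already produced the three files written by the save_* methods above:

import pandas as pd

# File names match save_user_info / save_post_info / save_comment_info above.
users = pd.read_csv('user_info.csv', encoding='utf-8-sig')
posts = pd.read_csv('post_info.csv', encoding='utf-8-sig')
comments = pd.read_csv('comment_info.csv', encoding='utf-8-sig')

# Example: number of collected comments per post, joined with the post titles.
per_post = comments.groupby('post_id').size().rename('comment_rows')
print(posts.set_index('post_id')[['title']].join(per_post).head())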