Download link: www.pan38.com/dow/share.p… Extraction code: 2812
This Xiaohongshu data collection tool scrapes user profiles, post details, and comments. It uses requests for HTTP and BeautifulSoup for HTML parsing, and saves the results as CSV files. The code implements the full collection flow, including exception handling and basic anti-blocking measures (random request delays and browser-like headers).
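Before running the script, the 'your_cookie_here' placeholder in the request headers must be replaced with a valid logged-in cookie, or most requests will be rejected. Below is a minimal sketch (separate from the script itself) of loading the cookie from an environment variable instead of hard-coding it; the XHS_COOKIE variable name is only an assumption for illustration:

import os

def load_cookie():
    # Assumed convention: the cookie string is supplied via the XHS_COOKIE environment variable
    cookie = os.environ.get('XHS_COOKIE', '')
    if not cookie:
        raise RuntimeError('Set XHS_COOKIE to a valid Xiaohongshu cookie string')
    return cookie

# Possible usage inside __init__: self.headers['Cookie'] = load_cookie()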
import requests
import json
import re
import time
import random
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs
import csv
import os
class XiaohongshuSpider:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://www.xiaohongshu.com/',
            'Cookie': 'your_cookie_here'  # replace with a valid cookie
        }
        self.session = requests.Session()
        self.base_url = 'https://www.xiaohongshu.com'
        self.comment_count = 0
        self.user_count = 0
        self.post_count = 0
        self.visited_users = set()  # guard against re-crawling the same user via comment links
    def get_user_info(self, user_id):
        """Fetch a user's profile page info."""
        if user_id in self.visited_users:
            return None  # skip users already collected to avoid infinite mutual recursion
        self.visited_users.add(user_id)
        url = f'{self.base_url}/user/profile/{user_id}'
        try:
            response = self.session.get(url, headers=self.headers)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                # Extract basic user fields
                user_info = {
                    'user_id': user_id,
                    'nickname': soup.find('div', class_='user-name').text.strip() if soup.find('div', class_='user-name') else '',
                    'fans_count': soup.find('span', class_='fans').text.strip() if soup.find('span', class_='fans') else '',
                    'follow_count': soup.find('span', class_='follow').text.strip() if soup.find('span', class_='follow') else '',
                    'likes_count': soup.find('span', class_='likes').text.strip() if soup.find('span', class_='likes') else '',
                    'description': soup.find('div', class_='desc').text.strip() if soup.find('div', class_='desc') else ''
                }
                # Save user info
                self.save_user_info(user_info)
                self.user_count += 1
                print(f'Collected user: {user_info["nickname"]}')
                # Fetch the user's posts
                self.get_user_posts(user_id)
                return user_info
            else:
                print(f'Failed to fetch user info, status code: {response.status_code}')
                return None
        except Exception as e:
            print(f'Error fetching user info: {e}')
            return None
    def get_user_posts(self, user_id, limit=20):
        """Fetch the user's post list."""
        url = f'{self.base_url}/user/profile/{user_id}/posts'
        params = {
            'page': 1,
            'pageSize': limit
        }
        try:
            response = self.session.get(url, headers=self.headers, params=params)
            if response.status_code == 200:
                data = response.json()
                if data.get('success'):
                    for post in data.get('data', {}).get('notes', []):
                        post_id = post.get('id')
                        self.get_post_detail(post_id)
                        time.sleep(random.uniform(1, 3))  # random delay as a basic anti-blocking measure
                else:
                    print('Failed to fetch post list:', data.get('message'))
            else:
                print(f'Failed to fetch post list, status code: {response.status_code}')
        except Exception as e:
            print(f'Error fetching post list: {e}')
    def get_post_detail(self, post_id):
        """Fetch post details."""
        url = f'{self.base_url}/explore/{post_id}'
        try:
            response = self.session.get(url, headers=self.headers)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                # Extract post fields
                post_info = {
                    'post_id': post_id,
                    'title': soup.find('h1', class_='title').text.strip() if soup.find('h1', class_='title') else '',
                    'content': soup.find('div', class_='content').text.strip() if soup.find('div', class_='content') else '',
                    'likes': soup.find('span', class_='like-count').text.strip() if soup.find('span', class_='like-count') else '',
                    'collects': soup.find('span', class_='collect-count').text.strip() if soup.find('span', class_='collect-count') else '',
                    'comments': soup.find('span', class_='comment-count').text.strip() if soup.find('span', class_='comment-count') else '',
                    'publish_time': soup.find('span', class_='time').text.strip() if soup.find('span', class_='time') else '',
                    'author_id': self.extract_user_id_from_post(soup)
                }
                # Save post info
                self.save_post_info(post_info)
                self.post_count += 1
                print(f'Collected post: {post_info["title"]}')
                # Fetch the post's comments
                self.get_post_comments(post_id)
                return post_info
            else:
                print(f'Failed to fetch post detail, status code: {response.status_code}')
                return None
        except Exception as e:
            print(f'Error fetching post detail: {e}')
            return None
    def get_post_comments(self, post_id, limit=50):
        """Fetch comments on a post."""
        url = f'{self.base_url}/web_api/sns/v1/comment/page'
        params = {
            'note_id': post_id,
            'page_size': limit,
            'sort': 'time'
        }
        try:
            response = self.session.get(url, headers=self.headers, params=params)
            if response.status_code == 200:
                data = response.json()
                if data.get('success'):
                    for comment in data.get('data', {}).get('comments', []):
                        comment_info = {
                            'comment_id': comment.get('id'),
                            'post_id': post_id,
                            'user_id': comment.get('user', {}).get('userid'),
                            'nickname': comment.get('user', {}).get('nickname'),
                            'content': comment.get('content'),
                            'likes': comment.get('likes'),
                            'time': comment.get('time'),
                            'reply_count': comment.get('reply_count')
                        }
                        # Save comment info
                        self.save_comment_info(comment_info)
                        self.comment_count += 1
                        # Also collect the commenter's profile
                        if comment_info['user_id']:
                            self.get_user_info(comment_info['user_id'])
                        time.sleep(random.uniform(1, 3))
                else:
                    print('Failed to fetch comments:', data.get('message'))
            else:
                print(f'Failed to fetch comments, status code: {response.status_code}')
        except Exception as e:
            print(f'Error fetching comments: {e}')
    def extract_user_id_from_post(self, soup):
        """Extract the author's user ID from a post page."""
        author_link = soup.find('a', class_='author')
        if author_link:
            href = author_link.get('href')
            if href:
                return href.split('/')[-1]
        return ''
    def save_user_info(self, user_info):
        """Append user info to user_info.csv."""
        file_path = 'user_info.csv'
        file_exists = os.path.isfile(file_path)
        with open(file_path, 'a', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=user_info.keys())
            if not file_exists:
                writer.writeheader()
            writer.writerow(user_info)
    def save_post_info(self, post_info):
        """Append post info to post_info.csv."""
        file_path = 'post_info.csv'
        file_exists = os.path.isfile(file_path)
        with open(file_path, 'a', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=post_info.keys())
            if not file_exists:
                writer.writeheader()
            writer.writerow(post_info)
    def save_comment_info(self, comment_info):
        """Append comment info to comment_info.csv."""
        file_path = 'comment_info.csv'
        file_exists = os.path.isfile(file_path)
        with open(file_path, 'a', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=comment_info.keys())
            if not file_exists:
                writer.writeheader()
            writer.writerow(comment_info)
    def run(self, start_user_id):
        """Run the spider starting from a seed user."""
        print('Xiaohongshu data collection tool started...')
        start_time = time.time()
        self.get_user_info(start_user_id)
        end_time = time.time()
        print(f'\nDone! Elapsed: {end_time - start_time:.2f}s')
        print(f'Stats: users={self.user_count}, posts={self.post_count}, comments={self.comment_count}')
if __name__ == '__main__':
    spider = XiaohongshuSpider()
    spider.run('5f0e3b3b0000000001000000')  # replace with the seed user ID you want to start from
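Because each save_* method appends flat rows to user_info.csv, post_info.csv, and comment_info.csv, the results can be inspected afterwards with the standard library alone. A small sketch, assuming the script has already produced post_info.csv in the working directory:

import csv

# Read back the collected posts and print a quick summary
with open('post_info.csv', newline='', encoding='utf-8-sig') as f:
    rows = list(csv.DictReader(f))

print(f'{len(rows)} posts collected')
for row in rows[:5]:
    print(row['post_id'], row['title'])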