Download: www.pan38.com/dow/share.p… Extraction code: 1133
This project consists of four main modules: a Xiaohongshu crawler, a Weibo crawler, a data processing module, and a main program. Before use, replace the Cookie values in the code and install the required dependencies (a minimal install command follows). Please make sure you comply with each platform's terms of service; this code is for learning and research purposes only.
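The cleaned-up listings below import only two third-party libraries, so a minimal install, assuming the standard PyPI package names, is:

pip install requests pandas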
import requests
import time
import random
import re
import pandas as pd
class XiaoHongShuCrawler:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://www.xiaohongshu.com/',
            'Cookie': 'your_cookie_here'  # replace with a valid logged-in cookie
        }
        self.session = requests.Session()
        self.comment_data = []
        self.user_data = []
    def get_note_id(self, url):
        # Pull the hex note id out of a URL like .../note/<id>
        pattern = r'/note/([0-9a-f]+)'
        match = re.search(pattern, url)
        return match.group(1) if match else None
    def get_comments(self, note_id, max_count=100):
        base_url = 'https://www.xiaohongshu.com/fe_api/burdock/weixin/v2/note/{}/comment'
        url = base_url.format(note_id)
        params = {
            'pageSize': 20,
            'page': 1,
            'topCommentId': '',
            'imageFormats': 'jpg,webp,avif'
        }
        count = 0
        while count < max_count:
            try:
                response = self.session.get(url, headers=self.headers, params=params)
                if response.status_code != 200:
                    break
                data = response.json()
                # Stop when the API returns no more comments
                if not data.get('data', {}).get('comments'):
                    break
                for comment in data['data']['comments']:
                    user_info = {
                        'user_id': comment['user']['userId'],
                        'nickname': comment['user']['nickname'],
                        'avatar': comment['user']['images']
                    }
                    comment_info = {
                        'comment_id': comment['id'],
                        'content': comment['content'],
                        'likes': comment['likes'],
                        'time': comment['time'],
                        'note_id': note_id
                    }
                    self.comment_data.append(comment_info)
                    self.user_data.append(user_info)
                    count += 1
                    if count >= max_count:
                        break
                params['page'] += 1
                # Random delay to avoid hammering the API
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                print(f"Error: {e}")
                break
    def save_to_csv(self, filename_prefix='xiaohongshu'):
        comment_df = pd.DataFrame(self.comment_data)
        user_df = pd.DataFrame(self.user_data)
        comment_df.to_csv(f'{filename_prefix}_comments.csv', index=False, encoding='utf-8-sig')
        user_df.to_csv(f'{filename_prefix}_users.csv', index=False, encoding='utf-8-sig')
    def run(self, url, max_comments=100):
        note_id = self.get_note_id(url)
        if not note_id:
            print("Invalid Xiaohongshu URL")
            return
        print(f"Start crawling note: {note_id}")
        self.get_comments(note_id, max_comments)
        self.save_to_csv()
        print(f"Finished! Collected {len(self.comment_data)} comments and {len(self.user_data)} users")
if __name__ == '__main__':
    crawler = XiaoHongShuCrawler()
    url = input("Enter Xiaohongshu note URL: ")
    crawler.run(url, max_comments=200)
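The data processing module mentioned at the top is not included in the listing. A minimal sketch of what it might look like, assuming it only needs to join the two CSVs written by save_to_csv() and deduplicate; process_exports and the _merged.csv filename are hypothetical, and for the Weibo export the positional alignment only holds if get_followers was not run, since followers are appended to the same user list:

import pandas as pd

def process_exports(prefix='xiaohongshu'):
    # Load the CSVs written by save_to_csv(); comment and user rows were
    # appended in lockstep, so they align by position
    comments = pd.read_csv(f'{prefix}_comments.csv')
    users = pd.read_csv(f'{prefix}_users.csv')
    merged = pd.concat([comments, users], axis=1)
    # Pages can overlap between requests, so drop duplicate comment ids
    merged = merged.drop_duplicates(subset='comment_id')
    merged.to_csv(f'{prefix}_merged.csv', index=False, encoding='utf-8-sig')
    return merged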
import requests
import time
import random
import re
import pandas as pd
class WeiboCrawler:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://weibo.com/',
            'Cookie': 'your_weibo_cookie_here'  # replace with a valid logged-in cookie
        }
        self.session = requests.Session()
        self.comment_data = []
        self.user_data = []
    def get_weibo_id(self, url):
        # Match URLs of the form weibo.com/<uid>/<post id>
        pattern = r'weibo\.com/\d+/(\w+)'
        match = re.search(pattern, url)
        return match.group(1) if match else None
    def get_comments(self, weibo_id, max_count=100):
        base_url = 'https://weibo.com/ajax/statuses/buildComments'
        params = {
            'flow': 0,
            'is_reload': 1,
            'id': weibo_id,
            'is_show_bulletin': 2,
            'is_mix': 0,
            'count': 20,
            'uid': ''
        }
        count = 0
        page = 1
        while count < max_count:
            try:
                params['page'] = page
                response = self.session.get(base_url, headers=self.headers, params=params)
                if response.status_code != 200:
                    break
                data = response.json()
                # Stop when the API returns no more comments
                if not data.get('data'):
                    break
                for comment in data['data']:
                    user_info = {
                        'user_id': comment['user']['id'],
                        'screen_name': comment['user']['screen_name'],
                        'profile_image_url': comment['user']['profile_image_url'],
                        'verified': comment['user']['verified'],
                        'verified_type': comment['user']['verified_type']
                    }
                    comment_info = {
                        'comment_id': comment['id'],
                        'text': comment['text'],
                        'created_at': comment['created_at'],
                        'like_count': comment['like_count'],
                        'weibo_id': weibo_id
                    }
                    self.comment_data.append(comment_info)
                    self.user_data.append(user_info)
                    count += 1
                    if count >= max_count:
                        break
                page += 1
                # Random delay to avoid hammering the API
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                print(f"Error: {e}")
                break
    def get_followers(self, user_id, max_count=50):
        # Note: the friendships/friends endpoint appears to list accounts the
        # user follows rather than their fans; verify against the live API
        base_url = 'https://weibo.com/ajax/friendships/friends'
        params = {
            'uid': user_id,
            'page': 1,
            'count': 20
        }
        count = 0
        while count < max_count:
            try:
                response = self.session.get(base_url, headers=self.headers, params=params)
                if response.status_code != 200:
                    break
                data = response.json()
                if not data.get('users'):
                    break
                for user in data['users']:
                    follower_info = {
                        'user_id': user['id'],
                        'screen_name': user['screen_name'],
                        'profile_url': user['profile_url'],
                        'followers_count': user['followers_count'],
                        'friends_count': user['friends_count'],
                        'statuses_count': user['statuses_count']
                    }
                    self.user_data.append(follower_info)
                    count += 1
                    if count >= max_count:
                        break
                params['page'] += 1
                time.sleep(random.uniform(2, 4))
            except Exception as e:
                print(f"Error: {e}")
                break
    def save_to_csv(self, filename_prefix='weibo'):
        comment_df = pd.DataFrame(self.comment_data)
        user_df = pd.DataFrame(self.user_data)
        comment_df.to_csv(f'{filename_prefix}_comments.csv', index=False, encoding='utf-8-sig')
        user_df.to_csv(f'{filename_prefix}_users.csv', index=False, encoding='utf-8-sig')
    def run(self, url, max_comments=100, max_followers=0):
        weibo_id = self.get_weibo_id(url)
        if not weibo_id:
            print("Invalid Weibo URL")
            return
        print(f"Start crawling weibo: {weibo_id}")
        self.get_comments(weibo_id, max_comments)
        if max_followers > 0:
            user_id = input("Enter Weibo user ID to crawl followers (leave empty to skip): ")
            if user_id:
                self.get_followers(user_id, max_followers)
        self.save_to_csv()
        print(f"Finished! Collected {len(self.comment_data)} comments and {len(self.user_data)} users")
if __name__ == '__main__':
    crawler = WeiboCrawler()
    url = input("Enter Weibo URL: ")
    crawler.run(url, max_comments=200, max_followers=50)
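The main program from the module list is also not shown above. A minimal sketch that ties the two crawlers together, assuming the classes live in xhs_crawler.py and weibo_crawler.py (both module names are hypothetical):

from xhs_crawler import XiaoHongShuCrawler
from weibo_crawler import WeiboCrawler

def main():
    # Dispatch to the right crawler based on user input
    platform = input("Platform (xhs/weibo): ").strip().lower()
    url = input("Target URL: ").strip()
    if platform == 'xhs':
        XiaoHongShuCrawler().run(url, max_comments=200)
    elif platform == 'weibo':
        WeiboCrawler().run(url, max_comments=200, max_followers=50)
    else:
        print("Unknown platform")

if __name__ == '__main__':
    main()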