Xiaohongshu Crawler and Collection Tool, Xiaohongshu Traffic-Interception Lead-Generation Software, Live-Stream Comment and Post Extraction Software


Download link: www.pan38.com/share.php?c… Access code: 7789

This project consists of three main modules: the crawler core, data processing, and proxy management. Before use, install libraries such as requests, beautifulsoup4, and pandas. Please note that you must comply with the target site's robots.txt and terms of service.
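A minimal install sketch, assuming a standard pip environment; the package list simply mirrors the imports used in the code below (fake-useragent and jieba are needed by later snippets):

pip install requests beautifulsoup4 pandas fake-useragent jieba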

import requests
from bs4 import BeautifulSoup
import json
import time
import random
from fake_useragent import UserAgent

# Module 1: crawler core (note details, live comments, keyword search)
class XiaoHongShuSpider:
    def __init__(self):
        self.session = requests.Session()
        self.ua = UserAgent()
        self.base_url = "https://www.xiaohongshu.com"
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive'
        }

    def get_random_header(self):
        # Merge the base headers with a fresh random User-Agent for each request
        return {
            **self.headers,
            'User-Agent': self.ua.random,
            'Referer': self.base_url
        }

    def get_note_detail(self, note_id):
        url = f"{self.base_url}/explore/{note_id}"
        try:
            response = self.session.get(url, headers=self.get_random_header())
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                # Parse the page content
                title = soup.find('h1').text if soup.find('h1') else ''
                content = soup.find('div', class_='content').text if soup.find('div', class_='content') else ''
                return {
                    'title': title,
                    'content': content,
                    'note_id': note_id
                }
            return None
        except Exception as e:
            print(f"Error fetching note {note_id}: {str(e)}")
            return None

    def get_live_comments(self, live_id):
        api_url = f"{self.base_url}/api/live/{live_id}/comments"
        try:
            response = self.session.get(api_url, headers=self.get_random_header())
            if response.status_code == 200:
                return response.json()
            return None
        except Exception as e:
            print(f"Error fetching live comments {live_id}: {str(e)}")
            return None

    def search_keyword(self, keyword, page=1):
        params = {
            'keyword': keyword,
            'page': page
        }
        try:
            response = self.session.get(
                f"{self.base_url}/search_api/v1/search",
                params=params,
                headers=self.get_random_header()
            )
            if response.status_code == 200:
                return response.json()
            return None
        except Exception as e:
            print(f"Error searching {keyword}: {str(e)}")
            return None

if __name__ == "__main__":
    spider = XiaoHongShuSpider()

    # Example usage
    note_data = spider.get_note_detail("123456789")
    print(note_data)
    time.sleep(random.uniform(1, 3))
    comments = spider.get_live_comments("987654321")
    print(comments)

import pandas as pd
from datetime import datetime

# Module 2: data processing (CSV export, keyword filtering, simple analysis)
class DataProcessor:
    @staticmethod
    def save_to_csv(data, filename):
        df = pd.DataFrame(data)
        df['crawl_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        df.to_csv(filename, index=False, encoding='utf_8_sig')
        print(f"Data saved to {filename}")

    @staticmethod
    def filter_keyword_comments(comments, keywords):
        if not comments or not isinstance(comments, list):
            return []

        filtered = []
        for comment in comments:
            if any(keyword.lower() in comment.get('content', '').lower()
                   for keyword in keywords):
                filtered.append(comment)
        return filtered

    @staticmethod
    def analyze_user_behavior(notes_data):
        if not notes_data:
            return {}

        df = pd.DataFrame(notes_data)
        analysis = {
            'total_notes': len(df),
            'avg_title_length': df['title'].str.len().mean(),
            'avg_content_length': df['content'].str.len().mean(),
            'top_keywords': DataProcessor._extract_keywords(df['content'].str.cat(sep=' '))
        }
        return analysis

    @staticmethod
    def _extract_keywords(text, top_n=10):
        # Simple keyword extraction: segment with jieba and count word frequencies
        from collections import Counter
        import jieba
        words = jieba.cut(text)
        return Counter(words).most_common(top_n)
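A hypothetical usage sketch for DataProcessor, assuming get_live_comments() returns a list of comment dicts that each carry a 'content' field (the actual response shape is not confirmed by this project):

# Assumed: comments is a list of dicts with a 'content' key
comments = spider.get_live_comments("987654321") or []
matched = DataProcessor.filter_keyword_comments(comments, ["合作", "咨询"])
DataProcessor.save_to_csv(matched, "matched_comments.csv")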

import requests
from threading import Lock

# Module 3: proxy management (round-robin rotation with basic availability checks)
class ProxyManager:
    def __init__(self, api_url=None):
        self.proxies = []
        self.current_index = 0
        self.lock = Lock()
        self.api_url = api_url
    def load_proxies(self, file_path=None):
        if file_path:
            with open(file_path, 'r') as f:
                self.proxies = [line.strip() for line in f if line.strip()]
        elif self.api_url:
            try:
                response = requests.get(self.api_url)
                if response.status_code == 200:
                    self.proxies = response.json().get('data', [])
            except Exception as e:
                print(f"Error loading proxies from API: {str(e)}")

    def get_proxy(self):
        if not self.proxies:
            return None

        with self.lock:
            proxy = self.proxies[self.current_index]
            self.current_index = (self.current_index + 1) % len(self.proxies)
            return {
                'http': f'http://{proxy}',
                'https': f'http://{proxy}'
            }

    def check_proxy_available(self, proxy):
        try:
            test_url = "http://www.baidu.com"
            response = requests.get(
                test_url,
                proxies=proxy,
                timeout=5
            )
            return response.status_code == 200
        except requests.RequestException:
            return False

    def validate_all_proxies(self):
        available_proxies = []
        for proxy in self.proxies:
            formatted = {
                'http': f'http://{proxy}',
                'https': f'http://{proxy}'
            }
            if self.check_proxy_available(formatted):
                available_proxies.append(proxy)
        self.proxies = available_proxies
        return len(available_proxies)
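A brief usage sketch for ProxyManager; the file name proxies.txt and the one-ip:port-per-line format are assumptions for illustration, not part of the original project:

manager = ProxyManager()
manager.load_proxies("proxies.txt")  # assumed format: one ip:port per line
print(f"{manager.validate_all_proxies()} proxies passed the availability check")
proxy = manager.get_proxy()
if proxy:
    # Use the rotated proxy for a single request
    response = requests.get("https://www.xiaohongshu.com", proxies=proxy, timeout=5)

Rotation is a simple round-robin guarded by a Lock, so get_proxy() can safely be called from multiple threads.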