创建项目
scrapy startproject myproject [project_dir]
要创建新的爬虫
scrapy genspider juejin juejin.com
编写爬取过程
首先,通过浏览器 F12 开发者工具可知,掘金文章列表是由 Ajax 请求加载的数据;在网络请求面板中可以得到请求的 URL、请求参数与请求头
编写request请求
import json
from payapa.juejinItem import JuejinItem
import scrapy
from scrapy import Spider, Request
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError
class JuejinSpider(scrapy.Spider):
    """Spider that crawls Juejin's recommended-article feed through its JSON API.

    Pages are fetched with POST requests to ``list_url``; pagination is driven
    by the ``cursor`` value returned in each response.
    """
    name = 'juejin'
    # allowed_domains = ['juejin.com']
    start_urls = [
        'https://api.juejin.cn/']
    # Feed endpoint (expects a POST with a JSON body).
    list_url = 'https://api.juejin.cn/recommend_api/v1/article/recommend_cate_feed'
    # Impersonate a regular browser so the API accepts the request.
    headers = {
        "X-Agent": "Juejin/Web",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
        "Content-Type": "application/json",
    }

    def start_requests(self):
        # Request body: the first request uses cursor "0"; subsequent pages
        # reuse the cursor value returned by the previous response
        # (handled in parse_httpbin).
        body = {"id_type": 2, "sort_type": 200,
                "cate_id": "6809637767543259144", "cursor": "0", "limit": 10}
        # POST request.
        yield scrapy.Request(url=self.list_url, callback=self.parse_httpbin,
                             method="POST", headers=self.headers,
                             body=json.dumps(body),
                             errback=self.errback_httpbin)

    def parse_httpbin(self, response):
        """Parse one feed page: yield a JuejinItem per article, then schedule
        the next page while the API reports more data."""
        result = json.loads(response.text)
        if result.get("err_msg") != 'success':
            return
        for entry in result.get("data") or []:
            item = JuejinItem()
            item["title"] = entry.get("article_info").get("title")
            item["categoryName"] = entry.get("category").get("category_name")
            # Collect all tag names and persist them as one JSON string.
            tag_names = [tag.get("tag_name") for tag in entry.get("tags")]
            item["tags"] = json.dumps(tag_names, ensure_ascii=False)
            item["briefContent"] = entry.get("article_info").get("brief_content")
            item["userName"] = entry.get("author_user_info").get("user_name")
            yield item
        # Pagination: only continue when the API signals more data, passing
        # back the cursor it returned for this page.
        if result.get("has_more"):
            body = {"id_type": 2, "sort_type": 200,
                    "cate_id": "6809637767543259144",
                    "cursor": result.get("cursor"), "limit": 20}
            yield scrapy.Request(url=self.list_url, callback=self.parse_httpbin,
                                 method="POST", headers=self.headers,
                                 body=json.dumps(body),
                                 errback=self.errback_httpbin)

    def errback_httpbin(self, failure):
        """Log request failures with enough context to diagnose them."""
        self.logger.info(repr(failure))
        if failure.check(HttpError):
            # These exceptions come from the HttpError spider middleware;
            # the non-2xx response is still available.
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)
        elif failure.check(DNSLookupError):
            # This is the original request.
            request = failure.request
            self.logger.error('DNSLookupError on %s', request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)
JuejinItem 声明文件
# -*- coding:utf-8 -*-
import scrapy
class JuejinItem(scrapy.Item):
    """Container for one scraped Juejin article."""
    title = scrapy.Field()         # article title
    categoryName = scrapy.Field()  # category display name
    tags = scrapy.Field()          # tag names, stored as a JSON-encoded string
    briefContent = scrapy.Field()  # article summary text
    userName = scrapy.Field()      # author's display name
pipelines 文件定义爬取到每条数据后的处理操作,如写入文件、存入数据库等
存文件示例
from scrapy.exporters import JsonItemExporter
class JsonExporterPipleline(object):
    """Item pipeline that streams every item into ``juejin.json``
    using Scrapy's built-in JsonItemExporter."""

    def __init__(self):
        # The exporter writes bytes, so the target file is opened in
        # binary mode; keep both handles for cleanup in close_spider.
        self.file = open('juejin.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8",
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        # Close the JSON array properly, then release the file handle.
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
存mysql数据库示例
from itemadapter import ItemAdapter
import json
import pymysql
class JuejinPipeline:
    """Item pipeline that inserts each scraped article into the local
    MySQL table ``juejin`` (database ``payapa``)."""

    def process_item(self, item, spider):
        # Connect to the local MySQL instance.
        # NOTE(review): opening a connection per item works but is slow;
        # consider moving connect/close into open_spider/close_spider.
        db = pymysql.connect(
            host="127.0.0.1",
            user="root",
            passwd="123456",
            db="payapa",
            # charset="utf8mb4",
            # cursorclass=pymysql.cursors.DictCursor
        )
        try:
            with db.cursor() as cursor:
                # Parameterized query: the driver escapes the values, which
                # prevents SQL injection (and broken statements) from quotes
                # or other special characters in the scraped content.
                sql = ("INSERT INTO juejin"
                       "(categoryName, title, tags, userName, briefContent) "
                       "VALUES (%s, %s, %s, %s, %s)")
                cursor.execute(sql, (item["categoryName"], item["title"],
                                     item["tags"], item["userName"],
                                     item["briefContent"]))
            # Commit the insert.
            db.commit()
        finally:
            # Always release the connection.
            db.close()
        return item
编写完成pipeline文件后需要将编写的文件添加到setting.py 配置中
# Registered item pipelines; the integer is the execution order
# (items flow through pipelines from the lowest value to the highest).
ITEM_PIPELINES = {
    'payapa.pipelines.PayapaPipeline': 300,
    'payapa.JsonExporterPipleline.JsonExporterPipleline': 300,
    'payapa.juejinPipelines.JuejinPipeline': 300,
}
分配给每个 pipeline 类的整型值决定了它们的运行顺序:item 按数值从低到高依次通过各个 pipeline。通常将这些数值定义在 0-1000 范围内(具体数值可随意设置,数值越低,该组件的优先级越高,越先执行)
启动爬虫
scrapy crawl juejin
至此,爬取工作完成