Scraping JD (jd.com) product info with Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  # keyboard key constants
import time
import pymysql

conn = pymysql.connect(
    user='root',        # the first four arguments follow the DB-API 2.0 recommendation
    password="xxxxxxx",
    host='127.0.0.1',
    database="jingdong",
    autocommit=True,
    port=3306)
cursor = conn.cursor()
def get_goods(bro):
    bro.execute_script('scrollTo(0,5000)')  # scroll down a bit so lazy-loaded items render
    time.sleep(1)  # give the page a moment to load the newly revealed items
    li_list = bro.find_elements(By.CLASS_NAME, 'gl-item')
    # print(len(li_list))  # 30
    for watch in li_list:
        try:
            # search inside the current <li> (watch) instead of the original page-wide
            # nth-child / per-SKU id selectors, otherwise every iteration returns the same product
            good_url = watch.find_element(By.CSS_SELECTOR, 'div.p-img > a').get_attribute('href')
            img = watch.find_element(By.CSS_SELECTOR, 'div.p-img > a > img')
            img_url = img.get_attribute('src')  # lazy-loaded images may expose the URL via data-lazy-img instead
            # print(img_url)
            price = watch.find_element(By.CSS_SELECTOR, 'div.p-price > strong').text
            # print(price)
            desc = watch.find_element(By.CSS_SELECTOR, 'div.p-name.p-name-type-2 > a > em').text
            # print(desc)
            # the class-based selectors below are assumptions replacing the original per-SKU ids
            # (J_comment_xxx / J_pro_xxx); adjust them if JD's markup differs
            common_num = watch.find_element(By.CSS_SELECTOR, 'div.p-commit strong a').text
            shop_name = watch.find_element(By.CSS_SELECTOR, 'div.p-shop > span > a').text
            shop_nature = watch.find_element(By.CSS_SELECTOR, 'div.p-icons i').text
            print(f"""
            Product link: {good_url}
            Image: {img_url}
            Price: {price}
            Description: {desc}
            Comment count: {common_num}
            Shop: {shop_name}
            Shop type: {shop_nature}
            """)
            cursor.execute(
                'insert into test (good_url,img_url,price,`desc`,common_num,shop_name,shop_nature) values (%s,%s,%s,%s,%s,%s,%s)',
                args=[good_url, img_url, price, desc, common_num, shop_name, shop_nature])
        except Exception as e:
            print(e)
            continue
    next_page = bro.find_element(By.PARTIAL_LINK_TEXT, '下一页')  # the "next page" link
    next_page.click()
    get_goods(bro)  # recurse onto the next results page
bro = webdriver.Chrome()
bro.get('https://www.jd.com/')
bro.implicitly_wait(10)
bro.maximize_window()
try:
    search_input = bro.find_element(By.ID, 'key')
    search_input.send_keys('华为手表')  # search keyword ("Huawei watch")
    search_input.send_keys(Keys.ENTER)
    get_goods(bro)
except Exception as e:
    print(e)
finally:
    bro.close()
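The insert statement above assumes a test table already exists in the jingdong database. A minimal one-off sketch for creating it with the same pymysql cursor; the column types and lengths here are assumptions, not taken from the original project:

# one-off setup: create the table the scraper inserts into (types/lengths are assumed)
cursor.execute("""
create table if not exists test (
    id int primary key auto_increment,
    good_url varchar(512),
    img_url varchar(512),
    price varchar(64),
    `desc` varchar(256),
    common_num varchar(64),
    shop_name varchar(128),
    shop_nature varchar(64)
)
""")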
Scrapy: introduction, installation, architecture
Scrapy is a crawler framework: the low-level plumbing is already written, so you only fill in code at fixed places. It is widely used and is essentially the Django of the scraping world.
Scrapy architecture
1. Engine (ENGINE): controls the data flow between all components and triggers events when certain actions happen. It is the "general manager" responsible for moving all crawl data around.
2. Scheduler (SCHEDULER): accepts requests from the engine, pushes them into a queue, and hands them back when the engine asks again. Think of it as a priority queue of URLs: it decides which URL to crawl next and also removes duplicate URLs.
3. Downloader (DOWNLOADER): downloads page content and returns it to the ENGINE. It is built on Twisted, an efficient asynchronous model, so it can issue a very large number of requests concurrently.
4. Spiders (SPIDERS): developer-defined classes that parse responses, extract items, or send new requests.
5. Item pipelines (ITEM PIPELINES): process items after they are extracted; mainly cleaning, validation, and persistence (e.g. saving to a database).
6. Downloader middlewares (Downloader Middlewares): sit between the Scrapy engine and the downloader; they process requests travelling from the ENGINE to the DOWNLOADER and responses travelling from the DOWNLOADER back to the ENGINE. A sketch of such a middleware follows this list.
7. Spider middlewares (Spider Middlewares): sit between the ENGINE and the SPIDERS; they process the spiders' input (responses) and output (requests).
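As a concrete illustration of item 6, here is a minimal downloader-middleware sketch. The class name and the header it sets are made up for the example; the process_request/process_response hooks are the standard Scrapy ones, and the class is enabled via DOWNLOADER_MIDDLEWARES in settings.py (see the settings section below):

# middlewares.py -- minimal downloader middleware sketch (illustrative)
class ExampleDownloaderMiddleware:
    def process_request(self, request, spider):
        # runs for each request travelling from the engine to the downloader;
        # returning None lets the request continue on to the downloader
        request.headers.setdefault(b'User-Agent', b'Mozilla/5.0 (example UA)')
        return None

    def process_response(self, request, response, spider):
        # runs for each response travelling from the downloader back to the engine;
        # returning the response passes it on towards the spider
        spider.logger.debug('%s -> %s', response.status, request.url)
        return response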
Installation
1. Windows
    1. pip3 install wheel  # enables installing packages from .whl files; wheel downloads: https://www.lfd.uci.edu/~gohlke/pythonlibs
    2. pip3 install lxml
    3. pip3 install pyopenssl
    4. download and install pywin32: https://sourceforge.net/projects/pywin32/files/pywin32/
    5. download the Twisted wheel file: http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
    6. pip3 install <download dir>\Twisted-17.9.0-cp36-cp36m-win_amd64.whl
    7. pip3 install scrapy
2. Linux / macOS
    1. pip3 install scrapy
Creating a scrapy project: commands
1. Create a project
    scrapy startproject <project name>
2. Create a spider: scrapy genspider <name> <domain>
    scrapy genspider cnblogs www.cnblogs.com
3. Run the spider
    1. Option 1: from the command line
        scrapy crawl cnblogs
    2. Option 2: run the spider from a script (create a launcher file; an alternative launcher sketch follows below)
        # run.py
        from scrapy.cmdline import execute
        execute(['scrapy', 'crawl', 'cnblogs', '--nolog'])
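Another common launcher pattern, sketched here as an alternative, uses Scrapy's CrawlerProcess so the crawl runs in-process with the project settings loaded (spider name 'cnblogs' as above):

# run.py -- alternative launcher using CrawlerProcess
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # load the project's settings.py
process.crawl('cnblogs')                          # the spider's name attribute
process.start()                                   # blocks until crawling is finished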
Scrapy project directory structure
myfirstscrapy          # project folder
 -myfirstscrapy        # the package
  --spiders            # package holding your spiders, similar to Django apps
   ---__init__.py
   ---baidu.py         # a spider you created
   ---cnblogs.py       # a spider you created
  --__init__.py
  --items.py           # like Django's models: one model class per item (see the sketch after this tree)
  --middlewares.py     # middlewares
  --pipelines.py       # pipelines ---> persistence code goes here
  --run.py             # created by you to launch the spider
  --settings.py        # project settings
 -scrapy.cfg           # deployment config
ps: later on we mostly only care about the spiders in spiders/ and persistence in pipelines.py
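Since items.py plays the role of Django's models, here is a minimal sketch of an item class for the cnblogs example further below; the class name and fields are illustrative, not code generated by scrapy startproject:

# items.py -- illustrative item class for the cnblogs example
import scrapy

class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    desc = scrapy.Field()
    img = scrapy.Field()
    author = scrapy.Field()
    create_time = scrapy.Field()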
Parsing data with scrapy
1. The response object has css() and xpath() methods (scrapy ships its own parsing support for both XPath and CSS, so no third-party library is needed)
    - css() takes a CSS selector
    - xpath() takes an XPath expression
2. Key point 1:
    1. XPath, taking text:
        './/a[contains(@class,"link-title")]/text()'
    2. XPath, taking an attribute:
        './/a[contains(@class,"link-title")]/@href'
    3. CSS, taking text:
        'a.link-title::text'
    4. CSS, taking an attribute:
        'img.image-scale::attr(src)'
3. Key point 2 (see the sketch after this list):
    .extract_first()  take the first match
    .extract()        take all matches
4. When the spider starts, the start_urls addresses are wrapped into request objects --->
   handed to the engine ---> scheduler ---> queued ---> engine ---> downloader middlewares ---> downloader ---> download finished ---> engine ---> spider ----> and we land back in parse
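A quick illustration of the calls above inside a spider's parse method; note that recent Scrapy versions also offer get()/getall() as aliases for extract_first()/extract():

# inside parse(self, response)
title = response.css('a.link-title::text').extract_first()                     # first match, or None
links = response.xpath('//a[contains(@class,"link-title")]/@href').extract()   # list of all matches
# equivalent on recent Scrapy versions:
# title = response.css('a.link-title::text').get()
# links = response.xpath('//a[contains(@class,"link-title")]/@href').getall()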
Fetching all articles with CSS selectors
import scrapy


class CnblogsSpider(scrapy.Spider):
    name = "cnblogs"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["https://www.cnblogs.com"]

    def parse(self, response):
        # print(response.text)  # the http response wrapped as a Response object
        # goal: parse out all articles (CSS selectors)
        article_list = response.css('article.post-item')  # list of selector objects
        # print(len(article_list))
        for article in article_list:
            title = article.css('a.post-item-title::text').extract_first()
            url = article.css('a.post-item-title::attr(href)').extract_first()
            desc = article.css('p.post-item-summary::text').extract()
            # print(desc)
            # the summary node can contain an avatar <img>, so the useful text may be
            # either the first or the second text chunk
            real_desc = desc[0].replace('\n', '').replace(' ', '')
            if real_desc:
                desc = real_desc
            else:
                desc = desc[1].replace('\n', '').replace(' ', '')
            img = article.css('img.avatar::attr(src)').extract_first()
            author = article.css('a.post-item-author>span::text').extract_first()
            create_time = article.css('span.post-meta-item>span::text').extract_first()
            print(f"""
            Title: {title}
            Link: {url}
            Summary: {desc}
            Author avatar: {img}
            Author: {author}
            Created: {create_time}
            """)
Fetching all articles with XPath
import scrapy


class CnblogsSpider(scrapy.Spider):
    name = "cnblogs"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["https://www.cnblogs.com"]

    def parse(self, response):
        # print(response.text)  # the http response wrapped as a Response object
        # goal: parse out all articles (XPath selectors)
        # css equivalent: article.post-item
        article_list = response.xpath('//article[contains(@class,"post-item")]')  # list of selector objects
        print(len(article_list))
        for article in article_list:
            # css equivalent: a.post-item-title::text
            title = article.xpath('.//a/text()').extract_first()
            # css equivalent: a.post-item-title::attr(href)
            url = article.xpath('.//a[contains(@class,"post-item-title")]/@href').extract_first()
            # css equivalent: p.post-item-summary::text
            desc = article.xpath('.//p[contains(@class,"post-item-summary")]/text()').extract()
            # print(desc)
            real_desc = desc[0].replace('\n', '').replace(' ', '')
            if real_desc:
                desc = real_desc
            else:
                desc = desc[1].replace('\n', '').replace(' ', '')
            # css equivalent: img.avatar::attr(src)
            img = article.xpath('.//p//img/@src').extract_first()
            # css equivalent: a.post-item-author>span::text
            author = article.xpath('.//section/footer/a[1]/span/text()').extract_first()
            # css equivalent: span.post-meta-item>span::text
            create_time = article.xpath('.//section/footer/span[1]/span/text()').extract_first()
            print(f"""
            Title: {title}
            Link: {url}
            Summary: {desc}
            Author avatar: {img}
            Author: {author}
            Created: {create_time}
            """)
settings configuration
Basic settings
1. (just FYI) project name, i.e. the name of the whole crawler
    BOT_NAME = "firstscrapy"
2. (just FYI) where the spiders live
    SPIDER_MODULES = ["firstscrapy.spiders"]
    NEWSPIDER_MODULE = "firstscrapy.spiders"
3. Whether to obey robots.txt; usually set to False
    ROBOTSTXT_OBEY = False
4. User agent
    USER_AGENT = "firstscrapy (+http://www.yourdomain.com)"
5. Log level
    LOG_LEVEL = 'ERROR'
6. DEFAULT_REQUEST_HEADERS: default request headers
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
    }
7. SPIDER_MIDDLEWARES: spider middlewares
    SPIDER_MIDDLEWARES = {
        'cnblogs.middlewares.CnblogsSpiderMiddleware': 543,
    }
8. DOWNLOADER_MIDDLEWARES: downloader middlewares
    DOWNLOADER_MIDDLEWARES = {
        'cnblogs.middlewares.CnblogsDownloaderMiddleware': 543,
    }
9. ITEM_PIPELINES: persistence (item pipeline) settings; a pipeline sketch follows below
    ITEM_PIPELINES = {
        'cnblogs.pipelines.CnblogsPipeline': 300,
    }
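For ITEM_PIPELINES to do anything, the referenced pipeline class has to exist in pipelines.py. A minimal persistence sketch that writes items to a JSON-lines file; the file name and field handling are illustrative, while open_spider/process_item/close_spider are the standard Scrapy hooks:

# pipelines.py -- minimal persistence sketch
import json

class CnblogsPipeline:
    def open_spider(self, spider):
        # called once when the spider starts
        self.f = open('articles.jsonl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # called for every item yielded by the spider
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item  # pass the item on to any later pipeline

    def close_spider(self, spider):
        # called once when the spider finishes
        self.f.close()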
Advanced settings (improving crawl efficiency with scrapy; a combined settings sketch follows this list)
1. Increase concurrency (default is 16):
    By default scrapy issues 16 concurrent requests; this can be raised in settings.py:
    CONCURRENT_REQUESTS = 100  # concurrency raised to 100
2. Lower the log level:
    Scrapy produces a lot of log output while running; to reduce CPU usage, set the log level to INFO or ERROR:
    LOG_LEVEL = 'INFO'
3. Disable cookies:
    If cookies are not actually needed, disabling them reduces CPU usage and speeds up the crawl:
    COOKIES_ENABLED = False
4. Disable retries:
    Re-requesting failed HTTP requests (retrying) slows the crawl down, so retries can be turned off:
    RETRY_ENABLED = False
5. Reduce the download timeout:
    When crawling very slow links, a smaller download timeout lets stuck requests be abandoned quickly, improving throughput:
    DOWNLOAD_TIMEOUT = 10  # timeout of 10 seconds
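Putting the tuning options above together, the relevant part of settings.py might look like this sketch (values are the examples from this section, not recommendations for every site):

# settings.py -- efficiency-related settings from this section
CONCURRENT_REQUESTS = 100   # raise concurrency from the default 16
LOG_LEVEL = 'ERROR'         # keep logging quiet (INFO also works)
COOKIES_ENABLED = False     # skip cookie handling if the site does not need it
RETRY_ENABLED = False       # do not retry failed requests
DOWNLOAD_TIMEOUT = 10       # give up on slow responses after 10 seconds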
Projects
1. Vue front end, mini-programs, uniapp, Android
-https://gitee.com/hjp1011/uniapp-oa?_from=gitee_search
-https://gitee.com/microapp/linjiashop-uniapp?_from=gitee_search
-https://gitee.com/open-source-byte/house?_from=gitee_search
-https://gitee.com/CZJpython/time-mercenaries
-WeChat mini-programs
-https://gitee.com/kesixin/QuestionWechatApp?_from=gitee_search
-https://gitee.com/yaozy717/hbhzdtn?_from=gitee_search
-https://gitee.com/voice-of-xiaozhuang/sportmini?_from=gitee_search
-vue
-https://gitee.com/vilson/vue-projectManage?_from=gitee_search
-https://gitee.com/nmgwap/vueproject?_from=gitee_search
-https://gitee.com/GaryZhouz/exam_system?_from=gitee_search
-Python projects
-https://gitee.com/wushuiyong/walle-web
-https://gitee.com/fanhuibin1/zhaoxinpms?_from=gitee_search#%E4%BA%A7%E5%93%81%E8%93%9D%E5%9B%BE
-https://gitee.com/ht-jenny/Parking?_from=gitee_search