Scraping news
1. Fetching the page --- requests
2. Parsing the response
    1. XML: can be handled by matching with re (regular expressions)
    2. HTML: bs4, lxml, ... (parsers for XML-style markup also cover HTML)
    3. JSON (a Python sketch follows this list):
        - Python: the built-in json module
        - Java: fastjson ---> has had known deserialization vulnerabilities
        - Java: Google's Gson
        - Go: built-in encoding/json, reflection-based, so not especially fast
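Since the notes mention Python's built-in json module, here is a minimal sketch of it (the payload below is made up purely for illustration):
import json

# a made-up JSON payload, just to demonstrate the built-in module
raw = '{"title": "Some headline", "tags": ["news", "auto"], "views": 120}'

data = json.loads(raw)                                 # str -> dict
print(data['title'], data['tags'])

print(json.dumps(data, ensure_ascii=False, indent=2))  # dict -> pretty-printed str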
Example
# pip3.10 install beautifulsoup4
import requests
from bs4 import BeautifulSoup
import pymysql

conn = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    user='root',
    password='ln1998151125',
    database='views',
    charset='utf8mb4',
    autocommit=True  # auto-commit inserts/updates/deletes, so no separate conn.commit() confirmation is needed
)
cursor = conn.cursor()

res = requests.get('https://www.autohome.com.cn/news/1/#liststart')
# print(res.text)
soup = BeautifulSoup(res.text, 'html.parser')  # args: 1) the string to parse 2) the parser; html.parser is built in
# soup = BeautifulSoup(res.text, 'lxml')       # lxml is a third-party parser and must be installed first or this raises: pip3.10 install lxml

ul_list = soup.find_all(name='ul', class_="article")
# print(len(ul_list))
for ul in ul_list:
    li_list = ul.find_all(name='li')
    # print(len(li_list))
    for li in li_list:
        # article title
        h3 = li.find(name='h3')
        if not h3:
            continue  # skip placeholder <li> items that carry no article
        title = h3.text
        # article link
        a = li.find(name='a').attrs.get('href')  # the final link looks like https://www.autohome.com.cn/news/202307/1286380.html#pvareaid=102624
        url = 'https:' + a
        # print(url)
        # article image
        img_url = li.find(name='img').attrs.get('src')
        img_url = 'https:' + img_url
        print(img_url)
        # article summary
        p = li.find_all(name='p')[-1]
        desc = p.text
        # print(desc)
        print("""
        News title:   %s
        News link:    %s
        News image:   %s
        News summary: %s
        """ % (title, url, img_url, desc))
        # the `dec` column is backtick-quoted because desc/dec collide with MySQL reserved words
        cursor.execute('insert into view (title,url,img_url,`dec`) values (%s,%s,%s,%s)', args=[title, url, img_url, desc])
        # never build the SQL with % string formatting -- that opens the door to SQL injection:
        # cursor.execute('insert into new (title,url,img_url,`desc`) values ("%s","%s","%s","%s")' % (title, url, img_url, desc))
conn.commit()
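The INSERT above assumes a `view` table already exists in the views database. A minimal sketch of a matching schema, run once beforehand -- the column names come from the INSERT, but the types are assumptions:
# assumed schema for the INSERT above; `dec` stays backtick-quoted because DEC is a MySQL reserved word
cursor.execute("""
    CREATE TABLE IF NOT EXISTS `view` (
        id      INT PRIMARY KEY AUTO_INCREMENT,
        title   VARCHAR(255),
        url     VARCHAR(255),
        img_url VARCHAR(255),
        `dec`   VARCHAR(512)
    )
""")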
bs4: traversing the document tree
The document tree is how bs4 (a third-party package for parsing HTML and finding elements in it) lets you navigate directly by tag name. Selecting this way is fast, but when several tags share the same name only the first one is returned.
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story<span>nana</span></b><b>adfasdf<b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc,'lxml')
# print(soup.prettify()) # pretty-print the parsed document
Usage 1: traverse with dots (attribute access)
a = soup.html.body.a
print(a)
a1 = soup.a # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(a1) # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Usage 2: get a tag's name (tag.name)
a = soup.a # soup.a is a Tag object
print(a) # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(a.name) # a
Usage 3: get a tag's attributes
a = soup.a.attrs
print(a) # {'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}
print(a.get('href')) # http://example.com/elsie
print(a.get('id')) # link1
Usage 4: get a tag's content --- the text
p = soup.p.text # text collects the text of the tag and of all its descendants and joins it together
print(p) # The Dormouse's storynanaadfasdf
p1 = soup.p.string # string only returns the text when the tag contains nothing but that text (no child tags); otherwise None
print(p1) # None
p2 = soup.p.strings # generator over the text of every descendant
print(p2) # <generator object Tag._all_strings at 0x102b137d0>
print(list(p2)) # ["The Dormouse's story", 'nana', 'adfasdf']
Usage 5: chained selection -- after one dot you can keep dotting further
print(soup.head.title.text) # The Dormouse's story
Usage 6: children and descendants
1. soup.p.contents # list of all of p's direct children
print(soup.p.contents) # [<b>The Dormouse's story<span>nana</span></b>, <b>adfasdf<b></b></b>]
2. soup.p.children # an iterator over all of p's direct children
print(list(soup.p.children)) # [<b>The Dormouse's story<span>nana</span></b>, <b>adfasdf<b></b></b>]
Usage 7: parent and ancestors
print(soup.a.parent) # the a tag's parent node, <p class="story">.....</p>
print(list(soup.a.parents)) # all ancestors of the a tag: its parent, the parent's parent, ...
Usage 8: siblings
print(soup.a.next_sibling) # next sibling
print(soup.a.previous_sibling) # previous sibling: 'Once upon a time there were three little sisters; and their names were'
print(list(soup.a.next_siblings)) # all following siblings => generator ['\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and\n', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';\nand they lived at the bottom of a well.']
print(list(soup.a.previous_siblings)) # all preceding siblings => generator ['Once upon a time there were three little sisters; and their names were\n']
bs4: searching the document tree
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b class ='baby'>The Dormouse's story<span>lqz</span></b><b>adfasdf<b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1" xx="xx">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3" name="zzz">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')
Five kinds of filters
string, regular expression, list, True, function
String
Search with a plain string
a=soup.find(xx='xx') # the keyword can be name (tag name), id, class_, href, text, or any other attribute
a = soup.find(name='a')
print(a) # <a class="sister" href="http://example.com/elsie" id="link1" xx="xx">Elsie</a>
print(soup.find_all(name='a',class_="sister")) # [<a class="sister" href="http://example.com/elsie" id="link1" xx="xx">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3" name="zzz">Tillie</a>]
print(soup.find_all(name='a',id = 'link2')) # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
print(soup.find(text='Elsie')) # Elsie
print(soup.find(text='Elsie').parent) # <a class="sister" href="http://example.com/elsie" id="link1" xx="xx">Elsie</a>
print(soup.find(href="http://example.com/elsie")) # <a class="sister" href="http://example.com/elsie" id="link1" xx="xx">Elsie</a>
print(soup.find(attrs={'class':'sister'})) # attributes can also be passed through attrs
print(soup.find(attrs={'name': 'zzz'})) # attrs is needed here because name= is already find's tag-name parameter
Regular expression
import re
a = soup.find_all(class_=re.compile('^s'))
print(a)
# requirement: find every tag that carries a link
a = soup.find_all(href=re.compile('^http'))
print(a)
List
a= soup.find_all(name=['b','span'])
print(a) # [<b class="baby">The Dormouse's story<span>lqz</span></b>, <span>lqz</span>, <b>adfasdf<b></b></b>, <b></b>]
print(soup.find_all(class_=['sister','title'])) # [<p class="title"><b class="baby">The Dormouse's story<span>lqz</span></b><b>adfasdf<b></b></b></p>, <a class="sister" href="http://example.com/elsie" id="link1" xx="xx">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3" name="zzz">Tillie</a>]
True
a= soup.find_all(href = True)
print(a) # [<a class="sister" href="http://example.com/elsie" id="link1" xx="xx">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3" name="zzz">Tillie</a>]
# find every tag that has a class but no id
a = soup.find_all(class_=True,id=False)
print(a)
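Function
The fifth filter from the list above, a function, is not shown in these notes; a minimal sketch -- the callable receives each tag and should return True for the ones to keep:
def has_class_but_no_id(tag):
    # keep tags that define class but not id
    return tag.has_attr('class') and not tag.has_attr('id')

print(soup.find_all(has_class_but_no_id)) # the <p> and <b class="baby"> tags, but none of the <a id="..."> links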
Other usage
1. find is essentially find_all: find's parameters are find_all's parameters and it just returns the first match; find_all additionally accepts limit (see the comparison sketch below)
2. recursive=True: whether to search recursively; the default is True, and with recursive=False only the first level (direct children) is searched
a = soup.find_all(name='html', recursive=False)
# print(a)
1. combining recursive=False with document-tree traversal
a = soup.html.p.find(name='b', recursive=False)
print(a) # <b class="baby">The Dormouse's story<span>lqz</span></b>
2. limit: cap how many results find_all returns
a = soup.find_all(name='a', limit=1)
print(a) # [<a class="sister" href="http://example.com/elsie" id="link1" xx="xx">Elsie</a>]
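A quick comparison on the same soup, showing that find simply returns the first thing find_all would:
first_a = soup.find(name='a')
also_first_a = soup.find_all(name='a', limit=1)[0]
print(first_a is also_first_a) # True -- find gives one Tag, find_all gives a list of Tags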
CSS selectors
tag name
.class-name
#id
div a     # any a among the descendants of div
div>a     # an a that is a direct child of div
(the #id and combinator forms are exercised in the sketch after the basic selectors below)
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b class ='baby'>The Dormouse's story<span>nana</span></b><b>adfasdf<b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1" xx="xx">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3" name="zzz">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc,'lxml')
Basic selectors
print(soup.select('p')) # [<p class="title">....</p>]
print(soup.select('.sister')) # [<a class="sister" href="http://example.com/elsie" id="link1" xx="xx">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3" name="zzz">Tillie</a>]
print(soup.find(name='p').select('span')) # [<span>nana</span>] -- select can be chained off any Tag returned by find
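The #id and combinator selectors from the list above are not exercised in that snippet; a minimal sketch against the same html_doc:
print(soup.select('#link1'))        # the Elsie link, selected by id
print(soup.select('p.story > a'))   # direct <a> children of <p class="story"> -- the three sister links
print(soup.select('body a'))        # any <a> descendant of <body>, the same three links here
print(soup.select_one('p.title b span').text)  # nana -- select_one returns only the first match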
CSS and XPath
1. Going forward, essentially every parsing/automation library supports both CSS selectors and XPath, and either one can be copied straight from the browser's dev tools (right-click the element -> Copy -> Copy selector / Copy XPath)
import requests
res=requests.get('http://it028.com/css-selectors.html')
# res.encoding=res.apparent_encoding
res.encoding='utf-8'
# print(res.text)
soup=BeautifulSoup(res.text,'lxml')
res = soup.select('#content > table > tbody > tr:nth-child(14) > td:nth-child(3)')  # CSS selector copied from dev tools
# the equivalent XPath copied from dev tools: //*[@id="content"]/table/tbody/tr[14]/td[3]
print(res)
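bs4 itself only evaluates the CSS form; to run the copied XPath you can hand the raw HTML to lxml instead. A minimal sketch, assuming the page source is still available in a string (called html_text here, since res was reassigned above):
from lxml import etree

tree = etree.HTML(html_text)  # html_text: the raw page source, e.g. requests.get(...).text
cells = tree.xpath('//*[@id="content"]/table/tbody/tr[14]/td[3]')
# note: dev tools sometimes insert a tbody that the raw HTML does not have; drop '/tbody' if nothing matches
print([c.text for c in cells])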
selenium: basic usage
1. requests only sends HTTP requests; it cannot run the AJAX a page loads
2. selenium drives the browser itself: rather than sending HTTP requests directly, code simulates a person operating the browser, so JS runs and AJAX content loads automatically
3. appium: the same idea for driving a phone
4. Steps (pick the browser to drive: 1 Chrome (used here), 2 IE, 3 Firefox)
    1. Download the Chrome driver that matches your browser version
        Download page: https://registry.npmmirror.com/binary.html?path=chromedriver/
        Browser version: 114.0.5735.198 (pick the corresponding driver version)
    2. Put the driver executable in the project directory
    3. Write the code
# pip3.8 install selenium
from selenium import webdriver
import time
from selenium.webdriver.common.by import By

bro = webdriver.Chrome(executable_path='./chromedriver.exe')  # opens the browser (Selenium 3 style; see the Selenium 4 note below)
bro.get('https://www.baidu.com')
time.sleep(1)
bro.close()
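Hedged note: newer Selenium 4 releases drop the executable_path argument; the driver path is passed through a Service object instead (and since 4.6, Selenium Manager can download a matching driver automatically if none is given). A minimal Selenium 4 sketch:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

bro = webdriver.Chrome(service=Service('./chromedriver.exe'))  # Selenium 4 style
bro.get('https://www.baidu.com')
bro.quit()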
Example
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
bro = webdriver.Chrome() # opens the browser; the Chrome WebDriver is looked up on the default path
print(bro)
bro.get('https://www.baidu.com')
time.sleep(10)
# when the element has an id, prefer locating it by id
input_name = bro.find_element(by=By.ID,value='kw')
# type text into the input
input_name.send_keys('郑秀晶')
button = bro.find_element(by=By.ID,value='su')
button.click()
time.sleep(3)
bro.close()
Simulating a Baidu login
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
bro = webdriver.Chrome()
bro.get('https://www.baidu.com')
bro.implicitly_wait(10) # implicit wait ---> when an element lookup fails, keep retrying for up to 10s; as soon as it appears execution continues, and only if it is still missing after 10s does it raise
bro.maximize_window()
# 1. find the login button
button = bro.find_element(By.ID,value='s-top-loginbtn')
# if it is an <a> tag, it can also be located by its link text
# button = bro.find_element(By.LINK_TEXT, '登录')
button.click()
# click 'SMS login'
sms_login = bro.find_element(By.ID, 'TANGRAM__PSP_11__headerLoginTab')
sms_login.click()
time.sleep(1)
# 2. switch to username/password login
username_login = bro.find_element(By.ID, 'TANGRAM__PSP_11__changePwdCodeItem')
username_login.click()
time.sleep(1)
input_user = bro.find_element(by = By.ID,value='TANGRAM__PSP_11__userName')
input_user.send_keys('18081245209')
input_pwd = bro.find_element(by = By.ID,value='TANGRAM__PSP_11__password')
input_pwd.send_keys('123445')
login = bro.find_element(by = By.ID,value='TANGRAM__PSP_11__submit')
time.sleep(4)
login.click()
time.sleep(3)
bro.close()
selenium: other usage
Headless browser
# headless browser (a pile of options)
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000') # set the browser resolution
chrome_options.add_argument('--disable-gpu') # Google's docs mention this flag to work around a bug
chrome_options.add_argument('--hide-scrollbars') # hide scrollbars, helps on some odd pages
chrome_options.add_argument('blink-settings=imagesEnabled=false') # skip loading images to speed things up
chrome_options.add_argument('--headless') # no visible window; on Linux without a display, startup fails unless this is set
bro=webdriver.Chrome(options=chrome_options)
bro.get('https://www.douyin.com/')
print(bro.page_source) # the current page source
time.sleep(3)
bro.close()
Locating elements
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
bro = webdriver.Chrome()
bro.get('https://www.cnblogs.com/')
bro.implicitly_wait(10)
# bs4 supports CSS selectors too (via select / select_one)
# selenium's find_element and find_elements support both CSS selectors and XPath
# the locator strategies (values below are placeholders -- fill in real ids/names/selectors before running):
# bro.find_element(by=By.ID, value='...')                 # by id, returns one element
# bro.find_element(by=By.NAME, value='...')               # by the name attribute
# bro.find_elements(by=By.TAG_NAME, value='div')          # by tag name, returns all matches
# bro.find_element(by=By.LINK_TEXT, value='...')          # by an <a> tag's exact text
# bro.find_element(by=By.PARTIAL_LINK_TEXT, value='...')  # by part of an <a> tag's text
# bro.find_element(by=By.CLASS_NAME, value='...')         # by class name
# bro.find_element(by=By.CSS_SELECTOR, value='...')       # by CSS selector
# bro.find_element(by=By.XPATH, value='...')              # by XPath
bro.close()
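A minimal sketch of the CSS-selector and XPath strategies on the cnblogs page opened above, run before bro.close(); the selectors are assumptions about its markup, so copy real ones from dev tools if they do not match:
# hypothetical selectors for a post title on the cnblogs front page
first_title = bro.find_element(By.CSS_SELECTOR, 'a.post-item-title')
print(first_title.text)
same_title = bro.find_element(By.XPATH, '//a[contains(@class, "post-item-title")]')
print(same_title.get_attribute('href'))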
Getting a tag's attributes, text, and size
print(tag.get_attribute('src')) # the most commonly used
tag.text # text content
# get the element's internal id, location, tag name and size (just for reference)
print(tag.id) # not the HTML id attribute -- an internal id selenium assigns, rarely useful
print(tag.location) # x, y coordinates
print(tag.tag_name) # tag name
print(tag.size) # width and height
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
bro = webdriver.Chrome() # opens the browser
bro.get('https://www.chaojiying.com/apiuser/login/')
bro.implicitly_wait(10)
tag = bro.find_element(By.CSS_SELECTOR,'body > div.wrapper_danye > div > div.content_login > div.login_form > form > div > img')
print(tag.get_attribute('src')) # https://www.chaojiying.com/include/code/code.php?u=2
print(tag.id) # 551A961BF3D80C47B79E9C9026F2D166_element_10
print(tag.location) # {'x': 758, 'y': 291}
print(tag.tag_name) # img
print(tag.size) # {'height': 50, 'width': 180}
Homework
1. Save the captcha image to disk
https://www.chaojiying.com/apiuser/login/
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from PIL import Image
bro = webdriver.Chrome()
bro.set_window_size(1200, 800)
bro.get('https://www.chaojiying.com/apiuser/login/')
bro.implicitly_wait(10)
bro.get_screenshot_as_file('CrawlResult/screenshot.png')
img = bro.find_element(by=By.CSS_SELECTOR,value='body > div.wrapper_danye > div > div.content_login > div.login_form > form > div > img')
img_location = img.location
img_size = img.size
print(img_location) # {'x': 758, 'y': 291}
print(img_size) # {'height': 50, 'width': 180}
left = int(img_location['x'])
top = int(img_location['y'])
right = int(img_location['x'] + img_size['width'])
bottom = int(img_location['y'] + img_size['height'])
# crop the captcha out of the full-page screenshot with PIL
im = Image.open('CrawlResult/screenshot.png')
im = im.crop((left, top, right, bottom))
im.save('CrawlResult/code.png')
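One caveat about this crop: location and size are reported in CSS pixels, while the screenshot is taken in device pixels, so on a HiDPI/Retina display (device pixel ratio != 1) the box has to be scaled first. A hedged sketch of the adjustment:
# scale CSS-pixel coordinates to screenshot pixels when devicePixelRatio != 1
ratio = bro.execute_script('return window.devicePixelRatio')
box = (int(left * ratio), int(top * ratio), int(right * ratio), int(bottom * ratio))
im = Image.open('CrawlResult/screenshot.png')
im.crop(box).save('CrawlResult/code.png')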