97. Crawling news, intro to bs4 & traversing the document tree, searching the document tree with bs4, CSS selectors, Selenium basics, other Selenium usage, searching for tags


Crawling News

1. Fetch the page --- requests
2. Parse the response
	1. xml format: can be matched with re
	2. html: bs4, lxml, ... (xml is a superset of html)
	3. json (a quick Python sketch follows this list):
		1. python: built-in json module
		2. java: fastjson ---> has had serious vulnerabilities
		3. java: Google's Gson
		4. go: built-in, reflection-based, so not very fast
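A quick sketch of the built-in Python json parsing mentioned in the list (the payload string here is made up for illustration):

import json

payload = '{"title": "demo", "tags": ["crawler", "bs4"]}'
data = json.loads(payload)                    # JSON string -> Python dict/list
print(data['tags'][0])                        # crawler
print(json.dumps(data, ensure_ascii=False))   # Python objects -> JSON string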

Example

# pip3.10 install beautifulsoup4

import requests
from bs4 import BeautifulSoup
import pymysql


conn = pymysql.connect(
    host = '127.0.0.1',
    port = 3306,
    user = 'root',
    password='ln1998151125',
    database='views',
    charset='utf8mb4',
    autocommit=True  # auto-commit inserts/updates/deletes, equivalent to calling conn.commit() after each statement
)
cursor = conn.cursor()
res = requests.get('https://www.autohome.com.cn/news/1/#liststart')
# print(res.text)


soup = BeautifulSoup(res.text,'html.parser')  # arg1: the string to parse; arg2: the parser (html.parser is built-in)
# soup = BeautifulSoup(res.text, 'lxml')  # lxml is a third-party parser and must be installed first, otherwise this raises an error: pip3.10 install lxml
ul_list=soup.find_all(name='ul',class_="article")
# print(len(ul_list))

for ul in ul_list:
    li_list=ul.find_all(name='li')
    # print(len(li_list))
    for li in li_list:
        # Article title
        h3=li.find(name='h3')
        if not h3:
            break
        title = h3.text

        # Article link
        a = li.find(name='a').attrs.get('href') # https://www.autohome.com.cn/news/202307/1286380.html#pvareaid=102624
        url = 'https:'+a
        # print(url)

        # News image
        img_url = li.find(name='img').attrs.get('src')
        img_url='https:'+img_url
        print(img_url)

        # Article summary
        p = li.find_all(name='p')[-1]
        desc = p.text
        # print(desc)


        print("""
        新闻标题:%s
        新闻连接:%s
        新闻图片:%s
        新闻简介:%s

        """%(title,url,img_url,desc))
        cursor.execute('insert into view (title,url,img_url,`dec`) values (%s,%s,%s,%s)',args=[title,url,img_url,desc])  # desc是mysql关键字
        # cursor.execute('insert into new (title,url,img_url,look_num,critical_num,`desc`) values (%s,%s,%s,%s,%s,%s)'%(title,url,img_url,look_num,critical_num,desc) ) #会xss攻击
        conn.commit()
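The insert above assumes the views database and the view table already exist. A minimal sketch of a matching table definition, with column names and rough sizes inferred from the insert statement (adjust as needed):

create_sql = """
CREATE TABLE IF NOT EXISTS `view` (
    id      INT PRIMARY KEY AUTO_INCREMENT,
    title   VARCHAR(255),
    url     VARCHAR(255),
    img_url VARCHAR(255),
    `dec`   VARCHAR(500)  -- summary column; backtick-quoted because dec is a reserved word
) DEFAULT CHARSET = utf8mb4;
"""
cursor.execute(create_sql)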

bs4: Traversing the Document Tree

bs4 document tree: bs4 is a third-party package for parsing HTML and locating elements in it. Traversal means selecting directly by tag name; it is fast, but if several tags share the same name only the first one is returned.

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story<span>nana</span></b><b>adfasdf<b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc,'lxml')
# print(soup.prettify())   # pretty-print the parsed document

Usage 1: traverse with the . operator

a = soup.html.body.a
print(a)
a1 = soup.a  # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(a1)  # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

Usage 2: get a tag's name (object.name)

a = soup.a  # soup.a is a Tag object
print(a)  # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(a.name)  # a

Usage 3: get a tag's attributes

a = soup.a.attrs
print(a)  # {'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}
print(a.get('href'))  # http://example.com/elsie
print(a.get('id'))  # link1

Usage 4: get a tag's content --- its text

p= soup.p.text  # text gathers the text of the tag and all of its descendants and joins it together
print(p)  # The Dormouse's storynanaadfasdf
p1 = soup.p.string  # string only returns text when the tag has no child tags; otherwise it is None
print(p1)  # None
p2 = soup.p.strings  # strings: a generator that yields the text of every descendant
print(p2)  # <generator object Tag._all_strings at 0x102b137d0>
print(list(p2))  # ["The Dormouse's story", 'nana', 'adfasdf']

Usage 5: nested selection --- after one . you can keep chaining further .

print(soup.head.title.text)  # The Dormouse's story

Usage 6: child nodes and descendant nodes (a .descendants sketch follows the two items below)

1.soup.p.contents  # a list of all of p's direct children
  print(soup.p.contents)  # [<b>The Dormouse's story<span>nana</span></b>, <b>adfasdf<b></b></b>]

2.soup.p.children  # an iterator over all of p's direct children
  print(list(soup.p.children))  # [<b>The Dormouse's story<span>nana</span></b>, <b>adfasdf<b></b></b>]
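The heading also mentions descendant nodes; the items above only cover direct children. A minimal sketch of .descendants, which bs4 provides for walking everything below a tag, not just the first level:

print(list(soup.p.descendants))  # every descendant of p, tags and text nodes alike, depth-first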

Usage 7: parent and ancestor nodes

print(soup.a.parent)  # the a tag's parent node: <p class="story">.....</p>
print(list(soup.a.parents))  # all ancestors of the a tag: its parent, the parent's parent, and so on

Usage 8: sibling nodes

print(soup.a.next_sibling)  # the next sibling
print(soup.a.previous_sibling)  # the previous sibling: "Once upon a time there were three little sisters; and their names were"


print(list(soup.a.next_siblings))  # all following siblings => generator; ['\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and\n', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';\nand they lived at the bottom of a well.']
print(list(soup.a.previous_siblings))  # all preceding siblings => generator; ['Once upon a time there were three little sisters; and their names were\n']

bs4: Searching the Document Tree

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b class ='baby'>The Dormouse's story<span>lqz</span></b><b>adfasdf<b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1" xx="xx">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3" name="zzz">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

The five filters

String, regular expression, list, True, and method (the method filter is demonstrated after the True example below)

String

Searching by string value


  a = soup.find(xx='xx')   # keyword arguments can be name (tag name), id, class_, href, text, or any other attribute
  a = soup.find(name='a')
  print(a)  # <a class="sister" href="http://example.com/elsie" id="link1" xx="xx">Elsie</a>
  print(soup.find_all(name='a',class_="sister"))  # [<a class="sister" href="http://example.com/elsie" id="link1" xx="xx">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3" name="zzz">Tillie</a>]
  print(soup.find_all(name='a',id = 'link2'))  # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
  print(soup.find(text='Elsie'))  # Elsie
  print(soup.find(text='Elsie').parent)  # <a class="sister" href="http://example.com/elsie" id="link1" xx="xx">Elsie</a>
  print(soup.find(href="http://example.com/elsie"))  # <a class="sister" href="http://example.com/elsie" id="link1" xx="xx">Elsie</a>
  print(soup.find(attrs={'class':'sister'}))  # attributes can also be passed via attrs
  print(soup.find(attrs={'name': 'zzz'}))  # attrs is needed here because the name keyword already means "tag name"

Regular expression

import re
a = soup.find_all(class_=re.compile('^s'))
print(a)

# Task: find every tag that carries an http link
a = soup.find_all(href=re.compile('^http'))
print(a)

List

a= soup.find_all(name=['b','span'])
print(a)  # [<b class="baby">The Dormouse's story<span>lqz</span></b>, <span>lqz</span>, <b>adfasdf<b></b></b>, <b></b>]

print(soup.find_all(class_=['sister','title'])) # [<p class="title"><b class="baby">The Dormouse's story<span>lqz</span></b><b>adfasdf<b></b></b></p>, <a class="sister" href="http://example.com/elsie" id="link1" xx="xx">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3" name="zzz">Tillie</a>]

True

a= soup.find_all(href = True)
print(a)  # [<a class="sister" href="http://example.com/elsie" id="link1" xx="xx">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3" name="zzz">Tillie</a>]

# Find all tags that have a class but no id
a = soup.find_all(class_=True,id=False)
print(a)
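Method

The fifth filter listed above is a method: pass find_all a function that takes a tag and returns True for the tags you want to keep. A minimal sketch against the same soup (this mirrors the example in the bs4 docs):

def has_class_but_no_id(tag):
    # keep tags that define class but not id
    return tag.has_attr('class') and not tag.has_attr('id')

print(soup.find_all(has_class_but_no_id))  # the <p> tags and <b class="baby">, but not the <a> tags (they all have ids)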

Other usage

1.find: internally just calls find_all; find takes the same parameters, but find_all additionally accepts limit
2.recursive=True: whether to search recursively; the default is True. With recursive=False only the first level below the tag is searched.
  a = soup.find_all(name='html',recursive=False)
  # print(a)
	1.Usually combined with traversing the tree:
    a=soup.html.p.find(name='b',recursive=False)
    print(a)  # <b class="baby">The Dormouse's story<span>lqz</span></b>


3.limit=None: caps how many results find_all returns
   a=soup.find_all(name='a',limit=1)
   print(a)  # [<a class="sister" href="http://example.com/elsie" id="link1" xx="xx">Elsie</a>]

CSS Selectors


tagname     # select by tag name
.classname  # select by class
#id         # select by id
div a       # a tags anywhere among div's descendants
div>a       # a tags that are direct children of div (examples of the #id, descendant, and child selectors follow the basic selectors below)
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b class ='baby'>The Dormouse's story<span>nana</span></b><b>adfasdf<b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1" xx="xx">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3" name="zzz">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc,'lxml')

Basic selectors

print(soup.select('p'))  # [<p class="title">....</p>]
print(soup.select('.sister'))  # [<a class="sister" href="http://example.com/elsie" id="link1" xx="xx">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3" name="zzz">Tillie</a>]
print(soup.find(name='p', class_='story').select('a'))  # select() can also be called on a tag found with find(); returns the three sister links
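A quick sketch of the id, descendant, and child selectors from the list above, against the same soup:

print(soup.select('#link1'))        # by id: [<a class="sister" href="http://example.com/elsie" id="link1" xx="xx">Elsie</a>]
print(len(soup.select('body a')))   # descendant selector: 3, the a tags sit anywhere under body
print(len(soup.select('body>a')))   # child selector: 0, the a tags are children of p, not of body
print(len(soup.select('p>a')))      # 3, the a tags are direct children of the story p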

CSS and XPath

1.Going forward, basically every parser supports both kinds of selectors, css and xpath, and both can be copied straight from the browser's developer tools (an lxml XPath sketch follows this example)
  import requests
  res=requests.get('http://it028.com/css-selectors.html')
  # res.encoding=res.apparent_encoding
  res.encoding='utf-8'
  # print(res.text)
  soup=BeautifulSoup(res.text,'lxml')
  res=soup.select('#content > table > tbody > tr:nth-child(14) > td:nth-child(3)')
  # //*[@id="content"]/table/tbody/tr[14]/td[3]
  print(res)
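BeautifulSoup itself does not execute XPath; the commented expression above is what the browser copies. A minimal sketch of evaluating it with lxml instead, re-fetching the page so the snippet stands alone (note the server's HTML may lack the tbody the browser inserts, in which case drop tbody/ from the expression):

from lxml import etree
import requests

resp = requests.get('http://it028.com/css-selectors.html')
resp.encoding = 'utf-8'
tree = etree.HTML(resp.text)  # lxml's own HTML parser
cells = tree.xpath('//*[@id="content"]/table/tbody/tr[14]/td[3]')
print([c.text for c in cells])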

Selenium Basics

1.requests only sends HTTP requests, so it cannot load content rendered by ajax
2.selenium: drives the browser directly; instead of sending HTTP requests itself, code simulates a person operating the browser, so js is loaded automatically
3.appium: drives a phone in the same way

4.Usage steps (which browser to drive: 1 Chrome (used here) 2 IE 3 Firefox)
	1.Download the Chrome driver (its version must match the browser)
		URL: https://registry.npmmirror.com/binary.html?path=chromedriver/
			Browser version: 114.0.5735.198 (pick the matching driver version)
  2.Put the executable in the project directory
  3.Write the code (see the note after this block for newer Selenium versions)
        # pip3.8 install selenium
        from selenium import webdriver
        import time
        from selenium.webdriver.common.by import By
        bro = webdriver.Chrome(executable_path='./chromedriver.exe')  # opens the browser using the driver placed in the project directory
        bro.get('https://www.baidu.com')
        time.sleep(1)
        bro.close()
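Note: executable_path was removed in newer Selenium releases (around 4.10). There the driver path goes through a Service object, or is omitted entirely and resolved by Selenium Manager; a rough sketch:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

bro = webdriver.Chrome(service=Service('./chromedriver.exe'))  # explicit driver path
# bro = webdriver.Chrome()  # or let Selenium Manager download a matching driver
bro.get('https://www.baidu.com')
bro.quit()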

Example

from selenium import webdriver
import time
from selenium.webdriver.common.by import By

bro = webdriver.Chrome()  # opens the browser; with no arguments the Chrome WebDriver is looked up on the default path
print(bro)

bro.get('https://www.baidu.com')
time.sleep(10)
# if a tag has an id, prefer locating it by id
input_name = bro.find_element(by=By.ID,value='kw')
# type text into the input tag
input_name.send_keys('郑秀晶')
button = bro.find_element(by=By.ID,value='su')
button.click()
time.sleep(3)
bro.close()

Simulating a Baidu Login

from selenium import webdriver
import time
from selenium.webdriver.common.by import By



bro = webdriver.Chrome()
bro.get('https://www.baidu.com')
bro.implicitly_wait(10)  # implicit wait ---> when locating a tag that is not there yet, wait up to 10s; as soon as it appears execution continues, and if it still is not there after 10s an error is raised
bro.maximize_window()

# 1. Find the login button
button = bro.find_element(By.ID,value='s-top-loginbtn')
# an a tag can also be located by its link text
# button = bro.find_element(By.LINK_TEXT, '登录')
button.click()



# Click "SMS login"
sms_login = bro.find_element(By.ID, 'TANGRAM__PSP_11__headerLoginTab')
sms_login.click()
time.sleep(1)

# 2. Click "account login"
username_login = bro.find_element(By.ID, 'TANGRAM__PSP_11__changePwdCodeItem')
username_login.click()
time.sleep(1)

input_user = bro.find_element(by = By.ID,value='TANGRAM__PSP_11__userName')
input_user.send_keys('18081245209')
input_pwd =  bro.find_element(by = By.ID,value='TANGRAM__PSP_11__password')
input_pwd.send_keys('123445')
login = bro.find_element(by = By.ID,value='TANGRAM__PSP_11__submit')
time.sleep(4)
login.click()
time.sleep(3)
bro.close()

Other Selenium Usage

Headless browser

# Headless browser (a pile of options)
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000')  # set the browser resolution
chrome_options.add_argument('--disable-gpu')  # Google's docs suggest this to work around a bug
chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages
chrome_options.add_argument('blink-settings=imagesEnabled=false')  # do not load images, speeds things up
chrome_options.add_argument('--headless')  # no visible window; on Linux without a display, startup fails without this

bro=webdriver.Chrome(options=chrome_options)
bro.get('https://www.douyin.com/')
print(bro.page_source)  # the current page source
time.sleep(3)
bro.close()

Searching for Tags

from selenium import webdriver
import time
from selenium.webdriver.common.by import By



bro = webdriver.Chrome()
bro.get('https://www.cnblogs.com/')
bro.implicitly_wait(10)


# bs4's find and find_all have a css counterpart in select()
# selenium's find_element and find_elements support both css and xpath (a concrete sketch follows this block)
bro.find_element(by=By.ID)  # by id, returns one
bro.find_element(by=By.NAME)  # by the name attribute, returns one
bro.find_elements(by=By.TAG_NAME,value='div')  # by tag name, returns all
bro.find_element(by=By.LINK_TEXT,value='xxx')  # by the exact text of an a tag
bro.find_element(by=By.PARTIAL_LINK_TEXT)  # by a partial match on an a tag's text
bro.find_element(by=By.CLASS_NAME)  # by class name
bro.find_element(by=By.CSS_SELECTOR)  # by css selector
bro.find_element(by=By.XPATH)  # by xpath
bro.close()
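A concrete sketch of the css and xpath styles on the cnblogs front page; the post-item class is an assumption about the current markup and may change:

from selenium import webdriver
from selenium.webdriver.common.by import By

bro = webdriver.Chrome()
bro.get('https://www.cnblogs.com/')
bro.implicitly_wait(10)
posts_css = bro.find_elements(By.CSS_SELECTOR, 'article.post-item')  # assumed class name on the post list
posts_xpath = bro.find_elements(By.XPATH, '//article[contains(@class, "post-item")]')  # same thing via xpath
print(len(posts_css), len(posts_xpath))
bro.close()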

Getting a tag's attributes, text, and size

print(tag.get_attribute('src'))  # used most often
tag.text  # the text content
# Get the tag's id, location, name, and size (less commonly used)
print(tag.id)  # not the id attribute; an internal id selenium assigns, rarely useful
print(tag.location)  # x,y coordinates
print(tag.tag_name)  # tag name
print(tag.size)  # size
from selenium import webdriver
import time
from selenium.webdriver.common.by import By

bro = webdriver.Chrome()  # opens the browser
bro.get('https://www.chaojiying.com/apiuser/login/')
bro.implicitly_wait(10)


tag = bro.find_element(By.CSS_SELECTOR,'body > div.wrapper_danye > div > div.content_login > div.login_form > form > div > img')

print(tag.get_attribute('src'))  # https://www.chaojiying.com/include/code/code.php?u=2
print(tag.id)  # 551A961BF3D80C47B79E9C9026F2D166_element_10
print(tag.location)  # {'x': 758, 'y': 291}
print(tag.tag_name)  # img
print(tag.size)  # {'height': 50, 'width': 180}

Homework

1. Save the captcha image to the local disk
https://www.chaojiying.com/apiuser/login/


from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from PIL import Image

bro = webdriver.Chrome()
bro.set_window_size(1200, 800)

bro.get('https://www.chaojiying.com/apiuser/login/')

bro.implicitly_wait(10)
bro.get_screenshot_as_file('CrawlResult/screenshot.png')
img = bro.find_element(by=By.CSS_SELECTOR,value='body > div.wrapper_danye > div > div.content_login > div.login_form > form > div > img')
img_location = img.location
img_size = img.size
print(img_location)  # {'x': 758, 'y': 291}
print(img_size)  # {'height': 50, 'width': 180}
left = int(img_location['x'])
top = int(img_location['y'])
right = int(img_location['x'] + img_size['width'])
bottom = int(img_location['y'] + img_size['height'])

# Crop the captcha out of the screenshot with PIL
im = Image.open('CrawlResult/screenshot.png')
im = im.crop((left, top, right, bottom))
im.save('CrawlResult/code.png')
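Two caveats worth knowing: on high-DPI screens the full-page screenshot can be larger than the window, so the crop coordinates may need scaling; and Selenium can also screenshot just the element, which avoids the manual cropping entirely. A minimal sketch of the latter (same page and selector as above; the output filename is just an example):

from selenium import webdriver
from selenium.webdriver.common.by import By

bro = webdriver.Chrome()
bro.get('https://www.chaojiying.com/apiuser/login/')
bro.implicitly_wait(10)
img = bro.find_element(by=By.CSS_SELECTOR, value='body > div.wrapper_danye > div > div.content_login > div.login_form > form > div > img')
img.screenshot('CrawlResult/code_direct.png')  # WebElement.screenshot writes a PNG of just this element
bro.close()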