>︿< 入不了门 重新开始 今天很慌张,感觉大家都开始做了
通过正则表达式获取文本信息
- re.findall:返回string中所有与pattern相匹配的全部字串,返回形式为数组
- <title>(.+?)</title>:用 (.+?) / (.*?) 非贪婪捕获组匹配标签之间的文本
- 参数re.DOTALL:选取多行信息
from urllib.request import urlopen
import re

# Fetch the tutorial page; urlopen("url") opens it,
# read().decode('utf-8') reads the raw bytes and decodes them into a str.
html = urlopen("https://morvanzhou.github.io/static/scraping/basic-structure.html").read().decode('utf-8')
print(html)

# re.findall returns a list of every substring matching the pattern.
# NOTE(review): the <title> tags had been stripped from these notes;
# the pattern must include them so only the title text is captured.
res = re.findall(r"<title>(.+?)</title>", html)
print("\nPage title is: ", res[0])

# Print the text between <p> and </p>.
# flags=re.DOTALL makes '.' also match newlines, so a paragraph that
# spans several lines is still captured.
res = re.findall(r"<p>(.*?)</p>", html, flags=re.DOTALL)
print("\nPage paragraph is: ", res[0])

# Capture the target of every href="..." attribute.
res = re.findall(r'href="(.*?)"', html)
print("\nAll links: ", res)
bs4
#### 简化匹配过程选取tag信息(代替正则表达式)
- pip install wheel
- pip install lxml
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Fetch and decode the page.
# Fixed: the original used curly quotes (decode(‘utf-8’)), which is a
# SyntaxError — the codec name must be a normal string literal.
html = urlopen("https://morvanzhou.github.io/static/scraping/basic-structure.html").read().decode('utf-8')
print(html)

# The features parameter selects the parser backend ('lxml' here).
soup = BeautifulSoup(html, features='lxml')
print(soup.h1)
print('\n', soup.p)

# soup.find_all('a') returns every <a> tag in the document.
# Fixed: the original assigned to all_herf (typo) but the comprehension
# below read all_href, which raised NameError.
all_href = soup.find_all('a')
print(all_href)

# Extract the href attribute from each tag; equivalent to:
#     for l in all_href:
#         print(l['href'])
all_href = [l['href'] for l in all_href]
使用tag:css的class(信息捆绑形式)
- Class:通过批量采取某一种类型的信息
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Fetch the demo page and decode it.
# The codec name passed to decode() must be quoted.
html = urlopen("https://textweb").read().decode('utf-8')

# The features parameter selects the parser backend.
soup = BeautifulSoup(html, features='lxml')

# A CSS class bundles elements of one kind, so we can grab them in bulk:
# find every <li class="month"> in the page.
# Fixed: the pasted notes had lost the loop-body indentation, which is an
# IndentationError in Python.
month = soup.find_all('li', {"class": "month"})
for m in month:
    # print(m) would include the tag markup (links etc.);
    # m.get_text() prints only the text content.
    print(m.get_text())

# Find the <li> items nested under <ul class="jan">.
jan = soup.find('ul', {"class": "jan"})
d_jan = jan.find_all('li')
for d in d_jan:
    print(d.get_text())