本文已参与「新人创作礼」活动,一起开启掘金创作之路。
python爬虫
通过前面所学的内容,我们做一个使用Python来获取网页的信息的实验
1、如何使python获取网页的所有html代码
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0
2、过滤图片地址
正则表达式:
是对字符串操作的一种逻辑公式,就是用事先定义好的一些特定的字符、及这些特定字符的组合,组成一个“规则字符串”,这个“规则字符串”用来表达对字符串的一种过滤逻辑。
例如:从字符串 "I say Good not food" 中过滤出 Good 和 food 两个单词
re模块的应用
单个字符匹配
. 点 匹配单个任意字符
re.findall(".ood","I say Good not food")
['Good', 'food']
[] 中括号里面的内容会被逐一单个匹配
re.findall("[Gf]ood","I say Good not food")
['Good', 'food']
\d 匹配单个数字
re.findall("\d","I am 40")
['4', '0']
re.findall("\d\d","I am 40")
['40']
\w 匹配[0-9]、[a-z]、[A-Z] 以及_ 中的任意一个字符
re.findall("\w","fg38! h")
['f', 'g', '3', '8', 'h']
\s 匹配空白字符 空格 tab键
re.findall("\s","f\tg38! h")
['\t', ' ']
匹配一组字符串
直接匹配
re.findall("Good","I say Good not food")
['Good']
分隔符的应用 匹配两个不同的字符串
re.findall("Good|food","I say Good not food")
['Good', 'food']
*号 匹配左邻字符出现0次或者多次
re.findall("go*gle","I like google not ggle goooogle and gogle")
['google', 'ggle', 'goooogle', 'gogle']
+号 左邻字符出现1次或多次
re.findall("go+gle","I like google not ggle goooogle and gogle")
['google', 'goooogle', 'gogle']
?号 左邻字符出现0次或1次
re.findall("go?gle","I like google not ggle goooogle and gogle")
['ggle', 'gogle']
{}号 定义左邻字符出现的次数
re.findall("go{2}gle","I like google not ggle goooogle and gogle")
['google']
re.findall("go{2,3}gle","I like google not ggle goooogle and gogle")
['google']
re.findall("go{2,10}gle","I like google not ggle goooogle and gogle")
['google', 'goooogle']
^ 匹配是否以某个字符串开头
re.findall("^I like","I like google not ggle goooogle and gogle")
['I like']
$ 匹配是否以某个字符串结尾
re.findall("gogle$","I like google not ggle goooogle and gogle")
['gogle']
() 分组保存,\数字 用于引用前面对应编号的分组(模式必须写成原始字符串 r"...",否则 "\1" 会被 Python 当作字符 \x01,导致匹配失败)
test = re.search(r"(allen)\1","my name is allenallen")
test.group()
'allenallen'
实例:
笔者在网上找了一个网站做实验,特将源码奉上
import urllib.request
import re
class GetHtml(object):
    """Download a page and pull gallery-page and image URLs out of its HTML.

    The instance is bound to one page URL (``url``) and one User-Agent
    string (``head``).  Every request made through the instance sends
    that User-Agent.
    """

    # Root that the site-relative "/item/..." links are appended to.  No
    # trailing slash: the matched links already start with "/".
    _SITE_ROOT = "https://www.meitulu.com"

    # Raw bytes literals keep the regex escapes (\w, \.) away from Python's
    # own string-escape processing; the dots are escaped so they only match
    # a literal ".".
    _PAGE_LINK_RE = re.compile(rb"/item/\w{5}_\w{1,2}\.html")
    # NOTE(review): "[^0]{1,2}" accepts any one or two bytes other than "0"
    # as the file name -- kept from the original; confirm the intent.
    _IMAGE_LINK_RE = re.compile(
        rb"https://mtl\.gzhuibei\.com/images/img/\w{5}/[^0]{1,2}\.jpg"
    )

    def __init__(self, URL, HEAD):
        """Store the page URL and the User-Agent header value."""
        self.url = URL
        self.head = HEAD

    def _fetch(self, url):
        """Return the raw body of *url*, requested with this instance's User-Agent.

        The request and response objects are still published as
        ``self.request`` / ``self.response`` (as before), but the response
        is closed once its body has been read.
        """
        self.request = urllib.request.Request(url)
        self.request.add_header("user-agent", self.head)
        with urllib.request.urlopen(self.request) as response:
            self.response = response
            return response.read()

    def get_index(self):
        """Return the raw bytes of the page at ``self.url``."""
        return self._fetch(self.url)

    def get_urllist(self):
        """Return absolute URLs (str) of the gallery pages linked from the page.

        The raw byte matches are kept in ``self.urllist`` and the decoded
        absolute URLs in ``self.strurllist``.
        """
        self.urllist = self._PAGE_LINK_RE.findall(self.get_index())
        self.strurllist = [
            self._SITE_ROOT + link.decode("utf8") for link in self.urllist
        ]
        return self.strurllist

    def get_list(self):
        """Return the image URLs (str) found in the page.

        The raw byte matches are kept in ``self.imglist`` (and printed),
        the decoded URLs in ``self.strimglist``.
        """
        self.imglist = self._IMAGE_LINK_RE.findall(self.get_index())
        self.strimglist = [link.decode("utf8") for link in self.imglist]
        print(self.imglist)
        return self.strimglist

    def get_image(self, x):
        """Download every image of the page into the current directory.

        Files are named ``"<x>-<n>.jpg"``; ``n`` starts at 2, which is the
        numbering the original implementation produced.
        """
        # The image URL is a local name so that self.url keeps pointing at
        # the page instead of ending up as the last image URL.
        for num, image_url in enumerate(self.get_list(), start=2):
            # Download first: a failed request must not leave an empty file.
            data = self._fetch(image_url)
            with open("{}-{}.jpg".format(x, num), "wb") as f:
                f.write(data)
# User-Agent header sent with every request.  Written as one clean value:
# the previous literal used a backslash continuation inside the string,
# which produced a leading space and dropped the space before "Gecko".
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) "
    "Gecko/20100101 Firefox/75.0"
)

# Start page; get_urllist() extracts the "/item/..." page links from it.
html = GetHtml("https://www.meitulu.com/item/16323.html", USER_AGENT)

# x numbers the linked pages from 1 and is used as the file-name prefix of
# the images saved from that page.
for x, i in enumerate(html.get_urllist(), start=1):
    # A separate object per linked page, so `html` is not rebound while its
    # own URL list is being iterated.
    GetHtml(i, USER_AGENT).get_image(x)
运行结果:
参考文献: