Scraping Baidu Tieba
Before writing any code, sketch out the functional blocks you will need; while writing the code, name each functional module up front.
Initialization
Initialize the necessary parameters and complete the basic setup. We scrape the Baidu Tieba "lol" forum: the GET parameters in the URL must be supplied (they select the forum topic and the page number).
- Forum name
- Base URL
- Request headers
URL generation
Generate the route address of every page.
- Build the addresses of all pages with a list comprehension (see the sketch below)
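A minimal sketch of that comprehension, using the same base URL as the source listing below. Note that Tieba's pn parameter appears to be a post offset that advances by 50 per page, so the page index is multiplied by 50:

import requests  # not used here, but the URLs feed requests.get later

url_base = 'https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn={}'
# One URL per page; pn is assumed to advance 50 posts per page.
urls = [url_base.format(i * 50) for i in range(4)]
print(urls[1])  # https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=50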
Download
Send a GET request to each page's address and fetch the page.
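The download step itself is a single requests.get call; a minimal standalone sketch, using the same User-Agent header as the listings in this post:

import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}
# .text decodes the response body with the encoding requests detects
page_html = requests.get('https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=0', headers=headers).text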
Save
Save the results to files, writing each page's content to a correspondingly named file.
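Saving is plain file I/O; a minimal sketch, creating the ./download directory if it is missing (the original listings assume it already exists):

import os

def save_page(html, name, page_num):
    os.makedirs('./download', exist_ok=True)  # avoid FileNotFoundError on first run
    # Write bytes so the UTF-8 encoding is explicit and platform-independent.
    with open('./download/{}~page{}.html'.format(name, page_num), 'wb') as f:
        f.write(html.encode('utf-8'))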
Control flow
Wrap the crawl steps above in a run function so callers outside the class have a single entry point; multithreading can be added there later (a threaded sketch follows the source listing).
- Generate the route address of every page to crawl
- Iterate over the route addresses with a for loop
- Fetch each address, determine its page number, and save the result
Source code
import os

import requests


class TiebaSpider:
    def __init__(self, tieba_name_crawl):
        """
        Initialize the necessary parameters and complete the basic setup.
        Scrapes a Baidu Tieba forum (here: "lol"); the GET parameters in the
        URL select the forum topic and the page offset.
        """
        self.tieba_name = tieba_name_crawl
        self.url_base = 'https://tieba.baidu.com/f?kw=' + tieba_name_crawl + '&ie=utf-8&pn={}'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def make_url(self):
        """
        Generate the route address of every page.
        :return: list of page URLs (built with a list comprehension)
        """
        # Tieba's pn parameter is a post offset of 50 per page,
        # so page i corresponds to pn = i * 50.
        return [self.url_base.format(i * 50) for i in range(4)]

    def download_url(self, url_str):
        """
        Send a GET request to one page's address and fetch the page.
        :param url_str: the URL of one page
        :return: the fetched page content
        """
        result = requests.get(url_str, headers=self.headers)
        return result.text

    def save_result(self, result, page_num):
        """
        Save one page's result to a file named after the forum and page number.
        :param result: the fetched content of one page
        :param page_num: page number, used to name the file
        """
        os.makedirs('./download', exist_ok=True)  # make sure the target directory exists
        file_path = './download/{}~page{}.html'.format(self.tieba_name, page_num)
        with open(file_path, 'wb') as f:
            f.write(result.encode('utf-8'))

    def run(self):
        """
        Wrap the whole crawl in run() so external callers have a single entry
        point; multithreading can be added here later (see the sketch below).
        - generate the URL of every page to crawl
        - iterate over the URLs with a for loop
        - fetch each URL, derive its page number, and save the result
        """
        url_lists = self.make_url()
        # enumerate() gives the page number directly; the original used
        # url_lists.index(url_str), which rescans the list on every iteration.
        for page_num, url_str in enumerate(url_lists, start=1):
            result_str = self.download_url(url_str)
            self.save_result(result=result_str, page_num=page_num)


if __name__ == '__main__':
    tieba_spider = TiebaSpider('lol')
    tieba_spider.run()
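The run() loop above fetches pages one at a time. As a hedged sketch of the multithreading mentioned earlier (not part of the original code), the helper below parallelizes the same steps with the standard-library ThreadPoolExecutor; the pool size of 4 is an arbitrary choice:

from concurrent.futures import ThreadPoolExecutor

def run_threaded(spider, max_workers=4):
    """Fetch and save all pages concurrently; requests are I/O-bound, so threads help despite the GIL."""
    def fetch_and_save(args):
        page_num, url_str = args
        spider.save_result(result=spider.download_url(url_str), page_num=page_num)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # One (page_num, url) pair per task; list() forces completion and surfaces exceptions.
        list(pool.map(fetch_and_save, enumerate(spider.make_url(), start=1)))

# Usage: run_threaded(TiebaSpider('lol'))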
Scraping Qiushibaike
import os

import requests


class QiushiSpider:
    def __init__(self):
        """Initialize the necessary parameters and complete the basic setup."""
        # Page URLs look like https://www.qiushibaike.com/8hr/page/2/
        self.url_base = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def make_url(self):
        # Qiushibaike page numbers start at 1, so range(4) would request a
        # nonexistent page 0; generate pages 1 through 4 instead.
        return [self.url_base.format(i) for i in range(1, 5)]

    def download_url(self, url_str):
        result = requests.get(url_str, headers=self.headers)
        return result.text

    def save_result(self, result, page_num):
        os.makedirs('./download', exist_ok=True)
        with open('./download/qiushi{}.html'.format(page_num), 'wb') as f:
            f.write(result.encode('utf-8'))


if __name__ == '__main__':
    qiushi = QiushiSpider()
    for page_num, url in enumerate(qiushi.make_url(), start=1):
        qiushi.save_result(result=qiushi.download_url(url), page_num=page_num)
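The original download_url carried commented-out lines that parsed the jokes out of the page with lxml XPath instead of saving raw HTML. A hedged sketch of that idea follows; the //div[@class="content"]/span[1] path is taken from those comments and is an assumption about the site's markup, which may have changed since:

import lxml.html
import requests

def extract_jokes(url_str, headers):
    """Fetch one Qiushibaike page and return the visible text of each joke."""
    html = lxml.html.fromstring(requests.get(url_str, headers=headers).text)
    # The class name "content" is assumed from the commented-out code above.
    return html.xpath('//div[@class="content"]/span[1]/text()')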
Scraping site information
The BeautifulSoup approach
import requests
from bs4 import BeautifulSoup


class CountrySoup:
    def __init__(self, country_name):
        self.country_name = country_name
        self.url_base = 'http://example.webscraping.com/places/default/view/{}'.format(self.country_name)
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def download_url(self):
        result = requests.get(self.url_base, headers=self.headers)
        soup = BeautifulSoup(result.text, 'lxml')
        # Find the table row holding the country name, then its value cell.
        tr = soup.find(attrs={'id': 'places_country__row'})
        td = tr.find(attrs={'class': 'w2p_fw'})
        print(td.text)
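The same lookup can also be written with BeautifulSoup's CSS-selector API, which is often terser than nested find() calls; a minimal sketch, assuming the same places_country__row/w2p_fw markup as above:

from bs4 import BeautifulSoup

def country_name(page_html):
    """Extract the country name with a CSS selector instead of nested find()."""
    soup = BeautifulSoup(page_html, 'lxml')
    # '#id .class' mirrors the find(id=...).find(class_=...) pair above.
    td = soup.select_one('#places_country__row .w2p_fw')
    return td.text if td is not None else None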
The lxml approach
import lxml.html
import requests


class CountrySpider:
    def __init__(self, country_name):
        self.country_name = country_name
        self.url_base = 'http://example.webscraping.com/places/default/view/{}'.format(self.country_name)
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def download_url(self, url_str):
        result = requests.get(url_str, headers=self.headers)
        html = lxml.html.fromstring(result.text)
        # Each places_*__row table row holds one field; w2p_fw is the value cell.
        data_country = html.xpath('//tr[@id="places_country__row"]/td[@class="w2p_fw"]/text()')
        data_capital = html.xpath('//tr[@id="places_capital__row"]/td[@class="w2p_fw"]/text()')
        data_area = html.xpath('//tr[@id="places_area__row"]/td[@class="w2p_fw"]/text()')
        return ['Country: ' + data_country[0],
                'Capital: ' + data_capital[0],
                'Area: ' + data_area[0]]

    def save_result(self, result):
        # Open the file once and write one line per field.
        with open('./country.txt', 'wb') as f:
            for r in result:
                f.write((r + '\n').encode('utf-8'))

    def run(self):
        result = self.download_url(self.url_base)
        self.save_result(result)


if __name__ == '__main__':
    c = CountrySpider('Bolivia-27')
    c.run()
    s = CountrySoup('Bolivia-27')
    s.download_url()