This post shares two scraping examples: the first grabs Sina Weibo's trending topics, the second grabs the Baidu weather forecast.
Case 1 - Scraping Sina Weibo trending topics
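The flow: open weibo.com in headless Chrome via Selenium, give the JavaScript-rendered page time to load, parse the hot-search side panel out of the page source with BeautifulSoup, and push the list as a Markdown message to a WeChat Work group bot through its webhook.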
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:Vaeditshen
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from time import sleep
import json
import requests

webhook = 'xxxx'  # WeChat Work group-bot webhook URL
url = 'https://weibo.com/'
path = '/Users/shenshunfeng/Desktop/爬虫/chromedriver'  # path to the ChromeDriver binary

chrome_option = Options()
chrome_option.add_argument('window-size=1920x3000')  # tall window so the side panel fully renders
chrome_option.add_argument('--headless')             # no visible browser window
chrome_option.add_argument('--disable-gpu')
chrome_option.add_argument('blink-settings=imagesEnabled=false')  # skip images to speed up loading
# Selenium 3 style; Selenium 4 passes the driver path via a Service object instead
driver = webdriver.Chrome(executable_path=path, options=chrome_option)
# driver.implicitly_wait(20)  # an implicit wait would also work
driver.get(url)
sleep(20)  # crude wait for the JavaScript-rendered page to finish loading
html = driver.page_source
# driver.get_screenshot_as_file("xinlang.jpg")  # uncomment to see what the headless browser rendered
driver.quit()  # quit() ends the session and closes all windows; a separate close() is redundant
soup = BeautifulSoup(html, "html.parser")
# the hot-search side panel; these class names are build-generated and may change
divs = soup.find_all('div', class_="wbpro-side-panel")
content = ''
texttmp = '[{}]({})'  # Markdown link template: [title](href)
j = 1
for i in divs:
    a = i.find('a', class_="ALink_default_2ibt1")
    t = i.find('div', class_="wbpro-textcut f12 cla").get_text()
    if a.has_attr("href"):
        # note: the hrefs are relative, so prepend https://weibo.com/ if the links must be clickable
        md = texttmp.format(str(t).strip(), str(a["href"]).lstrip("/"))
        content = content + str(j) + '.' + md + '\n'
        j = j + 1
print(content)
wx_text = {
    "msgtype": "markdown",
    "markdown": {
        "content": content
    }
}
# push the digest to the group bot
r = requests.post(webhook, data=json.dumps(wx_text))
print(r.text)  # a successful push returns errcode 0
The result looks like this:
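One obvious improvement: the fixed sleep(20) either wastes time or fires too early depending on network speed. A sketch of an explicit wait instead, reusing the side-panel class name from the script above (the 30-second timeout is an arbitrary choice):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver.get(url)
# block until at least one hot-search panel exists, up to 30 s,
# instead of always sleeping a fixed 20 s
WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.CLASS_NAME, "wbpro-side-panel"))
)
html = driver.page_source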
Case 2 - Scraping Baidu weather
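Same recipe as Case 1: render the Baidu weather page for Shenzhen in headless Chrome, pull tomorrow's forecast card and the life-index panel out of the HTML with BeautifulSoup, and push a Markdown summary to the group bot.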
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:Vaeditshen
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from time import sleep
import json
import requests

webhook = 'xxx'  # WeChat Work group-bot webhook URL
# Baidu weather page for Shenzhen, Guangdong
url = 'http://weathernew.pae.baidu.com/weathernew/pc?query=%E5%B9%BF%E4%B8%9C%E6%B7%B1%E5%9C%B3%E5%A4%A9%E6%B0%94&srcid=4982&city_name=%E6%B7%B1%E5%9C%B3&province_name=%E5%B9%BF%E4%B8%9C'
path = '/Users/shenshunfeng/Desktop/爬虫/chromedriver'  # path to the ChromeDriver binary

chrome_option = Options()
chrome_option.add_argument('window-size=1920x3000')
chrome_option.add_argument('--headless')
chrome_option.add_argument('--disable-gpu')
# chrome_option.add_argument('blink-settings=imagesEnabled=false')  # images stay enabled here
driver = webdriver.Chrome(executable_path=path, options=chrome_option)
driver.get(url)
sleep(8)  # crude wait for the JavaScript-rendered page to finish loading
html = driver.page_source
# driver.get_screenshot_as_file("tianqi.jpg")  # uncomment to debug the rendered page
driver.quit()
soup = BeautifulSoup(html, "html.parser")
day = soup.find_all('li', class_="weather-day")[2]  # the third forecast card, treated here as tomorrow
ps = day.find_all('p')
wek = ps[0].get_text(strip=True)           # day of week
tomorrow_day = ps[1].get_text(strip=True)  # date / temperature
wind_force = ps[3].get_text(strip=True)    # wind force
air_quality = ps[4].get_text(strip=True)   # air quality
print(wek, tomorrow_day, wind_force, air_quality)
zhishu = soup.find("div", class_="zhishu-box")  # the "life index" panel
tuijian = ""  # top row of each index item
tianqi = ""   # bottom row of each index item
for i in zhishu.find_all("div", class_='zhishu-item-top'):
    tuijian = tuijian + i.get_text(strip=True) + '|'
for i in zhishu.find_all("div", class_='zhishu-item-bottom'):
    tianqi = tianqi + i.get_text(strip=True) + '|'
zstitle = soup.find("div", class_="zhishu-title")
zstitleone = zstitle.find("span", class_='zhishu-titleone').get_text(strip=True)
zstitletwo = zstitle.find("span", class_='zhishu-titletwo').get_text(strip=True)
md = "{}\n > {},{}\nWeather: {}\nWind: {}\nAir quality: {}\n{}:\n{}".format(
    zstitletwo, wek, tomorrow_day, tianqi, wind_force, air_quality, zstitleone, tuijian)
print(md)
wx_text = {
    "msgtype": "markdown",
    "markdown": {
        "content": md
    }
}
# push to the WeChat Work group bot
r = requests.post(webhook, data=json.dumps(wx_text))
print(r.text)  # a successful push returns errcode 0
The result looks like this:
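The script only reads one forecast card (index 2 of find_all); the same selectors generalize to the whole list. A rough sketch under the same class-name and field-position assumptions, reusing soup from the script above:

# hypothetical extension: walk every forecast card instead of only index 2
for day in soup.find_all('li', class_="weather-day"):
    ps = day.find_all('p')
    if len(ps) < 5:
        continue  # skip cards missing the expected fields
    print(ps[0].get_text(strip=True),  # day of week
          ps[1].get_text(strip=True),  # date / temperature
          ps[3].get_text(strip=True),  # wind force
          ps[4].get_text(strip=True))  # air quality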