# bs4 的 find_all 配合 selenium
# 这个示例使用 bs4 的 find_all 配合 selenium 的方式进行职位数字的提取
import urllib
import selenium
import selenium.webdriver
import time
from bs4 import BeautifulSoup
def download(url):
    """Load a 51job search-results page in Chrome via Selenium and return
    the job-count text.

    :param url: 51job search-results URL.
    :return: stripped text of the ``div.rt`` element inside ``div.dw_tlc``.
    :raises IndexError: if the page has no ``div.dw_tlc`` banner.
    """
    # NOTE(review): passing the driver path positionally is deprecated in
    # Selenium 4 (use a Service object); kept for the pinned driver here.
    driver = selenium.webdriver.Chrome(r"C:\Users\zuoyikeji\Desktop\chromedriver.exe")
    try:
        driver.get(url)
        # Crude fixed wait for the page's JS to render; WebDriverWait with an
        # expected condition would be more robust.
        time.sleep(3)
        pagesource = driver.page_source
        soup = BeautifulSoup(pagesource, "lxml")
        text = soup.find_all("div", class_="dw_tlc")[0].find("div", class_="rt").get_text().strip()
    finally:
        # quit() tears down the whole browser session; close() only closes the
        # current window and would leak the driver process on an exception.
        driver.quit()
    return text
print(download("https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="))
# urllib2 配合 bs4 进行数据的提取
# 这个示例使用 bs4 的 select 配合 urllib2 的方式进行职位数字的提取
# 注意 select 没有这种形式:
#   select("a", class_="link")   # 这是 find_all 的写法; select 只接受 CSS 选择器字符串, 如 select("a.link")
import urllib2
from bs4 import BeautifulSoup
def download(url):
    """Fetch a 51job search-results page with urllib2 and return the
    job-count text.

    :param url: 51job search-results URL.
    :return: stripped text of the ``div.rt`` element inside ``div.dw_tlc``.
    :raises IndexError: if the page has no ``div.dw_tlc`` banner.
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        pagesource = response.read()
    finally:
        # Don't leak the socket if read() raises.
        response.close()
    soup = BeautifulSoup(pagesource, "lxml")
    # BUG FIX: "div .dw_tlc" (with a space) is a descendant selector — it matches
    # .dw_tlc elements *inside* some <div>, not the <div class="dw_tlc"> itself,
    # so select(...)[0] raised IndexError. The intended selectors are
    # "div.dw_tlc" and "div.rt" (no space), matching the commented alternative above.
    text = soup.select("div.dw_tlc")[0].select("div.rt")[0].get_text().strip()
    return text
print(download("https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="))