# 使用正则表达式提取智联招聘的职位名称
import re
# Sample job-title element copied from a Zhaopin search results page.
mystr = (
    '<span title="大数据分析工程师" class="jobTitle over-length '
    'listsimple__content__item__box__jobname__span '
    'listsimple__content__item__box__jobname__span__title">大数据分析工程师</span>'
)
# Raw strings keep "\W" a regex escape rather than an invalid string escape.
restr = r">\W+<"
# Capturing variant of the same pattern: yields the text without ">" and "<".
restr1 = r">(\W+)<"
# re.ASCII limits \w to [a-zA-Z0-9_], so \W also covers the Chinese title.
# With Python 3's default Unicode matching, CJK characters are word
# characters, \W never matches them, and findall() returns an empty list.
regex = re.compile(restr, re.IGNORECASE | re.ASCII)
mylist = regex.findall(mystr)
print(mylist)
# Guard the index so an input without any match does not raise IndexError.
if mylist:
    print(mylist[0])
# 使用selenium爬取智联招聘信息
import re
import selenium
import selenium.webdriver
import time
def getnumberbyname(searchname):
    """Return the span texts matched on the Zhaopin search page for a keyword.

    Opens the search URL for ``searchname`` in Chrome through Selenium, waits
    briefly, and runs a regular expression over the resulting page source.

    Args:
        searchname: Keyword appended to the ``kw`` query parameter of the URL.

    Returns:
        A list with the captured text of every ``">...</span>`` occurrence
        whose content consists only of non-ASCII-word characters (for
        example a title written entirely in Chinese). Empty when nothing
        matches.
    """
    url = "https://sou.zhaopin.com/?jl=530&kw=" + searchname + "&kt=3"
    driver = selenium.webdriver.Chrome(r"C:\Users\xxxx\Desktop\chromedriver.exe")
    try:
        driver.get(url)
        # Wait before reading the page: sleeping after the source has been
        # captured cannot change what the regex sees.
        time.sleep(2)
        pagesource = driver.page_source
    finally:
        # Always release the browser, even when loading the page fails.
        driver.quit()
    # re.ASCII makes \W match CJK characters; with Python 3's default Unicode
    # matching they are word characters and Chinese titles would never match.
    regex = re.compile(r'">(\W+)</span>', re.IGNORECASE | re.ASCII)
    return regex.findall(pagesource)
# Run the search for "python" and print each matched text on its own line.
titles = getnumberbyname("python")
for name in titles:
    print(name)
# 抓取拉勾网指定商圈的python职位数量
import selenium
import selenium.webdriver
import re
import time
def getnumberbyspace(space):
    """Return the job count shown on Lagou for one business area.

    Loads the Lagou "python 测试" listing URL (district=海淀区) with ``space``
    as the ``bizArea`` value in Chrome through Selenium, then extracts the
    number displayed as ``职位 ( <span>N</span> )`` from the page source.

    Args:
        space: Value placed in the ``bizArea`` query parameter of the URL.

    Returns:
        The first matched count as a string of digits, or ``None`` when the
        page source contains no such counter.
    """
    url = (
        "https://www.lagou.com/jobs/list_python%20测试/p-city_2"
        "?px=default&district=海淀区&bizArea=" + space + "#filterBox"
    )
    driver = selenium.webdriver.Chrome(r"C:\Users\xxxx\Desktop\chromedriver.exe")
    try:
        driver.get(url)
        # Give the page time to load before reading its source.
        time.sleep(5)
        pagesource = driver.page_source
    finally:
        # Always release the browser, even when loading the page fails.
        driver.quit()
    # Two notes on the pattern:
    # 1. The original advice was to prefix patterns containing Chinese with
    #    ``u``; on Python 3 every str is already Unicode, so a raw string
    #    is enough.
    # 2. Literal parentheses must be escaped as \( and \); the parentheses
    #    of the capturing group (\d+) stay unescaped. Without the escapes the
    #    outer pair becomes a second group, findall() returns tuples, and the
    #    literal "(" / ")" on the page is never matched.
    restr = r'职位 \( <span>(\d+)</span> \)'
    regex = re.compile(restr, re.IGNORECASE)
    mylist = regex.findall(pagesource)
    # Avoid an IndexError when the counter is missing from the page.
    return mylist[0] if mylist else None
# Business areas to query; each one is passed to getnumberbyspace in turn.
spacelist = [
    "马连洼",
    "海淀黄庄",
    "西二旗",
    "香山",
    "航天桥",
]
for space in spacelist:
    count = getnumberbyspace(space)
    print(space, count)