一、腾讯招聘信息
这是我参与更文挑战的第4天,活动详情查看: 更文挑战
# 腾讯招聘
# 爬取title(职位名称)和demand(岗位职责)
import json
import requests
from pymysql import *
class tengxun(object):
    """Fetch one page of job postings as JSON, print it and save it.

    Each instance is bound to a single query URL. ``run`` downloads the JSON
    document, prints the title and responsibility text of every posting, and
    writes the raw document to ``腾讯招聘.json``.
    """

    def __init__(self, url, timeout=10):
        """Store the request settings.

        Args:
            url: Fully formed query URL that returns the JSON document.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the crawl indefinitely.
        """
        self.url = url
        self.timeout = timeout
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}

    def parse_url(self):
        """Download ``self.url`` and return the decoded JSON body.

        Raises:
            requests.RequestException: On connection failure, timeout, or a
                non-success HTTP status.
            ValueError: If the response body is not valid JSON.
        """
        response = requests.get(
            url=self.url, headers=self.headers, timeout=self.timeout)
        # Fail loudly on HTTP errors instead of trying to decode an error page.
        response.raise_for_status()
        return response.json()

    def save_json(self, parse_str):
        """Write the decoded document to ``腾讯招聘.json`` as pretty JSON.

        The file is opened in ``'w'`` mode, so every call replaces whatever
        a previous call wrote.
        """
        with open("腾讯招聘.json", 'w', encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(parse_str, f, ensure_ascii=False, indent=4)

    def content_str(self, parse_str):
        """Print the title and responsibility text of every posting.

        Args:
            parse_str: Decoded JSON document; postings are read from
                ``parse_str['Data']['Posts']``.

        A missing or null ``Data`` / ``Posts`` entry is treated as an empty
        page rather than an error (presumably what the API returns for a
        page past the end of the results -- verify against the API).
        Postings that lack either field are skipped.
        """
        posts = (parse_str.get('Data') or {}).get('Posts') or []
        for post in posts:
            title = post.get('RecruitPostName')
            demand = post.get('Responsibility')
            if title is None or demand is None:
                continue
            print(title, demand)
            # MySQL persistence is left disabled here. If it is enabled,
            # open one connection outside this loop and pass the values as
            # query parameters, e.g.
            #   cursor.execute('insert into tengxun values (%s, %s)',
            #                  (title, demand))
            # rather than formatting them into the SQL string.

    def run(self):
        """Download the page, print its postings, then save the raw JSON."""
        parse_str = self.parse_url()
        self.content_str(parse_str)
        self.save_json(parse_str)
if __name__ == '__main__':
    # Query template; only the page index varies between requests.
    url_template = (
        'https://careers.tencent.com/tencentcareer/api/post/Query'
        '?&countryId=&cityId=&bgIds=&productId=&categoryId='
        '&parentCategoryId=&attrId=&keyword='
        '&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
    )
    # Crawl pages 1 through 499, one spider instance per page.
    for page in range(1, 500):
        url = url_template.format(page)
        spider = tengxun(url)
        spider.run()