03爬虫实例-爬取实例-腾讯招聘信息

243 阅读1分钟

一、腾讯招聘信息

这是我参与更文挑战的第4天，活动详情查看：更文挑战

# 腾讯招聘
# 爬取title和demaed
import json
import requests
from pymysql import *


class tengxun(object):
    """Scraper for Tencent's public recruitment API.

    Fetches one page of job postings (JSON), prints each posting's title and
    responsibility text, and dumps the raw payload to a local JSON file.
    """

    def __init__(self, url):
        """Store the target URL and a browser-like User-Agent header.

        :param url: full API URL for one result page (see the caller's template).
        """
        self.url = url
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}

    def parse_url(self):
        """Fetch ``self.url`` and return the decoded JSON payload as a dict.

        :raises requests.HTTPError: on a non-2xx response.
        :raises requests.Timeout: if the server does not answer within 10s.
        """
        # timeout prevents the crawl loop from hanging forever on a dead
        # connection; raise_for_status surfaces HTTP errors instead of
        # letting .json() fail confusingly on an error page.
        response = requests.get(url=self.url, headers=self.headers, timeout=10)
        response.raise_for_status()
        return response.json()

    def save_json(self, parse_str):
        """Write the raw API payload to disk as pretty-printed UTF-8 JSON.

        NOTE(review): the filename is fixed, so each page overwrites the
        previous one — only the last fetched page survives. Confirm whether
        that is intended before changing it.
        """
        with open("腾讯招聘.json", 'w', encoding="utf-8") as f:
            json.dump(parse_str, f, ensure_ascii=False, indent=4)

    def content_str(self, parse_str):
        """Print ``title responsibility`` for every complete posting.

        Postings missing either field are skipped. (Persisting the rows to
        MySQL was removed as dead code; it also built SQL by string
        formatting, which is injection-prone — use parameterized queries if
        it is ever restored.)

        :param parse_str: decoded API payload; expects parse_str['Data']['Posts']
                          to be a list of dicts (or null — guarded below).
        """
        # 'Posts' is presumably null when a page is empty — guard so the
        # loop simply does nothing instead of raising TypeError.
        posts = parse_str['Data']['Posts'] or []
        for post in posts:
            title = post.get('RecruitPostName')
            demaed = post.get('Responsibility')
            if title is None or demaed is None:
                continue
            print(title, demaed)

    def run(self):
        """Fetch one page, print its postings, then archive the raw JSON."""
        parse_str = self.parse_url()
        self.content_str(parse_str)
        self.save_json(parse_str)


if __name__ == '__main__':
    # Crawl result pages 1..499; each iteration fetches, prints and archives
    # one page of postings.
    URL_TEMPLATE = 'https://careers.tencent.com/tencentcareer/api/post/Query?&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
    for page_index in range(1, 500):
        crawler = tengxun(URL_TEMPLATE.format(page_index))
        crawler.run()