天眼查爬虫1

1,500 阅读45分钟

本文已参与「新人创作礼」活动,一起开启掘金创作之路。

最近一直在搞天眼查爬虫,谈谈心得吧。

1,打开天眼查页面:天眼查,随便输入一个公司:临汾晋临运货运有限公司。进入主页,要爬的东西不是太多,主要有以下几个模块:工商信息、开庭公告、法律诉讼、法院公告、被执行人、立案信息、行政处罚等。
2,首先写个基本的爬虫请求,先不加请求头试试看:

import requests

# Request the company page without any custom headers to see what an
# anonymous client receives.
company_url = "https://www.tianyancha.com/company/307542945"
raw_reply = requests.get(company_url)
# Decode the raw body bytes (str.decode defaults to UTF-8) and dump the text.
page_text = raw_reply.content.decode()
print(page_text)

结果返回一堆JS乱码

<html><script>
var arg1='D957D04B44DEC7580920C9FE399FBC36FB1382C8';
var _0x4818=['\x63\x73\x4b\x48\x77\x71\x4d\x49','\x5a\x73\x4b\x4a\x77\x72\x38\x56\x65\x41\x73\x79','\x55\x63\x4b\x69\x4e\x38\x4f\x2f\x77\x70\x6c\x77\x4d\x41\x3d\x3d','\x4a\x52\x38\x43\x54\x67\x3d\x3d','\x59\x73\x4f\x6e\x62\x53\x45\x51\x77\x37\x6f\x7a\x77\x71\x5a\x4b\x65\x73\x4b\x55\x77\x37\x6b\x77\x58\x38\x4f\x52\x49\x51\x3d\x3d','\x77\x37\x6f\x56\x53\x38\x4f\x53\x77\x6f\x50\x43\x6c\x33\x6a\x43\x68\x4d\x4b\x68\x77\x36\x48\x44\x6c\x73\x4b\x58\x77\x34\x73\x2f\x59\x73\x4f\x47','\x66\x77\x56\x6d\x49\x31\x41\x74\x77\x70\x6c\x61\x59\x38\x4f\x74\x77\x35\x63\x4e\x66\x53\x67\x70\x77\x36\x4d\x3d','\x4f\x63\x4f\x4e\x77\x72\x6a\x43\x71\x73\x4b\x78\x54\x47\x54\x43\x68\x73\x4f\x6a\x45\x57\x45\x38\x50\x63\x4f\x63\x4a\x38\x4b\x36','\x55\x38\x4b\x35\x4c\x63\x4f\x74\x77\x70\x56\x30\x45\x4d\x4f\x6b\x77\x34\x37\x44\x72\x4d\x4f\x58','\x48\x4d\x4f\x32\x77\x6f\x48\x43\x69\x4d\x4b\x39\x53\x6c\x58\x43\x6c\x63\x4f\x6f\x43\x31\x6b\x3d','\x61\x73\x4b\x49\x77\x71\x4d\x44\x64\x67\x4d\x75\x50\x73\x4f\x4b\x42\x4d\x4b\x63\x77\x72\x72\x43\x74\x6b\x4c\x44\x72\x4d\x4b\x42\x77\x36\x34\x64','\x77\x71\x49\x6d\x4d\x54\x30\x74\x77\x36\x52\x4e\x77\x35\x6b\x3d','\x44\x4d\x4b\x63\x55\x30\x4a\x6d\x55\x77\x55\x76','\x56\x6a\x48\x44\x6c\x4d\x4f\x48\x56\x63\x4f\x4e\x58\x33\x66\x44\x69\x63\x4b\x4a\x48\x51\x3d\x3d','\x77\x71\x68\x42\x48\x38\x4b\x6e\x77\x34\x54\x44\x68\x53\x44\x44\x67\x4d\x4f\x64\x77\x72\x6a\x43\x6e\x63\x4f\x57\x77\x70\x68\x68\x4e\x38\x4b\x43\x47\x63\x4b\x71\x77\x36\x64\x48\x41\x55\x35\x2b\x77\x72\x67\x32\x4a\x63\x4b\x61\x77\x34\x49\x45\x4a\x63\x4f\x63\x77\x72\x52\x4a\x77\x6f\x5a\x30\x77\x71\x46\x39\x59\x67\x41\x56','\x64\x7a\x64\x32\x77\x35\x62\x44\x6d\x33\x6a\x44\x70\x73\x4b\x33\x77\x70\x59\x3d','\x77\x34\x50\x44\x67\x63\x4b\x58\x77\x6f\x33\x43\x6b\x63\x4b\x4c\x77\x72\x35\x71\x77\x72\x59\x3d','\x77\x72\x4a\x4f\x54\x63\x4f\x51\x57\x4d\x4f\x67','\x77\x71\x54\x44\x76\x63\x4f\x6a\x77\x34\x34\x37\x77\x72\x34\x3d','\x77\x35\x58\x44\x71\x73\x4b\x68\x4d\x46\x31\x2f','\x77\x72\x41\x79\x48\x73\x4f\x66\x77\x
70\x70\x63','\x4a\x33\x64\x56\x50\x63\x4f\x78\x4c\x67\x3d\x3d','\x77\x72\x64\x48\x77\x37\x70\x39\x5a\x77\x3d\x3d','\x77\x34\x72\x44\x6f\x38\x4b\x6d\x4e\x45\x77\x3d','\x49\x4d\x4b\x41\x55\x6b\x42\x74','\x77\x36\x62\x44\x72\x63\x4b\x51\x77\x70\x56\x48\x77\x70\x4e\x51\x77\x71\x55\x3d','\x64\x38\x4f\x73\x57\x68\x41\x55\x77\x37\x59\x7a\x77\x72\x55\x3d','\x77\x71\x6e\x43\x6b\x73\x4f\x65\x65\x7a\x72\x44\x68\x77\x3d\x3d','\x55\x73\x4b\x6e\x49\x4d\x4b\x57\x56\x38\x4b\x2f','\x77\x34\x7a\x44\x6f\x63\x4b\x38\x4e\x55\x5a\x76','\x63\x38\x4f\x78\x5a\x68\x41\x4a\x77\x36\x73\x6b\x77\x71\x4a\x6a','\x50\x63\x4b\x49\x77\x34\x6e\x43\x6b\x6b\x56\x62','\x4b\x48\x67\x6f\x64\x4d\x4f\x32\x56\x51\x3d\x3d','\x77\x70\x73\x6d\x77\x71\x76\x44\x6e\x47\x46\x71','\x77\x71\x4c\x44\x74\x38\x4f\x6b\x77\x34\x63\x3d','\x77\x37\x77\x31\x77\x34\x50\x43\x70\x73\x4f\x34\x77\x71\x41\x3d','\x77\x71\x39\x46\x52\x73\x4f\x71\x57\x4d\x4f\x71','\x62\x79\x42\x68\x77\x37\x72\x44\x6d\x33\x34\x3d','\x4c\x48\x67\x2b\x53\x38\x4f\x74\x54\x77\x3d\x3d','\x77\x71\x68\x4f\x77\x37\x31\x35\x64\x73\x4f\x48','\x55\x38\x4f\x37\x56\x73\x4f\x30\x77\x71\x76\x44\x76\x63\x4b\x75\x4b\x73\x4f\x71\x58\x38\x4b\x72','\x59\x69\x74\x74\x77\x35\x44\x44\x6e\x57\x6e\x44\x72\x41\x3d\x3d','\x59\x4d\x4b\x49\x77\x71\x55\x55\x66\x67\x49\x6b','\x61\x42\x37\x44\x6c\x4d\x4f\x44\x54\x51\x3d\x3d','\x77\x70\x66\x44\x68\x38\x4f\x72\x77\x36\x6b\x6b','\x77\x37\x76\x43\x71\x4d\x4f\x72\x59\x38\x4b\x41\x56\x6b\x35\x4f\x77\x70\x6e\x43\x75\x38\x4f\x61\x58\x73\x4b\x5a\x50\x33\x44\x43\x6c\x63\x4b\x79\x77\x36\x48\x44\x72\x51\x3d\x3d','\x77\x6f\x77\x2b\x77\x36\x76\x44\x6d\x48\x70\x73\x77\x37\x52\x74\x77\x6f\x39\x38\x4c\x43\x37\x43\x69\x47\x37\x43\x6b\x73\x4f\x52\x54\x38\x4b\x6c\x57\x38\x4f\x35\x77\x72\x33\x44\x69\x38\x4f\x54\x48\x73\x4f\x44\x65\x48\x6a\x44\x6d\x63\x4b\x6c\x4a\x73\x4b\x71\x56\x41\x3d\x3d','\x4e\x77\x56\x2b','\x77\x37\x48\x44\x72\x63\x4b\x74\x77\x70\x4a\x61\x77\x70\x5a\x62','\x77\x70\x51\x73\x77\x71\x76\x44\x69\x48\x70\x75\x77\x36\x49\x3d','\x59\x4d\x4b
\x55\x77\x71\x4d\x4a\x5a\x51\x3d\x3d','\x4b\x48\x31\x56\x4b\x63\x4f\x71\x4b\x73\x4b\x31','\x66\x51\x35\x73\x46\x55\x6b\x6b\x77\x70\x49\x3d','\x77\x72\x76\x43\x72\x63\x4f\x42\x52\x38\x4b\x6b','\x4d\x33\x77\x30\x66\x51\x3d\x3d','\x77\x36\x78\x58\x77\x71\x50\x44\x76\x4d\x4f\x46\x77\x6f\x35\x64'];(function(_0x4c97f0,_0x1742fd){var _0x4db1c=function(_0x48181e){while(--_0x48181e){_0x4c97f0['\x70\x75\x73\x68'](_0x4c97f0['\x73\x68\x69\x66\x74']());}};var _0x3cd6c6=function(){var _0xb8360b=


function setCookie(name,value){var expiredate=new Date();expiredate.setTime(expiredate.getTime()+(3600*1000));document.cookie=name+"="+value+";expires="+expiredate.toGMTString()+";max-age=3600;path=/";}
function reload(x) {setCookie("acw_sc__v2", x);document.location.reload();}
</script></html>


3,还是加上请求头吧(记得要在网页中提前登录,然后拿到登录信息Cookie)

import requests

# Same page as before, this time sent with a browser User-Agent and the Cookie
# header copied from a browser session that is already logged in.
company_url = "https://www.tianyancha.com/company/307542945"
browser_headers = {}
browser_headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"
browser_headers["Cookie"] = "aliyungf_tc=282ebba707a03b502b075b0cdd1393a6f2720a797a955a717160cc4bbdb89fbc; csrfToken=WuVruO4GsAFIc9z5DdQxtlAR; TYCID=a8a5bdd05db911ec8061e5e64f8765b8; ssuid=4585560966; bannerFlag=true; creditGuide=1; _ga=GA1.2.1523420552.1639581264; tyc-user-phone=%255B%252215838072824%2522%255D; jsid=https%3A%2F%2Fwww.tianyancha.com%2F%3Fjsid%3DSEM-BAIDU-PZ-SY-2021112-JRGW; searchSessionId=1639670771.66783963; relatedHumanSearchGraphId=3149325530; relatedHumanSearchGraphId.sig=HJivDEDhp8niuTtrieCF_t56fZHwONjuSprSp-AkZ88; _gid=GA1.2.849271308.1639890728; _bl_uid=1sknjxCmczFsFIom9l9jr1w8w0Xk; RTYCID=3888f8318b3e437c8327a699fa3a611f; CT_TYCID=1c090123cbab4efd8d0462dd297a2be2; bdHomeCount=1; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1639581261,1639890723,1639894039; tyc-user-info-save-time=1639897221599; tyc-user-info={%22mobile%22:%2215838072824%22%2C%22state%22:%220%22%2C%22vipManager%22:%220%22}; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTgzODA3MjgyNCIsImlhdCI6MTYzOTg5NzIyMCwiZXhwIjoxNjcxNDMzMjIwfQ.L5BmUloturqBaXRmetCI_szFbDikF6vXn8EPT-G8F8YeblfdTY6LD7yduZPYDY8QSBeQDRROLVutwO1UxCZfcw; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1639901361; cloud_token=af0670e6ed824e6eb06daff2a7eb1b1f; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2215838072824%22%2C%22first_id%22%3A%2217dbea89958208-0911ada0d7e41f-978183a-1382400-17dbea89959ced%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22%24device_id%22%3A%2217dbea89958208-0911ada0d7e41f-978183a-1382400-17dbea89959ced%22%7D; acw_tc=76b20f8c16399031969332159e4f50909777140122d856e2b56d883f435e44; acw_sc__v2=61bef0c59afd6919ffcbba56aa8833b29f08ee10"
# Fetch, decode the body bytes, and print the resulting page text.
page_text = requests.get(company_url, headers=browser_headers).content.decode()
print(page_text)

结果是可以顺利拿到整个网页的数据。

4,下一步,要爬取的数据是工商信息、开庭公告等,工商信息就在网页里面,找到对应的元素标签直接提取即可,但是像开庭公告和法律诉讼等模块的信息,会出现分页的情况,如下:

在这里插入图片描述

我的方法是右键“检查”,打开Network,以法律诉讼为例,点击Fetch/XHR,刷新页面,找到法律诉讼相关的源数据地址:

在这里插入图片描述

我这边是一个一个找的,找到后双击该链接,就可以看到源数据了:

在这里插入图片描述

注意整个url网址,我分别把法律诉讼、开庭公告、立案信息、行政处罚等模块数据的网址都找了出来,并总结了一下,有用的url是这个:

https://www.tianyancha.com/pagination/lawsuit.xhtml?TABLE_DIM_NAME=manageDangerous&ps=10&pn={}&id={}

使用时需要把链接中的模块名(此处为lawsuit)替换成目标模块,并把最后面pn=和id=的值填上。

  1. pn是页数,从1开始的,后面写代码的时候可以加个for循环,遍历页数即可。
  2. id的值是公司id,比如我们要爬的这个公司id就是307542945,这个id是你进入公司页面后在url地址中看到的。
  3. 还有一个需要修改的就是模块名,法律诉讼是:lawsuit,如上链接;开庭公告是:announcementcourt;法院公告是:court;被执行人是zhixing;立案信息是:courtRegister;行政处罚是mergePunishCount;只要将对应的模块信息英文名替代链接中的法律诉讼lawsuit即可。

例如:该公司法律诉讼的第2页:
https://www.tianyancha.com/pagination/lawsuit.xhtml?TABLE_DIM_NAME=manageDangerous&ps=10&pn=2&id=307542945

所以,这个爬虫的主线任务分为两个:

一是通过请求公司的网址提取出工商信息

二是针对开庭公告、法律诉讼等模块信息,直接请求源数据地址。

代码如下:

import requests
from lxml import etree
import time
import sys
import random
import os

class TianYan:
    """Crawl one Tianyancha company and print the extracted records to a file.

    The company page itself supplies the basic registration information; the
    other six sections (court-session announcements, lawsuits, court notices,
    enforcement records, case filings, administrative penalties) are fetched
    from the site's paginated ``/pagination/<module>.xhtml`` endpoints.

    Every record is written as one line of ``label:value`` pairs separated by
    the ``\\x01`` control character.
    """

    # Seconds to wait on any single HTTP request before giving up; without a
    # timeout a stalled connection would block the crawl indefinitely.
    REQUEST_TIMEOUT = 15
    # Upper bound on the pages requested per paginated section.
    MAX_PAGES = 10
    # Text present in a pagination response that carries no records.
    NO_DATA_MARK = "抱歉,没有找到相关信息,请更换关键词重试"
    # Placeholders: module name, page number (1-based), company id.
    PAGE_URL = 'https://www.tianyancha.com/pagination/{}.xhtml?TABLE_DIM_NAME=manageDangerous&ps=10&pn={}&id={}'

    def __init__(self,company_id,fp,cookie):
        """Store the crawl target and prepare the request headers.

        Args:
            company_id: Company id as it appears in the company page URL.
            fp: Writable text file object that receives every output line.
            cookie: Cookie header value copied from a logged-in browser
                session; it must be refreshed when that session logs out.
        """
        self.fp = fp
        self.company_id = company_id
        self.url = "https://www.tianyancha.com/company/{}".format(company_id)

        # Pool of User-Agent strings; one is picked at random per request.
        self.User_agents = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
            "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        ]
        self.cookie = cookie
        # Headers for the company page request (the User-Agent is chosen once here).
        self.headers = {
            'User-Agent': random.choice(self.User_agents),
            'Cookie':self.cookie,
            'Referer':"https://www.tianyancha.com/login?from=https%3A%2F%2Fwww.tianyancha.com%2Fsearch%3Fkey%3D%25E9%2583%2591%25E5%25B7%259E%25E6%2583%25A0%25E5%25B7%259E%25E6%25B1%25BD%25E8%25BD%25A6%25E8%25BF%2590%25E8%25BE%2593%25E6%259C%2589%25E9%2599%2590%25E5%2585%25AC%25E5%258F%25B8",
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': 'www.tianyancha.com',
            'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
            'Upgrade-Insecure-Requests': '1'
            }

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _format_record(pairs):
        """Render (label, value) pairs as ``label:value`` joined by ``\\x01``."""
        return "\x01".join("{}:{}".format(label, value) for label, value in pairs)

    @staticmethod
    def _first(node, path):
        """Return the first XPath match under ``node``; IndexError if none."""
        return node.xpath(path)[0]

    @staticmethod
    def _joined_text(nodes, sep):
        """Join the full text content of each node with ``sep``."""
        return sep.join(node.xpath("string(.)") for node in nodes)

    def _page_url(self, module, pg_num):
        """Build the pagination URL of ``module`` for this company and page."""
        return self.PAGE_URL.format(module, pg_num, self.company_id)

    def _page_headers(self):
        """Build pagination request headers with a freshly picked User-Agent."""
        return {
            'User-Agent': random.choice(self.User_agents),
            'Cookie': self.cookie,
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': 'www.tianyancha.com',
            'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
        }

    def _fetch_rows(self, module, pg_num):
        """Fetch one pagination page and return its ``<tr>`` elements.

        Returns an empty list when the site reports no data or when the body
        cannot be parsed into a document, so callers can simply stop paging.
        Network errors (including timeouts) propagate to the caller.
        """
        response = requests.get(
            url=self._page_url(module, pg_num),
            headers=self._page_headers(),
            allow_redirects=False,  # redirects are deliberately not followed
            timeout=self.REQUEST_TIMEOUT,
        ).content.decode()
        if self.NO_DATA_MARK in response:
            return []
        tree = etree.HTML(response)
        if tree is None:
            # The parser produced no document (for example, a body without
            # markup). Report it and stop instead of failing on None.xpath.
            print("公司{}的{}第{}页返回内容无法解析,停止翻页".format(self.company_id, module, pg_num))
            return []
        return tree.xpath('//tbody/tr')

    def _crawl_module(self, title, module, parse_row, error_template):
        """Write ``title`` and then every record of a paginated section.

        Args:
            title: Section heading printed to the output file first.
            module: Module name used in the pagination URL.
            parse_row: Callable turning a row's ``<td>`` list into one line.
            error_template: Message with two ``{}`` slots (company id, page)
                used when a row cannot be parsed.

        Raises:
            Exception: When a row fails to parse; the original error is
                chained as the cause. Rows of that page are not written.
        """
        print(title, file=self.fp)
        for pg_num in range(1, self.MAX_PAGES + 1):
            rows = self._fetch_rows(module, pg_num)
            if not rows:
                break
            # Collect the whole page first so a failing row leaves no partial page.
            records = []
            for tr in rows:
                try:
                    records.append(parse_row(tr.xpath("td")))
                except Exception as e:
                    message = error_template.format(self.company_id, pg_num)
                    print(message, e)
                    raise Exception(message) from e
            for record in records:
                print(record, file=self.fp)

    # ------------------------------------------------------------------
    # Company page
    # ------------------------------------------------------------------

    def get_html(self):
        """Download the company page and return its body decoded as text."""
        response = requests.get(self.url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        return response.content.decode()

    def get_start_crawl(self):
        """Extract the basic registration information and write it as one line.

        Raises:
            Exception: When the page cannot be downloaded or any expected
                field is missing; the original error is chained as the cause.
        """
        try:
            response = self.get_html()
        except Exception as e:
            message = "公司{}网页读取失败,可能是ip或者登录的Cookie问题".format(self.company_id)
            print(message)
            raise Exception(message) from e
        if "快捷登录与短信登录" in response:
            # Only a warning: parsing is still attempted on the returned page.
            print("爬取基本信息失败-需要登录 company_id:{}".format(self.company_id))
        tree_html = etree.HTML(response)
        first = self._first
        try:
            tr_list = tree_html.xpath('//*[@id="_container_baseInfo"]/table/tbody/tr')
            company_name = first(tree_html, "//div[@class='container company-header-block ']/div[3]/div[@class='content']/div[@class='header']/span/span/h1/text()")
            # NOTE(review): every cell is required; a single missing value
            # (e.g. no former name) aborts the whole extraction.
            head_content = self._format_record([
                ("法定代表人", first(tr_list[0], "td[2]//div[@class='humancompany']/div[@class='name']/a/text()")),
                ("公司名", company_name),
                ("经营状态", first(tr_list[0], "td[4]/text()")),
                ("成立日期", first(tr_list[1], "td[2]/text()")),
                ("注册资本", first(tr_list[2], "td[2]/div/text()")),
                ("实缴资本", first(tr_list[3], "td[2]/text()")),
                ("工商注册号", first(tr_list[3], "td[4]/text()")),
                ("统一信用代码", first(tr_list[4], "td[2]/span/span/text()")),
                ("纳税人识别号", first(tr_list[4], "td[4]/span/span/text()")),
                ("组织机构代码", first(tr_list[4], "td[6]/span/span/text()")),
                ("营业期限", first(tr_list[5], 'td[2]/span/text()').replace(' ', '')),
                ("纳税人资质", first(tr_list[5], 'td[4]/text()')),
                ("核准日期", first(tr_list[5], 'td[6]/text()')),
                ("企业类型", first(tr_list[6], 'td[2]/text()')),
                ("行业", first(tr_list[6], 'td[4]/text()')),
                ("人员规模", first(tr_list[6], 'td[6]/text()')),
                ("参保人数", first(tr_list[7], 'td[2]/text()')),
                ("登记机关", first(tr_list[7], 'td[4]/text()')),
                ("曾用名", first(tr_list[8], "td[2]//span[@class='copy-info-box']/span/text()")),
                ("注册地址", first(tr_list[9], 'td[2]/span/span/span/text()')),
                ("经营范围", first(tr_list[10], 'td[2]/span/text()')),
            ])
        except Exception as e:
            message = "公司{}的头部基本信息提取失败".format(self.company_id)
            print(message)
            raise Exception(message) from e
        print(head_content, file=self.fp)

    # ------------------------------------------------------------------
    # Row parsers: each receives the <td> list of one table row
    # ------------------------------------------------------------------

    def _parse_kaiting_row(self, tds):
        """Format one court-session announcement row."""
        return self._format_record([
            ("序号", self._first(tds[0], "text()")),
            ("开庭日期", self._first(tds[1], "text()")),
            ("案号", self._first(tds[2], "span/text()")),
            ("案由", self._first(tds[3], "span/text()")),
            ("案件身份", self._joined_text(tds[4].xpath("div"), " ")),
            ("审理法院", self._first(tds[5], "span/text()")),
        ])

    def _parse_lawsuit_row(self, tds):
        """Format one lawsuit row; whitespace is stripped from the verdict."""
        result = self._first(tds[4], "div/div/text()")
        result = result.replace('\n', '').replace(' ', '').replace('\r', '')
        return self._format_record([
            ("序号", self._first(tds[0], "text()")),
            ("案件名称", self._first(tds[1], "text()")),
            ("案由", self._first(tds[2], "span/text()")),
            ("在本案中身份", self._joined_text(tds[3].xpath("div/div/div/span"), "")),
            ("裁判结果", result),
            ("案件金额", self._first(tds[5], "span/text()")),
        ])

    def _parse_gonggao_row(self, tds):
        """Format one court notice row."""
        return self._format_record([
            ("序号", self._first(tds[0], "text()")),
            ("刊登日期", self._first(tds[1], "text()")),
            ("案号", self._first(tds[2], "text()")),
            ("案由", self._first(tds[3], "text()")),
            # NOTE(review): parties are joined with the field separator itself,
            # kept as-is so the output format does not change.
            ("案件身份", self._joined_text(tds[4].xpath("div"), "\x01")),
            ("公告类型", self._first(tds[5], "text()")),
            ("法院", self._first(tds[6], "text()")),
        ])

    def _parse_zhixing_row(self, tds):
        """Format one enforcement (person subject to enforcement) row."""
        return self._format_record([
            ("序号", self._first(tds[0], "text()")),
            ("立案日期", self._first(tds[1], "text()")),
            ("案号", self._first(tds[2], "text()")),
            ("执行标的", self._first(tds[3], "text()")),
            ("执行法院", self._first(tds[4], "text()")),
        ])

    def _parse_lian_row(self, tds):
        """Format one case-filing row."""
        return self._format_record([
            ("序号", self._first(tds[0], "text()")),
            ("立案日期", self._first(tds[1], "text()")),
            ("案号", self._first(tds[2], "text()")),
            # NOTE(review): joined with the field separator, kept as-is.
            ("案件身份", self._joined_text(tds[3].xpath("div"), "\x01")),
            ("法院", self._first(tds[4], "text()")),
        ])

    def _parse_xingzheng_row(self, tds):
        """Format one administrative penalty row."""
        return self._format_record([
            ("序号", self._first(tds[0], "text()")),
            ("处罚日期", self._first(tds[1], "text()")),
            ("决定文书号", self._first(tds[2], "div/text()")),
            ("处罚事由", self._first(tds[3], "div/div/text()")),
            ("处罚结果", self._first(tds[4], "div/div/text()")),
            ("处罚单位", self._first(tds[5], "text()")),
            ("数据来源", self._first(tds[6], "span/text()")),
        ])

    # ------------------------------------------------------------------
    # Paginated sections
    # ------------------------------------------------------------------

    def kaiting(self):
        """Crawl the court-session announcements section."""
        self._crawl_module("开庭公告", "announcementcourt", self._parse_kaiting_row,
                           "公司{}此条开庭公告信息无法解析。第{}页")

    def lawsuitwhm(self):
        """Crawl the lawsuits section."""
        self._crawl_module("法律诉讼", "lawsuit", self._parse_lawsuit_row,
                           "公司{}此条法律诉讼信息未能解析。第{}页")

    def fayuangonggao(self):
        """Crawl the court notices section."""
        self._crawl_module("法院公告", "court", self._parse_gonggao_row,
                           "公司{}此条法院公告信息无法解析。第{}页")

    def beizhixing(self):
        """Crawl the enforcement records section."""
        # The failure message previously named the court-notice section.
        self._crawl_module("被执行人", "zhixing", self._parse_zhixing_row,
                           "公司{}此条被执行人信息无法解析。第{}页")

    def lian_message(self):
        """Crawl the case-filing section."""
        self._crawl_module("立案信息", "courtRegister", self._parse_lian_row,
                           "公司{}此条立案信息无法解析。第{}页")

    def xingzheng(self):
        """Crawl the administrative penalties section."""
        self._crawl_module("行政处罚", "mergePunishCount", self._parse_xingzheng_row,
                           "公司{}此条行政处罚无法解析。第{}页")

    def body_run(self):
        """Run the full crawl: basic information, then the six sections in order."""
        self.get_start_crawl()
        self.kaiting()
        self.lawsuitwhm()
        self.fayuangonggao()
        self.beizhixing()
        self.lian_message()
        self.xingzheng()



if __name__ == '__main__':
    # Sample company IDs to crawl.
    company_list = ["500674557", "844565574", "2319114574", "2317302446", "789235759", "2964355333"]
    # Cookie header value copied from a logged-in browser session.
    # NOTE(review): this is a session credential hard-coded in source; consider
    # loading it from the environment or a local config file instead.
    cookie = "aliyungf_tc=282ebba707a03b502b075b0cdd1393a6f2720a797a955a717160cc4bbdb89fbc; csrfToken=WuVruO4GsAFIc9z5DdQxtlAR; TYCID=a8a5bdd05db911ec8061e5e64f8765b8; ssuid=4585560966; bannerFlag=true; creditGuide=1; _ga=GA1.2.1523420552.1639581264; tyc-user-phone=%255B%252215838072824%2522%255D; jsid=https%3A%2F%2Fwww.tianyancha.com%2F%3Fjsid%3DSEM-BAIDU-PZ-SY-2021112-JRGW; searchSessionId=1639670771.66783963; relatedHumanSearchGraphId=3149325530; relatedHumanSearchGraphId.sig=HJivDEDhp8niuTtrieCF_t56fZHwONjuSprSp-AkZ88; _gid=GA1.2.849271308.1639890728; _bl_uid=1sknjxCmczFsFIom9l9jr1w8w0Xk; RTYCID=3888f8318b3e437c8327a699fa3a611f; CT_TYCID=1c090123cbab4efd8d0462dd297a2be2; bdHomeCount=1; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1639581261,1639890723,1639894039; tyc-user-info={%22mobile%22:%2215838072824%22%2C%22state%22:%220%22%2C%22vipManager%22:%220%22}; tyc-user-info-save-time=1639897221599; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTgzODA3MjgyNCIsImlhdCI6MTYzOTg5NzIyMCwiZXhwIjoxNjcxNDMzMjIwfQ.L5BmUloturqBaXRmetCI_szFbDikF6vXn8EPT-G8F8YeblfdTY6LD7yduZPYDY8QSBeQDRROLVutwO1UxCZfcw; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2215838072824%22%2C%22first_id%22%3A%2217dbea89958208-0911ada0d7e41f-978183a-1382400-17dbea89959ced%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%2217dbea89958208-0911ada0d7e41f-978183a-1382400-17dbea89959ced%22%7D; acw_tc=781bad3916399078174934253e1d04c8ef43b5f83c2a747f3f57b0a14ee334; acw_sc__v2=61bf01fd5f03ac75c418beb88fd652d293148018; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1639907848; cloud_token=81f33c027448458e802b45199925632b; cloud_utm=cd6a257eda1e472ca0551876cdfd8f95; _gat_gtag_UA_123487620_1=1"

    for company_id in company_list:
        path = "TY_{}.txt".format(company_id)
        # Mode "w" creates the file or empties an existing one, so a single
        # handle replaces the former truncate-then-append pair. The context
        # manager flushes and closes it even when the crawl raises.
        with open(path, "w", encoding="utf-8") as fp:
            ty = TianYan(company_id, fp, cookie)
            ty.body_run()
        print("{} successful! ".format(company_id))


上面代码是将各个公司爬取的信息分别存储在了txt文件中

但是!!!有漏洞,如果爬取十几个二十几个公司还好,一旦爬取成千上万个公司,就会有各种各样问题,比如:ip被封、图片点击验证、返回结果是JS乱码、登录失效等等问题

所以,我又写了一套代码:主要是用代理IP解决ip被封问题,并限制爬取频率,请求间隔在1~3秒内随机取值(如果爬取的频率是固定的话,也会被识别出来的)

Tianyan_body.py

import requests
from lxml import etree
import time
import sys
import random
import os

class TianYan:
    """Crawl one Tianyancha company and write its records to an open text file.

    ``body_run`` fetches the company page for the basic registration data and
    then walks six paginated "risk" sections (court sessions, lawsuits, court
    announcements, enforcement, case filings, administrative penalties).
    Every record is written to ``fp`` as one line of ``label:value`` pairs
    separated by ``\\x01``; each section is preceded by a line holding its
    Chinese title.

    Attributes:
        fp: writable text file object that receives the output lines.
        company_id: Tianyancha company id used to build every URL.
        ip_proxies: value passed unchanged as ``proxies=`` to ``requests.get``.
        url: URL of the company detail page.
        User_agents: pool of User-Agent strings; one is picked at random.
        cookie: raw ``Cookie`` header value sent with every request.
        headers: headers used for the company detail page request.
        timeout: value passed unchanged as ``timeout=`` to ``requests.get``.
        max_pages: maximum number of pages requested per paginated section.
    """

    # Text contained in a pagination response when the section has no (more) data.
    _NO_RESULT_MARK = "抱歉,没有找到相关信息,请更换关键词重试"

    # Pagination endpoint; filled with (endpoint name, page number, company id).
    _PAGE_URL = ("https://www.tianyancha.com/pagination/{}.xhtml"
                 "?TABLE_DIM_NAME=manageDangerous&ps=10&pn={}&id={}")

    # (output label, row index inside the base-info table, XPath relative to that row).
    # The order here is the order of the fields in the output line.
    _BASE_INFO_FIELDS = (
        ("经营状态", 0, "td[4]/text()"),
        ("成立日期", 1, "td[2]/text()"),
        ("注册资本", 2, "td[2]/div/text()"),
        ("实缴资本", 3, "td[2]/text()"),
        ("工商注册号", 3, "td[4]/text()"),
        ("统一信用代码", 4, "td[2]/span/span/text()"),
        ("纳税人识别号", 4, "td[4]/span/span/text()"),
        ("组织机构代码", 4, "td[6]/span/span/text()"),
        ("营业期限", 5, "td[2]/span/text()"),
        ("纳税人资质", 5, "td[4]/text()"),
        ("核准日期", 5, "td[6]/text()"),
        ("企业类型", 6, "td[2]/text()"),
        ("行业", 6, "td[4]/text()"),
        ("人员规模", 6, "td[6]/text()"),
        ("参保人数", 7, "td[2]/text()"),
        ("登记机关", 7, "td[4]/text()"),
        ("曾用名", 8, "td[2]//span[@class='copy-info-box']/span/text()"),
        ("注册地址", 9, "td[2]/span/span/span/text()"),
        ("经营范围", 10, "td[2]/span/text()"),
    )

    def __init__(self, company_id, fp, cookie, ip_proxies, timeout=30, max_pages=10):
        """Store the crawl parameters and build the headers for the company page.

        Args:
            company_id: Tianyancha company id.
            fp: writable text file object for the extracted records.
            cookie: ``Cookie`` header value copied from a logged-in browser
                session; it has to be refreshed after logging out and in again.
            ip_proxies: proxies mapping handed to ``requests.get``.
            timeout: timeout handed to ``requests.get``; ``None`` waits
                forever, which is what the code did before this parameter
                existed.
            max_pages: upper bound of pages fetched per paginated section.
        """
        self.fp = fp
        self.company_id = company_id
        self.ip_proxies = ip_proxies
        self.timeout = timeout
        self.max_pages = max_pages
        self.url = "https://www.tianyancha.com/company/{}".format(company_id)

        self.User_agents = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
            "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        ]
        # The cookie must come from a logged-in Tianyancha session (copy it from
        # the browser on a data page); update it whenever the login changes.
        self.cookie = cookie
        # NOTE(review): 'br' is advertised in Accept-Encoding; presumably the
        # runtime has a brotli decoder installed for requests -- confirm,
        # otherwise a brotli-compressed body would not decode as UTF-8.
        self.headers = {
            'User-Agent': random.choice(self.User_agents),
            'Cookie': self.cookie,
            'Referer': "https://www.tianyancha.com/login?from=https%3A%2F%2Fwww.tianyancha.com%2Fsearch%3Fkey%3D%25E9%2583%2591%25E5%25B7%259E%25E6%2583%25A0%25E5%25B7%259E%25E6%25B1%25BD%25E8%25BD%25A6%25E8%25BF%2590%25E8%25BE%2593%25E6%259C%2589%25E9%2599%2590%25E5%2585%25AC%25E5%258F%25B8",
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': 'www.tianyancha.com',
            'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
            'Upgrade-Insecure-Requests': '1'
        }

    # ------------------------------------------------------------------ helpers

    def _page_headers(self):
        """Return fresh headers for a pagination request (new random User-Agent)."""
        return {
            'User-Agent': random.choice(self.User_agents),
            'Cookie': self.cookie,
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': 'www.tianyancha.com',
            'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
        }

    def _fetch_page(self, endpoint, pg_num):
        """Download page ``pg_num`` of a pagination endpoint and return it as text.

        Redirects are not followed, so a redirect response is returned as-is.
        """
        page_url = self._PAGE_URL.format(endpoint, pg_num, self.company_id)
        response = requests.get(url=page_url, headers=self._page_headers(),
                                allow_redirects=False, proxies=self.ip_proxies,
                                timeout=self.timeout)
        return response.content.decode()

    @staticmethod
    def _parse_html(html):
        """Parse an HTML string with lxml and return the resulting tree."""
        return etree.HTML(html)

    @staticmethod
    def _first(node, path):
        """Return the first XPath match; raises IndexError when nothing matches."""
        return node.xpath(path)[0]

    @staticmethod
    def _joined_text(node, path, sep):
        """Join the full text content of every node matched by ``path`` with ``sep``."""
        return sep.join(child.xpath("string(.)") for child in node.xpath(path))

    def _crawl_section(self, title, endpoint, parse_row, error_template,
                       stop_on_empty_page):
        """Write one paginated section to ``self.fp``.

        Args:
            title: section title written as the first line.
            endpoint: pagination endpoint name inserted into the URL.
            parse_row: callable turning the list of ``<td>`` nodes of one table
                row into the output line.
            error_template: message with two ``{}`` slots (company id, page
                number) printed when a row cannot be parsed.
            stop_on_empty_page: what to do when lxml yields no tree for a page:
                ``True`` ends the section quietly, ``False`` raises.

        Raises:
            Exception: a row could not be parsed, or a page produced no tree
                and ``stop_on_empty_page`` is false.
        """
        print(title, file=self.fp)
        for pg_num in range(1, self.max_pages + 1):
            html = self._fetch_page(endpoint, pg_num)
            if self._NO_RESULT_MARK in html:
                break
            tree = self._parse_html(html)
            if tree is None:
                if stop_on_empty_page:
                    break
                raise Exception("公司{}的{}第{}页返回空页面,无法解析".format(
                    self.company_id, title, pg_num))
            rows = tree.xpath('//tbody/tr')
            if not rows:
                break
            # Collect the whole page first so a failing row leaves no partial page behind.
            records = []
            for tr in rows:
                try:
                    records.append(parse_row(tr.xpath("td")))
                except Exception as err:
                    message = error_template.format(self.company_id, pg_num)
                    print(message, err)
                    raise Exception(message) from err
            for record in records:
                print(record, file=self.fp)

    # -------------------------------------------------------------- row parsers

    @classmethod
    def _parse_kaiting_row(cls, tds):
        """Format one court-session row."""
        order = cls._first(tds[0], "text()")
        date = cls._first(tds[1], "text()")  # hearing date
        case_no = cls._first(tds[2], "span/text()")  # case number
        reason = cls._first(tds[3], "span/text()")  # cause of action
        status = cls._joined_text(tds[4], "div", " ")  # role in the case
        court = cls._first(tds[5], "span/text()")  # court
        return "序号:{}\x01开庭日期:{}\x01案号:{}\x01案由:{}\x01案件身份:{}\x01审理法院:{}".format(
            order, date, case_no, reason, status, court)

    @classmethod
    def _parse_lawsuit_row(cls, tds):
        """Format one lawsuit row."""
        order = cls._first(tds[0], "text()")
        name = cls._first(tds[1], "text()")  # case name
        reason = cls._first(tds[2], "span/text()")  # cause of action
        status = cls._joined_text(tds[3], "div/div/div/span", "")  # role in the case
        result = cls._first(tds[4], "div/div/text()")  # judgment result
        result = result.replace('\n', '').replace(' ', '').replace('\r', '')
        money = cls._first(tds[5], "span/text()")  # amount involved
        return "序号:{}\x01案件名称:{}\x01案由:{}\x01在本案中身份:{}\x01裁判结果:{}\x01案件金额:{}".format(
            order, name, reason, status, result, money)

    @classmethod
    def _parse_gonggao_row(cls, tds):
        """Format one court-announcement row."""
        order = cls._first(tds[0], "text()")
        date = cls._first(tds[1], "text()")  # publication date
        case_no = cls._first(tds[2], "text()")  # case number
        reason = cls._first(tds[3], "text()")  # cause of action
        status = cls._joined_text(tds[4], "div", "\x01")  # role in the case
        notice_type = cls._first(tds[5], "text()")  # announcement type
        court = cls._first(tds[6], "text()")  # court
        return "序号:{}\x01刊登日期:{}\x01案号:{}\x01案由:{}\x01案件身份:{}\x01公告类型:{}\x01法院:{}".format(
            order, date, case_no, reason, status, notice_type, court)

    @classmethod
    def _parse_zhixing_row(cls, tds):
        """Format one enforcement (judgment debtor) row."""
        order = cls._first(tds[0], "text()")
        date = cls._first(tds[1], "text()")  # filing date
        case_no = cls._first(tds[2], "text()")  # case number
        money = cls._first(tds[3], "text()")  # enforcement amount
        court = cls._first(tds[4], "text()")  # enforcing court
        return "序号:{}\x01立案日期:{}\x01案号:{}\x01执行标的:{}\x01执行法院:{}".format(
            order, date, case_no, money, court)

    @classmethod
    def _parse_lian_row(cls, tds):
        """Format one case-filing row."""
        order = cls._first(tds[0], "text()")
        date = cls._first(tds[1], "text()")  # filing date
        case_no = cls._first(tds[2], "text()")  # case number
        status = cls._joined_text(tds[3], "div", "\x01")  # role in the case
        court = cls._first(tds[4], "text()")  # court
        return "序号:{}\x01立案日期:{}\x01案号:{}\x01案件身份:{}\x01法院:{}".format(
            order, date, case_no, status, court)

    @classmethod
    def _parse_xingzheng_row(cls, tds):
        """Format one administrative-penalty row."""
        order = cls._first(tds[0], "text()")
        date = cls._first(tds[1], "text()")  # penalty date
        doc_no = cls._first(tds[2], "div/text()")  # decision document number
        reason = cls._first(tds[3], "div/div/text()")  # reason for the penalty
        result = cls._first(tds[4], "div/div/text()")  # penalty result
        unit = cls._first(tds[5], "text()")  # penalising authority
        source = cls._first(tds[6], "span/text()")  # data source
        return "序号:{}\x01处罚日期:{}\x01决定文书号:{}\x01处罚事由:{}\x01处罚结果:{}\x01处罚单位:{}\x01数据来源:{}".format(
            order, date, doc_no, reason, result, unit, source)

    # --------------------------------------------------------------- public API

    def get_html(self):
        """Download the company detail page and return it decoded as UTF-8 text."""
        response = requests.get(self.url, headers=self.headers,
                                proxies=self.ip_proxies, timeout=self.timeout)
        return response.content.decode()

    def get_start_crawl(self):
        """Extract the basic registration data and write it as one line.

        Raises:
            Exception: the page could not be downloaded, or any expected field
                is missing from the page (all fields are mandatory).
        """
        try:
            html = self.get_html()
        except Exception as err:
            message = "公司{}网页读取失败,可能是ip或者登录的Cookie问题".format(self.company_id)
            print(message)
            raise Exception(message) from err
        if "快捷登录与短信登录" in html:
            # Only reported; the extraction below is still attempted.
            print("爬取基本信息失败-需要登录 company_id:{}".format(self.company_id))

        try:
            tree = self._parse_html(html)
            rows = tree.xpath('//*[@id="_container_baseInfo"]/table/tbody/tr')
            company_name = tree.xpath(
                "//div[@class='container company-header-block ']/div[3]/div[@class='content']"
                "/div[@class='header']/span/span/h1/text()")[0]
            legal_person = rows[0].xpath(
                "td[2]//div[@class='humancompany']/div[@class='name']/a/text()")[0]

            fields = [("法定代表人", legal_person), ("公司名", company_name)]
            for label, row_index, path in self._BASE_INFO_FIELDS:
                value = rows[row_index].xpath(path)[0]
                if label == "营业期限":
                    value = value.replace(' ', '')
                fields.append((label, value))

            head_content = "\x01".join(
                "{}:{}".format(label, value) for label, value in fields)
            print(head_content, file=self.fp)
        except Exception as err:
            message = "公司{}的头部基本信息提取失败".format(self.company_id)
            print(message)
            raise Exception(message) from err

    def kaiting(self):
        """Write the court-session announcements section."""
        self._crawl_section("开庭公告", "announcementcourt", self._parse_kaiting_row,
                            "公司{}此条开庭公告信息无法解析。第{}页",
                            stop_on_empty_page=False)

    def lawsuitwhm(self):
        """Write the lawsuits section."""
        self._crawl_section("法律诉讼", "lawsuit", self._parse_lawsuit_row,
                            "公司{}此条法律诉讼信息未能解析。第{}页",
                            stop_on_empty_page=False)

    def fayuangonggao(self):
        """Write the court announcements section."""
        self._crawl_section("法院公告", "court", self._parse_gonggao_row,
                            "公司{}此条法院公告信息无法解析。第{}页",
                            stop_on_empty_page=False)

    def beizhixing(self):
        """Write the enforcement (judgment debtor) section."""
        self._crawl_section("被执行人", "zhixing", self._parse_zhixing_row,
                            "公司{}此条被执行人信息无法解析。第{}页",
                            stop_on_empty_page=True)

    def lian_message(self):
        """Write the case-filing section."""
        self._crawl_section("立案信息", "courtRegister", self._parse_lian_row,
                            "公司{}此条立案信息无法解析。第{}页",
                            stop_on_empty_page=True)

    def xingzheng(self):
        """Write the administrative-penalty section."""
        self._crawl_section("行政处罚", "mergePunishCount", self._parse_xingzheng_row,
                            "公司{}此条行政处罚无法解析。第{}页",
                            stop_on_empty_page=True)

    def body_run(self):
        """Run the full crawl: basic information first, then the six sections in order."""
        self.get_start_crawl()  # basic information
        self.kaiting()  # court sessions
        self.lawsuitwhm()  # lawsuits
        self.fayuangonggao()  # court announcements
        self.beizhixing()  # enforcement
        self.lian_message()  # case filings
        self.xingzheng()  # administrative penalties



Tianyan_main.py

import requests
from lxml import etree
import time
import sys
import random
import os
import re
# from .Tianyan_head import get_company_head
# from .Tianyan_body import TianYan
# from z1217.Tianyan_body import TianYan
# from .Tianyan_body import TianYan
from Just_try.CHD.z1219.Tianyan_body import TianYan

class CompanySetting:
    """Request settings and ID/name resolution for one Tianyancha company.

    An instance stores the login cookie, the proxy mapping and the HTTP
    headers used for requests, plus the stripped identifier ``var`` which may
    be either a numeric company ID or a company name.
    """

    # Seconds to wait for the search request. Without a timeout a dead proxy
    # would block the whole crawl forever.
    REQUEST_TIMEOUT = 15

    # Pool of browser User-Agent strings; one is chosen at random per instance.
    USER_AGENTS = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    )

    # Cookie copied from a logged-in Tianyancha browser session. It must belong
    # to a session that is still logged in: logging out and in again, or simply
    # letting the cookie age, invalidates it (the site then asks for an image
    # captcha). Replace this value, or pass ``cookie=`` to the constructor.
    DEFAULT_COOKIE = "aliyungf_tc=282ebba707a03b502b075b0cdd1393a6f2720a797a955a717160cc4bbdb89fbc; csrfToken=WuVruO4GsAFIc9z5DdQxtlAR; TYCID=a8a5bdd05db911ec8061e5e64f8765b8; ssuid=4585560966; bannerFlag=true; creditGuide=1; _ga=GA1.2.1523420552.1639581264; tyc-user-phone=%255B%252215838072824%2522%255D; jsid=https%3A%2F%2Fwww.tianyancha.com%2F%3Fjsid%3DSEM-BAIDU-PZ-SY-2021112-JRGW; searchSessionId=1639670771.66783963; relatedHumanSearchGraphId=3149325530; relatedHumanSearchGraphId.sig=HJivDEDhp8niuTtrieCF_t56fZHwONjuSprSp-AkZ88; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2215838072824%22%2C%22first_id%22%3A%2217dbea89958208-0911ada0d7e41f-978183a-1382400-17dbea89959ced%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E5%A4%A9%E7%9C%BC%E6%9F%A5%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Fother.php%22%7D%2C%22%24device_id%22%3A%2217dbea89958208-0911ada0d7e41f-978183a-1382400-17dbea89959ced%22%7D; _gid=GA1.2.849271308.1639890728; _bl_uid=1sknjxCmczFsFIom9l9jr1w8w0Xk; RTYCID=3888f8318b3e437c8327a699fa3a611f; CT_TYCID=1c090123cbab4efd8d0462dd297a2be2; bdHomeCount=1; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1639581261,1639890723,1639894039; cloud_token=9c9711e8bdb6475cb7f6819d3cba55d8; acw_tc=707c9f6816398972133788598e76c87b9cc060eace0ffcccede3e98136ec98; acw_sc__v2=61bed87db3318ccfc4dc7a6bc493a83ddf8162f0; tyc-user-info={%22mobile%22:%2215838072824%22%2C%22state%22:%220%22%2C%22vipManager%22:%220%22}; tyc-user-info-save-time=1639897221599; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTgzODA3MjgyNCIsImlhdCI6MTYzOTg5NzIyMCwiZXhwIjoxNjcxNDMzMjIwfQ.L5BmUloturqBaXRmetCI_szFbDikF6vXn8EPT-G8F8YeblfdTY6LD7yduZPYDY8QSBeQDRROLVutwO1UxCZfcw; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1639897223"

    # Proxy address as "host:port" (no username/password authentication).
    # Replace this value, or pass ``ip=`` to the constructor.
    DEFAULT_PROXY = "223.247.87.198:34039"

    def __init__(self, var, cookie=None, ip=None):
        """Build the cookie, proxy mapping and headers for one company.

        Args:
            var: Company ID or company name; surrounding whitespace is
                stripped before it is stored in ``self.var``.
            cookie: Optional Cookie header value. Defaults to
                ``DEFAULT_COOKIE``.
            ip: Optional proxy address in ``host:port`` form. Defaults to
                ``DEFAULT_PROXY``.
        """
        # Each instance gets its own list so callers may modify it freely.
        self.User_agents = list(self.USER_AGENTS)
        self.cookie = self.DEFAULT_COOKIE if cookie is None else cookie
        self.ip = self.DEFAULT_PROXY if ip is None else ip

        address_parts = self.ip.split(":")
        # The same plain-HTTP proxy endpoint serves both URL schemes.
        proxy_url = "http://{}:{}".format(address_parts[0], address_parts[1])
        self.ip_proxies = {
            "http": proxy_url,
            "https": proxy_url
        }

        self.headers = {
            'User-Agent': random.choice(self.User_agents),
            'Cookie': self.cookie,
            'Referer': "https://www.tianyancha.com/login?from=https%3A%2F%2Fwww.tianyancha.com%2Fsearch%3Fkey%3D%25E9%2583%2591%25E5%25B7%259E%25E6%2583%25A0%25E5%25B7%259E%25E6%25B1%25BD%25E8%25BD%25A6%25E8%25BF%2590%25E8%25BE%2593%25E6%259C%2589%25E9%2599%2590%25E5%2585%25AC%25E5%258F%25B8",
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': 'www.tianyancha.com',
            'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
            'Upgrade-Insecure-Requests': '1'
        }
        self.var = var.strip()

    def is_number(self, x):
        """Return True when ``int(x)`` succeeds, False otherwise.

        Values that ``int`` cannot handle at all (for example ``None``) count
        as "not a number" instead of raising ``TypeError``.
        """
        try:
            int(x)
        except (TypeError, ValueError):
            return False
        return True

    def get_id_or_name(self):
        """Resolve ``self.var`` to a ``(company_id, company_name)`` pair.

        Returns:
            ``(var, None)`` when ``var`` is numeric; it is treated as a
            company ID and no request is made.
            ``(company_id, company_name)`` taken from the first search result
            when ``var`` is a company name.
            ``(None, None)`` when the search page has no result or the result
            cannot be parsed.

        Raises:
            SystemExit: via ``sys.exit(0)`` when the search request fails or
                the response contains the login prompt.
        """
        var = self.var
        if self.is_number(var):  # already a company ID
            return var, None

        # ``var`` is a company name: search for it through the proxy.
        try:
            response = requests.get(
                "https://www.tianyancha.com/search",
                # Let requests encode the name so characters such as "&" or
                # "#" cannot corrupt the query string.
                params={"key": var},
                headers=self.headers,
                proxies=self.ip_proxies,
                timeout=self.REQUEST_TIMEOUT,
            ).content.decode()
        except (requests.RequestException, UnicodeDecodeError):
            print("该换ip了")
            sys.exit(0)

        # This text only appears on the login prompt, i.e. the cookie is no
        # longer accepted.
        if "快捷登录与短信登录" in response:
            print("需要登录 company_name:{}".format(var))
            sys.exit(0)  # stop the whole program

        etree_html = etree.HTML(response)
        try:
            first_hits = etree_html.xpath("//div[@class='result-list sv-search-container']/div[1]")
            if not first_hits:
                # No result block even though login data was sent: either the
                # cookie needs refreshing or the company does not exist.
                print("无法访问-通过公司名爬ID,需要更新登录Cookie或没有该公司信息")
                return None, None

            # Only the first company in the result list is used.
            contents = first_hits[0].xpath("div/div[@class='content']/div[@class='header']/a/@sensors-event-ch")[0]
            ret_id = re.findall("&card_id=(.*?)&item=", contents)[0]
            ret_name = re.findall("&card_name=(.*?)&card_type", contents)[0]
            return ret_id, ret_name
        except (AttributeError, IndexError):
            # Unparseable page (no document, or an expected node/field is
            # missing): report "not found" to the caller.
            return None, None



if __name__ == '__main__':
    start_time = time.time()  # start of the whole batch
    # Entries may be numeric company IDs or company names.
    company_list = ["844565574", "2319114574", "2317302446", "789235759", "2964355333","山东广鑫宇汽车运输有限公司","山东宇汽车运输有限公司","河南国宾汽车运输有限公司","3273353756"]
    print(len(company_list),company_list)

    for company_index, var in enumerate(company_list, start=1):
        start2_time = time.time()  # start of this company
        Cs = CompanySetting(var)
        c_id, c_name = Cs.get_id_or_name()  # company ID and company name

        if c_id is None and c_name is None:
            print("这个公司{}的公司名字信息读取失败".format(var))
            # TODO: record the failure by e-mail
        else:
            path = "TY_{}.txt".format(c_id)
            # A zero-byte file marks a previously failed crawl: remove it so
            # the company is crawled again.
            if os.path.isfile(path) and os.path.getsize(path) == 0:
                os.remove(path)

            if os.path.isfile(path):  # this company was already crawled
                print("TY_{}.txt  pass".format(c_id))
            else:
                crawl_failed = False
                # The context manager guarantees the output file is closed
                # before the next iteration inspects it; an open handle would
                # make the file look empty or locked.
                with open(path, "w", encoding="utf-8") as fp:
                    try:
                        company_body_ty = TianYan(c_id, fp, Cs.cookie, Cs.ip_proxies)
                        company_body_ty.body_run()
                        print("{} successful!  it cost time:{}".format(c_id,time.time()-start2_time))
                    except Exception:
                        # Any failure while crawling this company is handled
                        # below, once the file has been closed.
                        crawl_failed = True
                        print("公司{}部分信息读取失败".format(c_id))

                if crawl_failed:
                    # Truncate the partial output to zero bytes so the next
                    # run retries this company.
                    with open(path, "w"):
                        pass
                    # TODO: record the failure by e-mail
                    print("第{}个公司出现错误-files cost time:{}".format(company_index, time.time() - start_time))

        time.sleep(random.random()*3)  # random pause of up to 3 seconds
    print("{}-files cost time:{}".format(len(company_list),time.time()-start_time))





代码中需要更改的地方是 Tianyan_main.py 中 CompanySetting 类初始化方法里的两个变量:self.cookie 和 self.ip。
self.cookie 的值改为自己登录天眼查后的 Cookie 值,self.ip 改为动态代理 IP(格式为 host:port)。
关于代理 IP:网上有很多免费的代理 IP,但是大多不能用,或者只有一两秒的时效。我这里用的是天启 HTTP,可以自己注册一下,注册完成后会送 5000 天启币,可以用来购买里面的 IP,时效有 3 分钟、5 分钟、10 分钟和 15 分钟的,按自己的需要选择。不过在提取 IP 之前要先添加白名单,也就是把自己本地的 IP 地址添加到白名单中,网站里有教程,这里就不细说了。自己的本地 IP 可以通过百度搜索“IP”查到。

一切都完工后,就可以爬了,但是爬个四五十条就会出现图片验证或者一堆乱码问题,tnnd,要崩溃,目前还在研究中。。。。

这里提一下:目前还不清楚天眼查的反爬机制是根据什么做的,不知道是 IP 还是 Cookie。只知道这个网站就是做爬虫起家的,手里好像还有反爬虫专利,不晓得是不是真的。

未完待续。。。