task2-爬取OFFICE365、ZOOM服务器IP地址和对应协议端口号

429 阅读1分钟
from urllib import request
from bs4 import BeautifulSoup
import pymysql
import time
from selenium import webdriver

# Create the shared database connection; note that the charset and
# cursorclass arguments are passed explicitly here.
conn = pymysql.connect(
    database="spider",
    host="127.0.0.1",
    user="root",
    password="",
    cursorclass=pymysql.cursors.DictCursor,
    charset="utf8",
)
# Obtain the cursor that every spider below uses for its INSERT statements.
cursor = conn.cursor()


def _office365_records(item):
    """Turn one five-cell endpoint row into ``(ip, protocol, port)`` records.

    ``item[3]`` is read as a comma-separated address list and ``item[4]`` as
    a colon-separated ``protocol: ports`` description.  An empty list is
    returned when the port cell cannot be split into at least a protocol
    and a port list.
    """
    # Fold full-width punctuation to ASCII before splitting, so both
    # spellings of the separators are handled.
    addresses = item[3].replace("\uff0c", ",").split(",")
    port_parts = item[4].replace("\uff1a", ":").split(":")
    if len(port_parts) < 2:
        return []

    first_protocol = port_parts[0].strip()
    first_port = port_parts[1]
    udp_port = None
    if "UDP" in first_port:
        # The first port list carries a trailing "UDP" label, which means a
        # second (UDP) port list is expected in the next segment.
        if len(port_parts) < 3:
            # Label without a port list: treat the row as malformed instead
            # of failing with an IndexError.
            return []
        first_port = first_port.replace("UDP", "")
        udp_port = port_parts[2].strip()
    first_port = first_port.strip()

    records = []
    for raw_address in addresses:
        address = raw_address.strip()
        if not address:
            continue
        # Entries that contain "/" together with "us" or ".com" are skipped
        # (presumably host names fused with an address -- TODO confirm).
        if "/" in address and ("us" in address or ".com" in address):
            continue
        if udp_port is not None:
            records.append((address, "UDP", udp_port))
        records.append((address, first_protocol, first_port))
    return records


def office365_spider():
    """Scrape the Office 365 endpoint pages and store their address records.

    Each page is downloaded, every ``<td>`` is collected and grouped into
    rows of five cells, and each parsed ``(ip, protocol, port)`` record is
    printed and inserted into ``application_spider`` through the module-level
    ``cursor`` / ``conn``.  A failure while storing one row is printed and
    does not stop the remaining rows.  Download errors are not caught.
    """
    url1 = "https://docs.microsoft.com/en-us/office365/enterprise/office-365-u-s-government-gcc-high-endpoints"
    url2 = "https://docs.microsoft.com/en-us/office365/enterprise/office-365-u-s-government-dod-endpoints"
    url3 = "https://docs.microsoft.com/en-us/office365/enterprise/urls-and-ip-address-ranges"
    url4 = "https://docs.microsoft.com/en-us/office365/enterprise/urls-and-ip-address-ranges-21vianet"
    url5 = "https://docs.microsoft.com/en-us/office365/enterprise/office-365-germany-endpoints"

    for case in (url1, url2, url3, url4, url5):
        # Close the HTTP response as soon as the body has been read.
        with request.urlopen(case) as response:
            html = response.read().decode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')
        cells = soup.find_all('td')

        # The first two cells (three on url3) are dropped before grouping;
        # presumably they precede the endpoint table -- TODO confirm.
        leading = 3 if case == url3 else 2
        texts = [cell.text for cell in cells[leading:]]
        rows = [texts[k:k + 5] for k in range(0, len(texts), 5)]

        for row in rows:
            if len(row) != 5:
                continue
            # Best-effort per row: one bad row must not discard the rest of
            # the page.
            try:
                for ip, protocol, port in _office365_records(row):
                    print("OFFICE365-" + ip + "-" + protocol + "-" + port)
                    cursor.execute(
                        "INSERT INTO application_spider(application,ip,protocol,port) VALUES(%s,%s,%s,%s);",
                        ('OFFICE365', ip, protocol, port))
                    conn.commit()
            except Exception as e:
                print(e)


def zoom_spider():
    """Scrape the Zoom firewall settings page and store its address records.

    The page is downloaded and every ``<td>`` collected.  Rows are read as
    four consecutive cells starting at fixed cell offsets; cell 0 is used as
    the protocol, cell 1 as the port text and cell 3 as a space-separated
    address list.  Each address is printed and inserted into
    ``application_spider`` through the module-level ``cursor`` / ``conn``.
    Rows that the page does not fully contain are skipped.  Download errors
    are not caught.
    """
    url = "https://support.zoom.us/hc/en-us/articles/201362683-Network-firewall-or-proxy-server-settings-for-Zoom"
    print(url)
    # Close the HTTP response as soon as the body has been read.
    with request.urlopen(url) as response:
        html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    cells = soup.find_all('td')

    # Offsets of the first cell of each row of interest; these are tied to
    # the layout of the page at the time the script was written.
    row_starts = [4, 8, 12, 20, 24, 28, 32, 40]
    for row_number, start in enumerate(row_starts, start=1):
        # Slicing never raises, so a page with fewer cells than expected
        # yields a short row that is skipped instead of an IndexError.
        item = [cell.text for cell in cells[start:start + 4]]
        if len(item) != 4:
            continue

        protocol = item[0]
        port = item[1].replace("\n", "").replace("\xa0-", ",").replace(" (see note)", "")
        addresses = item[3].replace("\n", "").replace("IPv4:", " ").replace("IPv6:", " ")
        if row_number == 5:
            # In the fifth listed row a space is inserted after every "32"
            # (presumably to separate run-together /32 entries -- TODO confirm).
            addresses = addresses.replace("32", "32 ")
        ip_list = addresses.split(" ")

        # (application, ip, protocol, port) records; a failure is printed
        # and ends the current row only.
        try:
            for ip in ip_list:
                if len(ip) == 0:
                    continue
                if ip.count("/") > 1:
                    continue
                print("ZOOM-" + ip + "-" + protocol + "-" + port)
                cursor.execute("INSERT INTO application_spider(application,ip,protocol,port) VALUES(%s,%s,%s,%s);",
                               ('ZOOM', ip, protocol, port))
                conn.commit()
        except Exception as e:
            print(e)


def salesforce_spider():
    """Scrape the Salesforce help article and store its address cells.

    The article is rendered in headless Chrome, the resulting page source is
    parsed, and the text of table cells 5 through 374 (1-based) whose first
    "." sits at index 3 or later is inserted into ``application_spider`` with
    protocol ``TCP/UDP`` and port ``ANY``.  The first insert failure is
    printed and ends the loop.  The browser is always shut down, even when
    loading or parsing fails.
    """
    # Local import: the module only imports ``webdriver`` at file level.
    from selenium.webdriver.common.by import By

    url = "https://help.salesforce.com/articleView?id=000321501&type=1&mode=1"
    # Options for running Chrome without a visible window.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--window-size=1420,1080')
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(url)
        driver.implicitly_wait(10)
        time.sleep(2)

        # The result is not used; with the implicit wait set above, the
        # lookup gives the table rows time to appear before the page source
        # is captured.  ``find_elements(By..., ...)`` replaces the
        # ``find_elements_by_tag_name`` helper removed from Selenium 4.
        driver.find_elements(By.TAG_NAME, 'tr')
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        cells = soup.find_all('td')

        try:
            # Cells 5..374 (1-based) are the ones considered.
            for cell in cells[4:374]:
                text = cell.text
                # Skip cells with no "." or with one in the first three
                # characters.
                if text.find(".") < 3:
                    continue
                cursor.execute("INSERT INTO application_spider(application,ip,protocol,port) VALUES(%s,%s,%s,%s);",
                               ('SALESFORCE', text, "TCP/UDP", "ANY"))
                conn.commit()
        except Exception as e:
            print(e)
    finally:
        # Always release the browser process.
        driver.quit()


def main():
    """Run the three application spiders, then release the database handles.

    The module-level ``cursor`` and ``conn`` are closed even when one of the
    spiders raises; the exception still propagates to the caller.
    """
    try:
        # Spider the three applications in turn.
        office365_spider()
        zoom_spider()
        salesforce_spider()
    finally:
        # Close cursor and connection.
        cursor.close()
        conn.close()


# Run the spiders only when this file is executed as a script.
if "__main__" == __name__:
    main()