Deploying Python + Selenium + MySQL with Docker


1. Install the Docker engine on cebox

#1. Install Docker Engine
sudo yum install /path/to/package.rpm
#2. Start Docker
sudo systemctl start docker
#3. Verify the installation works
sudo docker run hello-world

2. Start the Selenium container

docker run -d -p 4444:4444 --name selenium --shm-size=2g selenium/standalone-chrome:4.0.0-alpha-6-20200730

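Before pointing the spider at it, it is worth confirming that the standalone Chrome node answers. Below is a minimal sketch, assuming port 4444 is published on the local host as in the command above (the hub URL and the test page are illustrative):

# check_selenium.py -- quick sanity check against the standalone Chrome container
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

driver = webdriver.Remote(
    command_executor="http://localhost:4444/wd/hub",  # host port from `-p 4444:4444`
    desired_capabilities=DesiredCapabilities.CHROME
)
try:
    driver.get("https://example.com")  # any reachable page will do
    print("Selenium is up, page title:", driver.title)
finally:
    driver.quit()
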
3. Start MySQL with automatic database initialization

  • 1. Write the Dockerfile

    # Base image: mysql:latest
    FROM mysql:latest
    #
    ## Author
    MAINTAINER jinsong_yan <jinsong_yan@trendmicro.com>
    #
    ## Directory whose scripts the container executes automatically on first start
    ENV AUTO_RUN_DIR /docker-entrypoint-initdb.d
    #
    ## Initialization SQL file
    ENV INSTALL_DB_SQL spider.sql
    #
    ## Copy the SQL file into /docker-entrypoint-initdb.d/ so the container runs it automatically on first startup
    COPY ./$INSTALL_DB_SQL $AUTO_RUN_DIR/
    #
    ## Make the SQL file executable
    RUN chmod a+x $AUTO_RUN_DIR/$INSTALL_DB_SQL
    
    
  • 2. The spider.sql file

    -- Create the database
    CREATE DATABASE IF NOT EXISTS spider default charset utf8 COLLATE utf8_general_ci;
    
    -- Switch to the database
    use spider;
    
    -- Create the table
    DROP TABLE IF EXISTS `application_spider`;
    CREATE TABLE `application_spider`  (
      `id` bigint(20) NOT NULL AUTO_INCREMENT,
      `application` varchar(255) NOT NULL,
      `category` varchar(255) NOT NULL,
      `ip` varchar(255) NOT NULL,
      `protocol` varchar(255) NOT NULL,
      `port` varchar(255) NOT NULL,
      `create_time` date NOT NULL,
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    
    -- Insert data
    
    
  • 3. Build the image

    docker build -t spider-mysql .
    
  • 4. Create a container from the image (a verification sketch follows this list)

    docker run  --name=spider-mysql -p 3306:3306 -e MYSQL_ROOT_PASSWORD=123456  -d spider-mysql
    

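Once the container is running, the entrypoint should have executed spider.sql and created the spider database. A minimal sketch to verify this from the host, assuming port 3306 is published locally and the root password from the run command above (host and credentials are illustrative):

# check_spider_db.py -- confirm the init SQL created the schema
import pymysql

conn = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                       password="123456", database="spider")
try:
    with conn.cursor() as cursor:
        cursor.execute("SHOW TABLES;")
        print(cursor.fetchall())  # should list application_spider
finally:
    conn.close()
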
4. Start the Python spider

  • 1. Write the Dockerfile

    # Base image
    FROM python:3.7
    # Set the working directory
    WORKDIR /app
    ADD ./ ./
    # Declare the mount point (the host path is bound at run time with -v)
    VOLUME ["/app"]
    # Install the required packages
    RUN pip install -r requirements.txt
    
    ENTRYPOINT ["/bin/bash"]
    #CMD ["python", "application_spider.py"] && ["python", "detect_spider.py"]
    CMD ["spider-start.sh"]
    
  • 2. Write requirements.txt

    astroid==2.4.2
    beautifulsoup4==4.9.1
    certifi==2020.6.20
    cffi==1.14.0
    chardet==3.0.4
    colorama==0.4.3
    common==0.1.2
    idna==2.10
    Interface==2.11.1
    isort==4.3.21
    lazy-object-proxy==1.4.3
    mccabe==0.6.1
    nose==1.3.7
    pycparser==2.20
    pylint==2.5.3
    PyMySQL==0.10.0
    requests==2.24.0
    selenium==3.141.0
    six==1.15.0
    soupsieve==2.0.1
    toml==0.10.1
    typed-ast==1.4.1
    urllib3==1.25.9
    wrapt==1.12.1
    zope.event==4.4
    zope.interface==5.1.0
    zope.schema==6.0.0
    
    
  • 3. Write spider-start.sh

    #!/bin/bash
    python application_spider.py
    python detect_spider.py
    
  • 4. Build the image

    docker build -t spider-project .
    
  • 5. Start the container

    # --link lets this container reach the selenium and spider-mysql containers by name
    docker run --name spider-project -v /usr/dockerSpider:/app --link selenium --link spider-mysql spider-project
    

5. Scheduled task

1. Open the crontab editor
crontab -e
2. Add a job that runs every Monday at 17:00
0 17 * * 1 docker start spider-project
3. List the scheduled jobs
crontab -l

6. Appendix

  • Crawling script (application_spider.py)
import logging
from urllib import request
from bs4 import BeautifulSoup
import pymysql
import time
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

def log_init(logFilename):
    """ Output log to file and console """
    # Configure a handler that writes to the log file
    logging.basicConfig(
        level=logging.INFO,  # log level written to the file
        format='%(asctime)s  %(levelname)s\t :%(message)s',  # log message format
        datefmt='%Y-%m-%d %A %H:%M:%S',  # timestamp format
        filename=logFilename,  # log file name
        filemode='a')  # write mode: "w" (overwrite) or "a" (append)
    # Define a handler that also prints to the console
    console = logging.StreamHandler()  # console handler
    console.setLevel(logging.INFO)  # handler level
    formatter = logging.Formatter('%(asctime)s  %(levelname)s\t :%(message)s')  # handler format
    console.setFormatter(formatter)
    # Attach the console handler to the root logger
    logging.getLogger().addHandler(console)


def mysql_init():
    global conn, cursor
    # Create the database connection; note the charset and cursorclass arguments
    conn = pymysql.connect(
        host="spider-mysql",
        user="root",
        password="123456",
        database="spider",
        charset='utf8',
        cursorclass=pymysql.cursors.DictCursor)
    # Get a cursor
    cursor = conn.cursor()


def office365_spider():
    url1 = "https://docs.microsoft.com/en-us/office365/enterprise/office-365-u-s-government-gcc-high-endpoints"
    url2 = "https://docs.microsoft.com/en-us/office365/enterprise/office-365-u-s-government-dod-endpoints"
    url3 = "https://docs.microsoft.com/en-us/office365/enterprise/urls-and-ip-address-ranges"
    url4 = "https://docs.microsoft.com/en-us/office365/enterprise/urls-and-ip-address-ranges-21vianet"
    url5 = "https://docs.microsoft.com/en-us/office365/enterprise/office-365-germany-endpoints"
    url = [url1, url2, url3, url4, url5]
    create_time = time.strftime('%Y-%m-%d')
    # print(url)
    for case in url:
        # print(case)
        html = request.urlopen(case).read().decode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')
        # Collect every table cell (<td>) on the page
        select = soup.find_all('td')
        # for s in select:
        #     print(s)
        a = []
        for i in range(len(select)):
            if i == 0 or i == 1:
                continue
            elif case == url3 and i == 2:
                continue
            else:
                a.append(select[i].text)
        b = [a[i:i + 5] for i in range(0, len(a), 5)]
        # (company, IP, port, protocol) 4-tuples
        office365_category = ["Exchange Online", "SharePoint Online and OneDrive for Business",
                              "Skype for Business Online and Microsoft Teams", "Microsoft 365 Common and Office Online"]
        app_category_index = 0
        try:
            for item in b:
                # print(item)
                if len(item) != 5:
                    continue
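                # The address cell is a comma-separated list of URLs/CIDRs and the
                # ports cell looks like "TCP: 443, 80", optionally followed by a
                # "UDP: ..." part; normalize the separators, then split them apart.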
                item[3] = item[3].replace(",", ",")
                ip = item[3].split(",")
                item[4] = item[4].replace(":", ":")
                protocol_and_port = item[4].split(":")
                if len(protocol_and_port) < 2:
                    continue
                if case == url1:
                    if item[0] == "9":
                        app_category_index += 1
                    elif item[0] == "7":
                        app_category_index += 1
                    elif item[0] == "11":
                        app_category_index += 1
                elif case == url2:
                    if item[0] == "9":
                        app_category_index += 1
                    elif item[0] == "7":
                        app_category_index += 1
                    elif item[0] == "11":
                        app_category_index += 1
                elif case == url3:
                    if item[0] == "31":
                        app_category_index += 1
                    elif item[0] == "11":
                        app_category_index += 1
                    elif item[0] == "40":
                        app_category_index += 1
                elif case == url4:
                    if item[0] == "4":
                        app_category_index += 1
                    elif item[0] == "3":
                        app_category_index += 1
                    elif item[0] == "6":
                        app_category_index += 1
                else:
                    if item[0] == "8":
                        app_category_index += 1
                    elif item[0] == "6":
                        app_category_index += 1
                    elif item[0] == "18":
                        app_category_index += 1
                protocol_tcp = protocol_and_port[0]
                port_tcp = protocol_and_port[1]
                protocol_udp = ""
                port_udp = ""
                if port_tcp.find("UDP") != -1:
                    protocol_udp = "UDP"
                    port_tcp = port_tcp.replace("UDP", "")
                    port_udp = protocol_and_port[2]
                for i in ip:
                    if i.find("us") != -1 and i.find("/") != -1:
                        continue
                    if i.find(".com") != -1 and i.find("/") != -1:
                        continue
                    if len(protocol_udp) != 0:
                        print("OFFICE365-" + office365_category[
                            app_category_index] + "-" + i + "-" + protocol_udp + "-" + port_udp)
                        cursor.execute(
                            "INSERT INTO application_spider(application,category,ip,protocol,port,create_time) VALUES(%s,%s,%s,%s,%s,%s);",
                            ('OFFICE365', office365_category[app_category_index], i, protocol_udp, port_udp,
                             create_time))
                    print("OFFICE365-" + office365_category[
                        app_category_index] + "-" + i + "-" + protocol_tcp + "-" + port_tcp)
                    cursor.execute(
                        "INSERT INTO application_spider(application,category,ip,protocol,port,create_time) VALUES(%s,%s,%s,%s,%s,%s);",
                        ('OFFICE365', office365_category[app_category_index], i, protocol_tcp, port_tcp, create_time))
                    conn.commit()
        except Exception as e:
            print(e)
            logging.error("office365_spider() raised an exception while crawling; check whether the page has been updated")
            conn.rollback()


def zoom_spider():
    url = "https://support.zoom.us/hc/en-us/articles/201362683-Network-firewall-or-proxy-server-settings-for-Zoom"
    print(url)
    create_time = time.strftime('%Y-%m-%d')
    html = request.urlopen(url).read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    # Collect every table cell (<td>) on the page
    select = soup.find_all('td')

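    # Hand-picked <td> indexes of the rows of interest on the Zoom page; four
    # consecutive cells (protocol, ports, source, destination at the time of
    # writing) are taken starting at each index. These break if the layout changes.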
    index = [4, 8, 12, 20, 24, 28, 32, 40]
    a = []
    for i in range(len(select)):
        if not (i in index):
            continue
        else:
            for k in range(0, 4):
                a.append(select[i].text)
                i += 1
    b = [a[i:i + 4] for i in range(0, len(a), 4)]
    # (company, IP, port, protocol) 4-tuples
    k = 0
    zoom_category = ["Network firewall or web security gateway",
                     "Zoom Phone", "Zoom website"]
    app_category_index = 0
    for item in b:
        if len(item) != 4:
            continue
        k += 1
        # print(item)
        item[1] = item[1].replace("\n", "").replace("\xa0-", ",").replace(" (see note)", "")
        item[3] = item[3].replace("\n", "").replace("IPv4:", " ").replace("IPv6:", " ")
        if k == 4:
            app_category_index += 1
        elif k == 8:
            app_category_index += 1
        if k == 5:
            item[3] = item[3].replace("32", "32 ")
        protocol = item[0]
        port = item[1]
        ip_list = item[3].split(" ")
        # print(ip_list)
        # print(item)
        # print("--------------------------------------")
        try:
            for ip in ip_list:
                if len(ip) == 0:
                    continue
                if ip.count("/") > 1:
                    continue
                print("ZOOM-" + zoom_category[app_category_index] + "-" + ip + "-" + protocol + "-" + port)
                cursor.execute(
                    "INSERT INTO application_spider(application,category,ip,protocol,port,create_time) VALUES(%s,%s,%s,%s,%s,%s);",
                    ('ZOOM', zoom_category[app_category_index], ip, protocol, port, create_time))
                conn.commit()
        except Exception as e:
            print(e)
            logging.error("zoom_spider() raised an exception while crawling; check whether the page has been updated")
            conn.rollback()


def salesforce_spider():
    url = "https://help.salesforce.com/articleView?id=000321501&type=1&mode=1"
    create_time = time.strftime('%Y-%m-%d')
    # Options for running a local headless Chrome (kept for reference; the remote Grid is used instead)
    #chrome_options = webdriver.ChromeOptions()
    #chrome_options.add_argument('--no-sandbox')
    #chrome_options.add_argument('--window-size=1420,1080')
    #chrome_options.add_argument('--headless')
    #chrome_options.add_argument('--disable-gpu')
    #chrome_options.add_argument('--disable-dev-shm-usage')
    #driver = webdriver.Chrome(options=chrome_options)
    driver = webdriver.Remote(
    	command_executor="http://selenium:4444/wd/hub",
    	desired_capabilities=DesiredCapabilities.CHROME
    )
    driver.get(url)
    driver.implicitly_wait(10)
    time.sleep(2)

    source = driver.page_source
    soup = BeautifulSoup(source, 'html.parser')
    select = soup.find_all('td')
    index = 0
    salesforce_category = ["Salesforce", "Community Cloud", "Email Security Filter", "Trialforce organization email",
                           "Chatter and Community mail", "System mail", "Email forwarding"]
    app_category_index = 0
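    # The hard-coded cell indexes below (63, 82, 163, ...) mark where each
    # category's rows begin in the page's flat <td> sequence; they must be
    # updated if the Salesforce article layout changes.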
    try:
        for s in select:
            index += 1
            if index == 63:
                app_category_index += 1
            elif index == 82:
                app_category_index += 1
            elif index == 163:
                app_category_index += 1
            elif index == 206:
                app_category_index += 1
            elif index == 289:
                app_category_index += 1
            elif index == 333:
                app_category_index += 1
            if index < 5 or index > 374:
                continue
            if s.text.find(".") < 3:
                continue
            ip = s.text
            print("SALESFORCE- " + salesforce_category[app_category_index] + "-" + ip + "- " + "TCP/UDP" + "- " + "ANY")
            cursor.execute(
                "INSERT INTO application_spider(application,category,ip,protocol,port,create_time) VALUES(%s,%s,%s,%s,%s,%s);",
                ('SALESFORCE', salesforce_category[app_category_index], ip, "TCP/UDP", "ANY", create_time))
            conn.commit()
    except Exception as e:
        logging.error("salesforce_spider() raised an exception while crawling; check whether the page has been updated")
        conn.rollback()

    driver.quit()


def mysql_close():
    cursor.close()
    conn.close()


def main():
    log_init("spider.log")
    # init mysql
    mysql_init()
    # crawl the three applications
    office365_spider()
    zoom_spider()
    salesforce_spider()
    #time.sleep(3600)
    # close cursor and connection
    mysql_close()


if __name__ == '__main__':
    main()

  • Detection script (detect_spider.py)
from datetime import timedelta, date, datetime
import pymysql
import logging


def log_init(logFilename):
    """ Output log to file and console """
    # Configure a handler that writes to the log file
    logging.basicConfig(
        level=logging.INFO,  # log level written to the file
        format='%(asctime)s  %(levelname)s\t :%(message)s',  # log message format
        datefmt='%Y-%m-%d %A %H:%M:%S',  # timestamp format
        filename=logFilename,  # log file name
        filemode='a')  # write mode: "w" (overwrite) or "a" (append)
    # Define a handler that also prints to the console
    console = logging.StreamHandler()  # console handler
    console.setLevel(logging.INFO)  # handler level
    formatter = logging.Formatter('%(asctime)s  %(levelname)s\t :%(message)s')  # handler format
    console.setFormatter(formatter)
    # Attach the console handler to the root logger
    logging.getLogger().addHandler(console)


def mysql_init():
    global conn, cursor
    # Create the database connection; note the charset and cursorclass arguments
    conn = pymysql.connect(
        host="spider-mysql",
        user="root",
        password="123456",
        database="spider",
        charset='utf8',
        cursorclass=pymysql.cursors.DictCursor)
    # Get a cursor
    cursor = conn.cursor()


def get_last_week_date(_today):
    return (_today - timedelta(days=7)).strftime('%Y-%m-%d')


def get_last_2week_date(_today):
    return (_today - timedelta(days=14)).strftime('%Y-%m-%d')


def check_office365():
    # get today time 2020-08-12
    today = datetime.today()
    today_date = today.strftime('%Y-%m-%d')
    # get last week time
    last_week_date = get_last_week_date(today)
    last_2week_date = get_last_2week_date(today)
    today_count = 0
    last_week_count = 0
    try:
        cursor.execute("select count(*) from application_spider where create_time=%s and application=%s ",
                       (today_date, "OFFICE365"))
        # cursor.execute("select count(*) from application_spider where create_time<=%s ", last_2week_date)
        today_count = cursor.fetchall()
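        # With DictCursor, fetchall() returns a list like [{'count(*)': N}];
        # the string mangling below just extracts N.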
        today_count = int(str(today_count).split(":")[1].replace("}", "").replace("]", ""))

        cursor.execute("select count(*) from application_spider where create_time=%s and application=%s", (last_week_date, "OFFICE365"))
        last_week_count = cursor.fetchall()
        last_week_count = int(str(last_week_count).split(":")[1].replace("}", "").replace("]", ""))
    except Exception as e:
        logging.error(e)
        logging.error("Error while checking OFFICE365 data; check whether the page has been updated")
    logging.info("Last week's date: " + str(last_week_date) + " OFFICE365 rows crawled: " + str(last_week_count))
    logging.info("This week's date: " + str(today_date) + " OFFICE365 rows crawled: " + str(today_count))
    if today_count != last_week_count:
        logging.error("The OFFICE365 page has been updated; check the page and crawl again")
    else:
        logging.info("The OFFICE365 crawl looks consistent; the page has not changed")


def check_zoom():
    # get today time
    today = datetime.today()
    today_date = today.strftime('%Y-%m-%d')
    # get last week time
    last_week_date = get_last_week_date(today)
    last_2week_date = get_last_2week_date(today)
    today_count = 0
    last_week_count = 0
    try:
        cursor.execute("select count(*) from application_spider where create_time=%s and application=%s ",
                       (today_date, "ZOOM"))
        # cursor.execute("select count(*) from application_spider where create_time<=%s ", last_2week_date)
        today_count = cursor.fetchall()
        today_count = int(str(today_count).split(":")[1].replace("}", "").replace("]", ""))

        cursor.execute("select count(*) from application_spider where create_time=%s and application=%s", (last_week_date, "ZOOM"))
        last_week_count = cursor.fetchall()
        last_week_count = int(str(last_week_count).split(":")[1].replace("}", "").replace("]", ""))
    except Exception as e:
        logging.error(e)
        logging.error("Error while checking ZOOM data; check whether the page has been updated")
    logging.info("Last week's date: " + str(last_week_date) + " ZOOM rows crawled: " + str(last_week_count))
    logging.info("This week's date: " + str(today_date) + " ZOOM rows crawled: " + str(today_count))
    if today_count != last_week_count:
        logging.error("The ZOOM page has been updated; check the page and crawl again")
    else:
        logging.info("The ZOOM crawl looks consistent; the page has not changed")


def check_salesforce():
    # get today time
    today = datetime.today()
    today_date = today.strftime('%Y-%m-%d')
    # get last week time
    last_week_date = get_last_week_date(today)
    last_2week_date = get_last_2week_date(today)
    today_count = 0
    last_week_count = 0
    try:
        cursor.execute("select count(*) from application_spider where create_time=%s and application=%s ",
                       (today_date, "SALESFORCE"))
        # cursor.execute("select count(*) from application_spider where create_time<=%s ", last_2week_date)
        today_count = cursor.fetchall()
        today_count = int(str(today_count).split(":")[1].replace("}", "").replace("]", ""))

        cursor.execute("select count(*) from application_spider where create_time=%s and application=%s", (last_week_date, "SALESFORCE"))
        last_week_count = cursor.fetchall()
        last_week_count = int(str(last_week_count).split(":")[1].replace("}", "").replace("]", ""))
    except Exception as e:
        logging.error(e)
        logging.error("Error while checking SALESFORCE data; check whether the page has been updated")
    logging.info("Last week's date: " + str(last_week_date) + " SALESFORCE rows crawled: " + str(last_week_count))
    logging.info("This week's date: " + str(today_date) + " SALESFORCE rows crawled: " + str(today_count))
    if today_count != last_week_count:
        logging.error("The SALESFORCE page has been updated; check the page and crawl again")
    else:
        logging.info("The SALESFORCE crawl looks consistent; the page has not changed")


def mysql_close():
    cursor.close()
    conn.close()


if __name__ == '__main__':
    #start log
    log_init("spider.log")
    #start mysql
    mysql_init()

    #start check
    check_office365()
    check_zoom()
    check_salesforce()

    #close mysql
    mysql_close()

  • Spider log
2020-08-18 Tuesday 09:19:26  DEBUG	 :POST http://172.17.0.3:4444/wd/hub/session {"capabilities": {"firstMatch": [{}], "alwaysMatch": {"browserName": "chrome", "platformName": "any"}}, "desiredCapabilities": {"browserName": "chrome", "version": "", "platform": "ANY"}}
2020-08-18 Tuesday 09:19:26  DEBUG	 :Starting new HTTP connection (1): 172.17.0.3:4444
2020-08-18 Tuesday 09:19:27  DEBUG	 :http://172.17.0.3:4444 "POST /wd/hub/session HTTP/1.1" 200 1074
2020-08-18 Tuesday 09:19:27  DEBUG	 :Finished Request
2020-08-18 Tuesday 09:19:27  DEBUG	 :POST http://172.17.0.3:4444/wd/hub/session/d1d772ae19cbbdd684b658123b676d32/url {"url": "https://help.salesforce.com/articleView?id=000321501&type=1&mode=1"}
2020-08-18 Tuesday 09:19:27  DEBUG	 :Starting new HTTP connection (1): 172.17.0.3:4444
2020-08-18 Tuesday 09:19:31  DEBUG	 :http://172.17.0.3:4444 "POST /wd/hub/session/d1d772ae19cbbdd684b658123b676d32/url HTTP/1.1" 200 14
2020-08-18 Tuesday 09:19:31  DEBUG	 :Finished Request
2020-08-18 Tuesday 09:19:31  DEBUG	 :POST http://172.17.0.3:4444/wd/hub/session/d1d772ae19cbbdd684b658123b676d32/timeouts {"implicit": 10000}
2020-08-18 Tuesday 09:19:31  DEBUG	 :Starting new HTTP connection (1): 172.17.0.3:4444
2020-08-18 Tuesday 09:19:31  DEBUG	 :http://172.17.0.3:4444 "POST /wd/hub/session/d1d772ae19cbbdd684b658123b676d32/timeouts HTTP/1.1" 200 14
2020-08-18 Tuesday 09:19:31  DEBUG	 :Finished Request
2020-08-18 Tuesday 09:19:33  DEBUG	 :GET http://172.17.0.3:4444/wd/hub/session/d1d772ae19cbbdd684b658123b676d32/source {}
2020-08-18 Tuesday 09:19:33  DEBUG	 :Starting new HTTP connection (1): 172.17.0.3:4444
2020-08-18 Tuesday 09:19:34  DEBUG	 :http://172.17.0.3:4444 "GET /wd/hub/session/d1d772ae19cbbdd684b658123b676d32/source HTTP/1.1" 200 400765
2020-08-18 Tuesday 09:19:34  DEBUG	 :Finished Request
2020-08-18 Tuesday 09:19:35  DEBUG	 :DELETE http://172.17.0.3:4444/wd/hub/session/d1d772ae19cbbdd684b658123b676d32 {}
2020-08-18 Tuesday 09:19:35  DEBUG	 :Starting new HTTP connection (1): 172.17.0.3:4444
2020-08-18 Tuesday 09:19:35  DEBUG	 :http://172.17.0.3:4444 "DELETE /wd/hub/session/d1d772ae19cbbdd684b658123b676d32 HTTP/1.1" 200 14
2020-08-18 Tuesday 09:19:35  DEBUG	 :Finished Request
2020-08-18 Tuesday 09:19:36  INFO	 :Last week's date: 2020-08-11 OFFICE365 rows crawled: 0
2020-08-18 Tuesday 09:19:36  INFO	 :This week's date: 2020-08-18 OFFICE365 rows crawled: 12038
2020-08-18 Tuesday 09:19:36  ERROR	 :The OFFICE365 page has been updated; check the page and crawl again
2020-08-18 Tuesday 09:19:36  INFO	 :Last week's date: 2020-08-11 ZOOM rows crawled: 0
2020-08-18 Tuesday 09:19:36  INFO	 :This week's date: 2020-08-18 ZOOM rows crawled: 1890
2020-08-18 Tuesday 09:19:36  ERROR	 :The ZOOM page has been updated; check the page and crawl again
2020-08-18 Tuesday 09:19:36  INFO	 :Last week's date: 2020-08-11 SALESFORCE rows crawled: 0
2020-08-18 Tuesday 09:19:36  INFO	 :This week's date: 2020-08-18 SALESFORCE rows crawled: 1233
2020-08-18 Tuesday 09:19:36  ERROR	 :The SALESFORCE page has been updated; check the page and crawl again