1.cebox安装docker引擎
#1.安装Docker Engine
sudo yum install /path/to/package.rpm
#2.启动Docker
sudo systemctl start docker
#3.测试是否安装正常
sudo docker run hello-world
2.启动selenium容器
docker run -d -p 4444:4444 --name selenium --shm-size=2g selenium/standalone-chrome:4.0.0-alpha-6-20200730
3.启动自动化生成数据库的mysql
-
1.编写Dockerfile
#基础镜像使用 mysql:latest FROM mysql:latest # ##作者 MAINTAINER jinsong_yan <jinsong_yan@trendmicro.com> # ##定义会被容器自动执行的目录 ENV AUTO_RUN_DIR /docker-entrypoint-initdb.d # ##定义初始化sql文件 ENV INSTALL_DB_SQL spider.sql # ##把要执行的sql文件放到/docker-entrypoint-initdb.d/目录下,容器会自动执行这个sql COPY ./$INSTALL_DB_SQL $AUTO_RUN_DIR/ # ##给执行文件增加可执行权限 RUN chmod a+x $AUTO_RUN_DIR/$INSTALL_DB_SQL
-
2.spider.sql文件
-- 建库 CREATE DATABASE IF NOT EXISTS spider default charset utf8 COLLATE utf8_general_ci; -- 切换数据库 use spider; -- 建表 DROP TABLE IF EXISTS `application_spider`; CREATE TABLE `application_spider` ( `id` bigint(20) NOT NULL AUTO_INCREMENT, `application` varchar(255) NOT NULL, `category` varchar(255) NOT NULL, `ip` varchar(255) NOT NULL, `protocol` varchar(255) NOT NULL, `port` varchar(255) NOT NULL, `create_time` date NOT NULL, PRIMARY KEY (`id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8; -- 插入数据
-
3.生成镜像
docker build -t spider-mysql .
-
4.根据镜像生成容器
docker run --name=spider-mysql -p 3306:3306 -e MYSQL_ROOT_PASSWORD=123456 -d spider-mysql
4.启动python爬虫
-
1.编写Dockerfile
# 基于镜像基础 FROM python:3.7 #指定工作目录 WORKDIR /app ADD ./ ./ #创建挂载点 VOLUME ["/usr/dockerSpider","/app"] # 安装所需的包 RUN pip install -r requirements.txt ENTRYPOINT ["/bin/bash"] #CMD ["python", "application_spider.py"] && ["python", "detect_spider.py"] CMD ["spider-start.sh"]
-
2.编写requirements.txt
astroid==2.4.2 beautifulsoup4==4.9.1 certifi==2020.6.20 cffi==1.14.0 chardet==3.0.4 colorama==0.4.3 common==0.1.2 idna==2.10 Interface==2.11.1 isort==4.3.21 lazy-object-proxy==1.4.3 mccabe==0.6.1 nose==1.3.7 pycparser==2.20 pylint==2.5.3 PyMySQL==0.10.0 requests==2.24.0 selenium==3.141.0 six==1.15.0 soupsieve==2.0.1 toml==0.10.1 typed-ast==1.4.1 urllib3==1.25.9 wrapt==1.12.1 zope.event==4.4 zope.interface==5.1.0 zope.schema==6.0.0
-
3.编写spider-start.sh
#!/bin/bash python application_spider.py python detect_spider.py
-
4.生成镜像
docker build -t spider_project .
-
5.启动容器
# --link能和selenium、spider-mysql直接通信;--name使后面的crontab任务可以按名字docker start docker run --name=spider_project -v /usr/dockerSpider:/app --link selenium --link spider-mysql spider_project
5.定时任务
1.打开添加定时任务文本
crontab -e
2.新增定时任务 每周一17:00执行
0 17 * * 1 docker start spider_project
3.查看定时任务
crontab -l
6.附件
- 爬虫爬取部分
import logging
from urllib import request
from bs4 import BeautifulSoup
import pymysql
import time
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
def log_init(logFilename):
    """Configure root logging to write to both a file and the console.

    INFO-level records are appended to ``logFilename``; an extra
    StreamHandler mirrors the same format to the console.
    """
    log_format = '%(asctime)s %(levelname)s\t :%(message)s'
    logging.basicConfig(
        level=logging.INFO,
        format=log_format,
        datefmt='%Y-%m-%d %A %H:%M:%S',
        filename=logFilename,
        filemode='a',  # append so consecutive runs share one log
    )
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    stream_handler.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(stream_handler)
def mysql_init():
    """Connect to the ``spider`` MySQL database and create a cursor.

    Populates the module-level globals ``conn`` and ``cursor``; rows are
    returned as dicts because of ``DictCursor``.
    """
    global conn, cursor
    connection = pymysql.connect(host="spider-mysql",
                                 user="root",
                                 password="123456",
                                 database="spider",
                                 charset='utf8',
                                 cursorclass=pymysql.cursors.DictCursor)
    conn = connection
    cursor = connection.cursor()
def office365_spider():
    """Scrape the Microsoft Office 365 endpoint pages and persist IP/port rows.

    Five endpoint pages (GCC-High, DoD, Worldwide, 21Vianet, Germany) are
    fetched with urllib, their ``<td>`` cells are grouped into 5-column rows,
    and each IP / protocol / port combination is inserted into
    ``application_spider``.  On any parsing or DB error the current page's
    transaction is rolled back and an error is logged.
    """
    url1 = "https://docs.microsoft.com/en-us/office365/enterprise/office-365-u-s-government-gcc-high-endpoints"
    url2 = "https://docs.microsoft.com/en-us/office365/enterprise/office-365-u-s-government-dod-endpoints"
    url3 = "https://docs.microsoft.com/en-us/office365/enterprise/urls-and-ip-address-ranges"
    url4 = "https://docs.microsoft.com/en-us/office365/enterprise/urls-and-ip-address-ranges-21vianet"
    url5 = "https://docs.microsoft.com/en-us/office365/enterprise/office-365-germany-endpoints"
    urls = [url1, url2, url3, url4, url5]
    create_time = time.strftime('%Y-%m-%d')
    # First-column row ids at which each page switches to the next service
    # category (replaces the original per-page chained elif blocks).
    boundary_ids = {
        url1: ("9", "7", "11"),
        url2: ("9", "7", "11"),
        url3: ("31", "11", "40"),
        url4: ("4", "3", "6"),
        url5: ("8", "6", "18"),
    }
    office365_category = ["Exchange Online", "SharePoint Online and OneDrive for Business",
                          "Skype for Business Online and Microsoft Teams",
                          "Microsoft 365 Common and Office Online"]
    insert_sql = ("INSERT INTO application_spider(application,category,ip,protocol,port,create_time)"
                  " VALUES(%s,%s,%s,%s,%s,%s);")
    for case in urls:
        html = request.urlopen(case).read().decode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')
        cells = soup.find_all('td')
        texts = []
        for i, cell in enumerate(cells):
            # skip leading header cells (two on every page, three on url3)
            if i in (0, 1) or (case == url3 and i == 2):
                continue
            texts.append(cell.text)
        # each logical table row is five consecutive cells
        rows = [texts[i:i + 5] for i in range(0, len(texts), 5)]
        app_category_index = 0
        try:
            for item in rows:
                if len(item) != 5:
                    continue
                # normalize full-width punctuation before splitting
                item[3] = item[3].replace(",", ",")
                ip_list = item[3].split(",")
                item[4] = item[4].replace(":", ":")
                protocol_and_port = item[4].split(":")
                if len(protocol_and_port) < 2:
                    continue
                if item[0] in boundary_ids[case]:
                    app_category_index += 1
                protocol_tcp = protocol_and_port[0]
                port_tcp = protocol_and_port[1]
                protocol_udp = ""
                port_udp = ""
                if "UDP" in port_tcp:
                    # row carries both TCP and UDP ports, e.g. "TCP: 443 UDP: 3478"
                    protocol_udp = "UDP"
                    port_tcp = port_tcp.replace("UDP", "")
                    port_udp = protocol_and_port[2]
                for ip in ip_list:
                    # presumably skips URL-style (domain) entries and keeps
                    # bare IP ranges -- TODO confirm against the pages
                    if "us" in ip and "/" in ip:
                        continue
                    if ".com" in ip and "/" in ip:
                        continue
                    if protocol_udp:
                        print("OFFICE365-" + office365_category[app_category_index] + "-" + ip
                              + "-" + protocol_udp + "-" + port_udp)
                        # BUG FIX: the UDP insert used a hard-coded date
                        # "2020-07-22"; it now records create_time like
                        # the TCP insert below.
                        cursor.execute(insert_sql,
                                       ('OFFICE365', office365_category[app_category_index], ip,
                                        protocol_udp, port_udp, create_time))
                    print("OFFICE365-" + office365_category[app_category_index] + "-" + ip
                          + "-" + protocol_tcp + "-" + port_tcp)
                    cursor.execute(insert_sql,
                                   ('OFFICE365', office365_category[app_category_index], ip,
                                    protocol_tcp, port_tcp, create_time))
            conn.commit()
        except Exception as e:
            print(e)
            logging.error("office365_spider()爬取发生异常,请检查页面是否更新")
            conn.rollback()
def zoom_spider():
    """Scrape Zoom's firewall/proxy settings page and store IP ranges.

    Selected ``<td>`` offsets mark the start of rows of interest; each
    row spans four consecutive cells (protocol, port, -, IP list).
    Inserts one record per IP into ``application_spider``.
    """
    url = "https://support.zoom.us/hc/en-us/articles/201362683-Network-firewall-or-proxy-server-settings-for-Zoom"
    print(url)
    create_time = time.strftime('%Y-%m-%d')
    html = request.urlopen(url).read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    cells = soup.find_all('td')
    # starting cell offsets of the table rows we care about
    row_starts = [4, 8, 12, 20, 24, 28, 32, 40]
    flat = []
    for start in range(len(cells)):
        if start not in row_starts:
            continue
        # pull the four consecutive cells that make up one logical row
        for offset in range(0, 4):
            flat.append(cells[start + offset].text)
    rows = [flat[j:j + 4] for j in range(0, len(flat), 4)]
    row_no = 0
    zoom_category = ["Network firewall or web security gateway",
                     "Zoom Phone", "Zoom website"]
    app_category_index = 0
    for item in rows:
        if len(item) != 4:
            continue
        row_no += 1
        item[1] = item[1].replace("\n", "").replace("\xa0-", ",").replace(" (see note)", "")
        item[3] = item[3].replace("\n", "").replace("IPv4:", " ").replace("IPv6:", " ")
        # rows 4 and 8 mark the switch to the next category
        if row_no == 4:
            app_category_index += 1
        elif row_no == 8:
            app_category_index += 1
        if row_no == 5:
            # re-insert the separator this particular row is missing
            item[3] = item[3].replace("32", "32 ")
        protocol = item[0]
        port = item[1]
        ip_list = item[3].split(" ")
        try:
            for ip in ip_list:
                if not ip:
                    continue
                if ip.count("/") > 1:
                    continue
                print("ZOOM-" + zoom_category[app_category_index] + "-" + ip + "-" + protocol + "-" + port)
                cursor.execute(
                    "INSERT INTO application_spider(application,category,ip,protocol,port,create_time) VALUES(%s,%s,%s,%s,%s,%s);",
                    ('ZOOM', zoom_category[app_category_index], ip, protocol, port, create_time))
            conn.commit()
        except Exception as e:
            print(e)
            logging.error("zoom_spider()爬取发生异常,请检查页面是否更新")
            conn.rollback()
def salesforce_spider():
    """Scrape Salesforce IP-range tables via the remote Selenium Chrome.

    The page is JavaScript-rendered, so it is fetched through the
    ``selenium`` container instead of urllib.  Every qualifying ``<td>``
    value is stored as a TCP/UDP, any-port record per category.
    """
    url = "https://help.salesforce.com/articleView?id=000321501&type=1&mode=1"
    create_time = time.strftime('%Y-%m-%d')
    driver = webdriver.Remote(
        command_executor="http://selenium:4444/wd/hub",
        desired_capabilities=DesiredCapabilities.CHROME
    )
    # BUG FIX: the session is now always released -- previously an
    # exception in driver.get()/page_source skipped driver.quit() and
    # leaked the remote browser session.
    try:
        driver.get(url)
        driver.implicitly_wait(10)
        time.sleep(2)  # give the page's JS time to render the tables
        source = driver.page_source
    finally:
        driver.quit()
    soup = BeautifulSoup(source, 'html.parser')
    cells = soup.find_all('td')
    index = 0
    salesforce_category = ["Salesforce", "Community Cloud", "Email Security Filter", "Trialforce organization email",
                           "Chatter and Community mail", "System mail", "Email forwarding"]
    app_category_index = 0
    try:
        for s in cells:
            index += 1
            # hard-coded cell offsets where the page switches category
            if index in (63, 82, 163, 206, 289, 333):
                app_category_index += 1
            # only cells 5..374 hold IP data
            if index < 5 or index > 374:
                continue
            # keep only dotted values (crude IP/domain filter)
            if s.text.find(".") < 3:
                continue
            ip = s.text
            print("SALESFORCE- " + salesforce_category[app_category_index] + "-" + ip + "- " + "TCP/UDP" + "- " + "ANY")
            cursor.execute(
                "INSERT INTO application_spider(application,category,ip,protocol,port,create_time) VALUES(%s,%s,%s,%s,%s,%s);",
                ('SALESFORCE', salesforce_category[app_category_index], ip, "TCP/UDP", "ANY", create_time))
        conn.commit()
    except Exception as e:
        # BUG FIX: the caught exception was silently dropped before
        logging.error(e)
        logging.error("salesforce_spider()爬取发生异常,请检查页面是否更新")
        conn.rollback()
def mysql_close():
    """Close the module-global cursor and connection opened by mysql_init()."""
    cursor.close()
    conn.close()
def main():
    """Entry point: configure logging and MySQL, run all three spiders, clean up."""
    log_init("spider.log")  # fixed: removed stray trailing semicolon
    mysql_init()
    # Crawl the three supported applications.
    office365_spider()
    zoom_spider()
    salesforce_spider()
    # Release the DB cursor/connection.
    mysql_close()


if __name__ == '__main__':
    main()
- 爬虫检测部分
from datetime import timedelta, date, datetime
import pymysql
import logging
def log_init(logFilename):
    """Set up INFO-level logging to *logFilename* (append mode) plus the console."""
    fmt = '%(asctime)s %(levelname)s\t :%(message)s'
    logging.basicConfig(level=logging.INFO,
                        format=fmt,
                        datefmt='%Y-%m-%d %A %H:%M:%S',
                        filename=logFilename,
                        filemode='a')
    # Mirror the file format on a console handler attached to root.
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    console.setFormatter(logging.Formatter(fmt))
    logging.getLogger().addHandler(console)
def mysql_init():
    """Open the shared connection/cursor to the ``spider`` database.

    Sets the module-level globals ``conn`` and ``cursor``; ``DictCursor``
    makes every fetched row a dict keyed by column name.
    """
    global conn, cursor
    db = pymysql.connect(host="spider-mysql",
                         user="root",
                         password="123456",
                         database="spider",
                         charset='utf8',
                         cursorclass=pymysql.cursors.DictCursor)
    conn = db
    cursor = db.cursor()
def get_last_week_date(_today):
    """Return the date 7 days before *_today* formatted as 'YYYY-MM-DD'."""
    one_week_ago = _today - timedelta(days=7)
    return one_week_ago.strftime('%Y-%m-%d')
def get_last_2week_date(_today):
    """Return the date 14 days before *_today* formatted as 'YYYY-MM-DD'."""
    two_weeks_ago = _today - timedelta(days=14)
    return two_weeks_ago.strftime('%Y-%m-%d')
def check_office365():
    """Compare this week's OFFICE365 row count against last week's.

    A differing count suggests the endpoint pages changed, so an ERROR is
    logged asking for a manual re-crawl; equal counts log INFO.
    """
    today = datetime.today()
    today_date = today.strftime('%Y-%m-%d')
    last_week_date = get_last_week_date(today)
    today_count = 0
    last_week_count = 0
    try:
        # FIX: COUNT(*) is aliased so the DictCursor row can be read by key,
        # instead of string-parsing the repr of fetchall() (brittle).
        cursor.execute(
            "select count(*) as cnt from application_spider where create_time=%s and application=%s",
            (today_date, "OFFICE365"))
        today_count = cursor.fetchone()["cnt"]
        cursor.execute(
            "select count(*) as cnt from application_spider where create_time=%s and application=%s",
            (last_week_date, "OFFICE365"))
        last_week_count = cursor.fetchone()["cnt"]
    except Exception as e:
        logging.error(e)
        logging.error("检测OFFICE365数据发生异常,请检查页面是否更新")
    logging.info("上周日期:" + str(last_week_date) + " OFFICE365爬取条数为:" + str(last_week_count))
    logging.info("本周日期:" + str(today_date) + " OFFICE365爬取条数为:" + str(today_count))
    if today_count != last_week_count:
        logging.error("OFFICE365页面结果已更新,请检查页面并重新爬取")
    else:
        logging.info("OFFICE365本次爬取结果正确,网页未更新")
def check_zoom():
    """Compare this week's ZOOM row count against last week's.

    A differing count suggests the Zoom page changed, so an ERROR is
    logged asking for a manual re-crawl; equal counts log INFO.
    """
    today = datetime.today()
    today_date = today.strftime('%Y-%m-%d')
    last_week_date = get_last_week_date(today)
    today_count = 0
    last_week_count = 0
    try:
        # FIX: COUNT(*) is aliased so the DictCursor row can be read by key,
        # instead of string-parsing the repr of fetchall() (brittle).
        cursor.execute(
            "select count(*) as cnt from application_spider where create_time=%s and application=%s",
            (today_date, "ZOOM"))
        today_count = cursor.fetchone()["cnt"]
        cursor.execute(
            "select count(*) as cnt from application_spider where create_time=%s and application=%s",
            (last_week_date, "ZOOM"))
        last_week_count = cursor.fetchone()["cnt"]
    except Exception as e:
        logging.error(e)
        logging.error("检测ZOOM数据发生异常,请检查页面是否更新")
    logging.info("上周日期:" + str(last_week_date) + " ZOOM爬取条数为:" + str(last_week_count))
    logging.info("本周日期:" + str(today_date) + " ZOOM爬取条数为:" + str(today_count))
    if today_count != last_week_count:
        logging.error("ZOOM页面结果已更新,请检查页面并重新爬取")
    else:
        logging.info("ZOOM本次爬取结果正确,网页未更新")
def check_salesforce():
    """Compare this week's SALESFORCE row count against last week's.

    A differing count suggests the Salesforce page changed, so an ERROR is
    logged asking for a manual re-crawl; equal counts log INFO.
    """
    today = datetime.today()
    today_date = today.strftime('%Y-%m-%d')
    last_week_date = get_last_week_date(today)
    today_count = 0
    last_week_count = 0
    try:
        # FIX: COUNT(*) is aliased so the DictCursor row can be read by key,
        # instead of string-parsing the repr of fetchall() (brittle).
        cursor.execute(
            "select count(*) as cnt from application_spider where create_time=%s and application=%s",
            (today_date, "SALESFORCE"))
        today_count = cursor.fetchone()["cnt"]
        cursor.execute(
            "select count(*) as cnt from application_spider where create_time=%s and application=%s",
            (last_week_date, "SALESFORCE"))
        last_week_count = cursor.fetchone()["cnt"]
    except Exception as e:
        logging.error(e)
        # BUG FIX: this message previously said ZOOM (copy-paste error)
        logging.error("检测SALESFORCE数据发生异常,请检查页面是否更新")
    logging.info("上周日期:" + str(last_week_date) + " SALESFORCE爬取条数为:" + str(last_week_count))
    logging.info("本周日期:" + str(today_date) + " SALESFORCE爬取条数为:" + str(today_count))
    if today_count != last_week_count:
        logging.error("SALESFORCE页面结果已更新,请检查页面并重新爬取")
    else:
        logging.info("SALESFORCE本次爬取结果正确,网页未更新")
def mysql_close():
    """Close the module-global cursor and connection opened by mysql_init()."""
    cursor.close()
    conn.close()
if __name__ == '__main__':
    # Configure file + console logging.
    log_init("spider.log")
    # Open the shared MySQL connection/cursor.
    mysql_init()
    # Verify each application's scrape via week-over-week row counts.
    check_office365()
    check_zoom()
    check_salesforce()
    # Release DB resources.
    mysql_close()
- 爬虫日志
2020-08-18 Tuesday 09:19:26 DEBUG :POST http://172.17.0.3:4444/wd/hub/session {"capabilities": {"firstMatch": [{}], "alwaysMatch": {"browserName": "chrome", "platformName": "any"}}, "desiredCapabilities": {"browserName": "chrome", "version": "", "platform": "ANY"}}
2020-08-18 Tuesday 09:19:26 DEBUG :Starting new HTTP connection (1): 172.17.0.3:4444
2020-08-18 Tuesday 09:19:27 DEBUG :http://172.17.0.3:4444 "POST /wd/hub/session HTTP/1.1" 200 1074
2020-08-18 Tuesday 09:19:27 DEBUG :Finished Request
2020-08-18 Tuesday 09:19:27 DEBUG :POST http://172.17.0.3:4444/wd/hub/session/d1d772ae19cbbdd684b658123b676d32/url {"url": "https://help.salesforce.com/articleView?id=000321501&type=1&mode=1"}
2020-08-18 Tuesday 09:19:27 DEBUG :Starting new HTTP connection (1): 172.17.0.3:4444
2020-08-18 Tuesday 09:19:31 DEBUG :http://172.17.0.3:4444 "POST /wd/hub/session/d1d772ae19cbbdd684b658123b676d32/url HTTP/1.1" 200 14
2020-08-18 Tuesday 09:19:31 DEBUG :Finished Request
2020-08-18 Tuesday 09:19:31 DEBUG :POST http://172.17.0.3:4444/wd/hub/session/d1d772ae19cbbdd684b658123b676d32/timeouts {"implicit": 10000}
2020-08-18 Tuesday 09:19:31 DEBUG :Starting new HTTP connection (1): 172.17.0.3:4444
2020-08-18 Tuesday 09:19:31 DEBUG :http://172.17.0.3:4444 "POST /wd/hub/session/d1d772ae19cbbdd684b658123b676d32/timeouts HTTP/1.1" 200 14
2020-08-18 Tuesday 09:19:31 DEBUG :Finished Request
2020-08-18 Tuesday 09:19:33 DEBUG :GET http://172.17.0.3:4444/wd/hub/session/d1d772ae19cbbdd684b658123b676d32/source {}
2020-08-18 Tuesday 09:19:33 DEBUG :Starting new HTTP connection (1): 172.17.0.3:4444
2020-08-18 Tuesday 09:19:34 DEBUG :http://172.17.0.3:4444 "GET /wd/hub/session/d1d772ae19cbbdd684b658123b676d32/source HTTP/1.1" 200 400765
2020-08-18 Tuesday 09:19:34 DEBUG :Finished Request
2020-08-18 Tuesday 09:19:35 DEBUG :DELETE http://172.17.0.3:4444/wd/hub/session/d1d772ae19cbbdd684b658123b676d32 {}
2020-08-18 Tuesday 09:19:35 DEBUG :Starting new HTTP connection (1): 172.17.0.3:4444
2020-08-18 Tuesday 09:19:35 DEBUG :http://172.17.0.3:4444 "DELETE /wd/hub/session/d1d772ae19cbbdd684b658123b676d32 HTTP/1.1" 200 14
2020-08-18 Tuesday 09:19:35 DEBUG :Finished Request
2020-08-18 Tuesday 09:19:36 INFO :上周日期:2020-08-11 OFFICE365爬取条数为:0
2020-08-18 Tuesday 09:19:36 INFO :本周日期:2020-08-18 OFFICE365爬取条数为:12038
2020-08-18 Tuesday 09:19:36 ERROR :OFFICE365页面结果已更新,请检查页面并重新爬取
2020-08-18 Tuesday 09:19:36 INFO :上周日期:2020-08-11 ZOOM爬取条数为:0
2020-08-18 Tuesday 09:19:36 INFO :本周日期:2020-08-18 ZOOM爬取条数为:1890
2020-08-18 Tuesday 09:19:36 ERROR :ZOOM页面结果已更新,请检查页面并重新爬取
2020-08-18 Tuesday 09:19:36 INFO :上周日期:2020-08-11 SALESFORCE爬取条数为:0
2020-08-18 Tuesday 09:19:36 INFO :本周日期:2020-08-18 SALESFORCE爬取条数为:1233
2020-08-18 Tuesday 09:19:36 ERROR :SALESFORCE页面结果已更新,请检查页面并重新爬取