1. Install the Docker engine (CentOS)
#1. Install the Docker Engine
sudo yum install /path/to/package.rpm
#2. Start the Docker service
sudo systemctl start docker
#3. Verify that the installation works
sudo docker run hello-world
2. Start the Selenium container
docker run -d -p 4444:4444 --name selenium --shm-size=2g selenium/standalone-chrome:4.0.0-alpha-6-20200730
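Before pointing the crawler at the container, it helps to confirm the Grid is actually accepting sessions. A quick check, assuming the default 4444 port mapping above; the status endpoint should report "ready": true once sessions can be created:
# The selenium container should show up as running
docker ps --filter name=selenium
# Query the Grid status endpoint
curl -s http://localhost:4444/wd/hub/status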
3. Start MySQL with automatic database initialization
1. Write Dockerfile
# Base image: mysql:latest
FROM mysql:latest

# Author
MAINTAINER jinsong_yan <[email protected]>

# Directory whose scripts the container runs automatically on first start
ENV AUTO_RUN_DIR /docker-entrypoint-initdb.d

# The initialization SQL file
ENV INSTALL_DB_SQL spider.sql

# Copy the SQL file into /docker-entrypoint-initdb.d/; the container executes it automatically
COPY ./$INSTALL_DB_SQL $AUTO_RUN_DIR/

# Add execute permission to the file
RUN chmod a+x $AUTO_RUN_DIR/$INSTALL_DB_SQL
2. The spider.sql file
-- Create the database
CREATE DATABASE IF NOT EXISTS spider DEFAULT CHARSET utf8 COLLATE utf8_general_ci;

-- Use the database
USE spider;

-- Create the table
DROP TABLE IF EXISTS `application_spider`;
CREATE TABLE `application_spider` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT,
  `application` varchar(255) NOT NULL,
  `category` varchar(255) NOT NULL,
  `ip` varchar(255) NOT NULL,
  `protocol` varchar(255) NOT NULL,
  `port` varchar(255) NOT NULL,
  `create_time` date NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- Insert initial data here if needed
3. Generate an image
docker build -t spider-mysql .
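Optionally, sanity-check that spider.sql was baked into the image before creating a container; this sketch assumes the official mysql entrypoint simply passes a non-mysqld command through:
# List the auto-executed init scripts inside the freshly built image
docker run --rm spider-mysql ls /docker-entrypoint-initdb.d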
4. Create a container from the image
docker run --name=spider-mysql -p 3306:3306 -e MYSQL_ROOT_PASSWORD=123456 -d spider-mysql
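Once initialization finishes, the database and table created by spider.sql can be checked from the host (same root password as in the run command above):
# Confirm the spider database and application_spider table exist
docker exec spider-mysql mysql -uroot -p123456 -e "SHOW DATABASES; USE spider; SHOW TABLES;"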
4. Start the Python crawler
1. Write Dockerfile
# Base image
FROM python:3.7

# Working directory
WORKDIR /app
ADD ./ ./

# Create a mount point
VOLUME ["/usr/dockerSpider", "/app"]

# Install the required packages
RUN pip install -r requirements.txt

ENTRYPOINT ["/bin/bash"]
# CMD ["python", "application_spider.py"] && ["python", "detect_spider.py"]
CMD ["spider-start.sh"]
2. Write requirements.txt
astroid==2.4.2
beautifulsoup4==4.9.1
certifi==2020.6.20
cffi==1.14.0
chardet==3.0.4
colorama==0.4.3
common==0.1.2
idna==2.10
interface==2.11.1
isort==4.3.21
lazy-object-proxy==1.4.3
mccabe==0.6.1
nose==1.3.7
pycparser==2.20
pylint==2.5.3
PyMySQL==0.10.0
requests==2.24.0
selenium==3.141.0
six==1.15.0
soupsieve==2.0.1
toml==0.10.1
typed-ast==1.4.1
urllib3==1.25.9
wrapt==1.12.1
zope.event==4.4
zope.interface==5.1.0
zope.schema==6.0.0
3. Write spider-start.sh
#!/bin/bash
python application_spider.py
python detect_spider.py
4. Generate an image
docker build -t spider-project .
5. Start the container
# --link lets the crawler container reach the selenium and spider-mysql containers by name
docker run -v /usr/dockerSpider:/app --link selenium --link spider-mysql spider-project
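The command above lets Docker pick a random container name; if you additionally pass a name (spider-run here is only an illustrative choice), it is easy to run detached and follow the crawler's output:
# Run with an explicit name, then follow the crawler's stdout
docker run -d --name spider-run -v /usr/dockerSpider:/app --link selenium --link spider-mysql spider-project
docker logs -f spider-run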
5. Scheduled tasks
1. Open the crontab editor to add a scheduled task: crontab -e
2. Add a task that runs the crawler every Monday at 17:00 (see the sketch below).
3. List the scheduled tasks to confirm: crontab -l
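For reference, a crontab entry for that schedule might look like the sketch below; "0 17 * * 1" means 17:00 every Monday, and the docker run flags should mirror step 4.5 (the --rm and --name options are only illustrative):
# m h dom mon dow  command
0 17 * * 1 docker run --rm --name spider-weekly -v /usr/dockerSpider:/app --link selenium --link spider-mysql spider-project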
6. Appendix
- The crawler script (application_spider.py)
import logging
from urllib import request
from bs4 import BeautifulSoup
import pymysql
import time
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
def log_init(logFilename):
""" Output log to file and console """
# Define a Handler and set a format which output to file
logging.basicConfig(
level=logging.INFO, # log level for messages written to the file
format='%(asctime)s %(levelname)s\t :%(message)s', # log message format
datefmt='%Y-%m-%d %A %H:%M:%S', # timestamp format
filename=logFilename, # log file name
filemode='a') # write mode: "w" (overwrite) or "a" (append)
# Define a Handler and set a format which output to console
console = logging.StreamHandler() # console handler
console.setLevel(logging.INFO) # log level for this handler
formatter = logging.Formatter('%(asctime)s %(levelname)s\t :%(message)s') # format for this handler
console.setFormatter(formatter)
# Create an instance
logging.getLogger().addHandler(console) # attach the handler to the root logger
def mysql_init():
global conn, cursor
# Create the database connection; note the extra charset and cursorclass parameters
conn = pymysql.connect(
host="spider-mysql",
user="root",
password="123456",
database="spider",
charset='utf8',
cursorclass=pymysql.cursors.DictCursor)
# Get a cursor
cursor = conn.cursor()
def office365_spider():
url1 = "https://docs.microsoft.com/en-us/office365/enterprise/office-365-u-s-government-gcc-high-endpoints"
url2 = "https://docs.microsoft.com/en-us/office365/enterprise/office-365-u-s-government-dod-endpoints"
url3 = "https://docs.microsoft.com/en-us/office365/enterprise/urls-and-ip-address-ranges"
url4 = "https://docs.microsoft.com/en-us/office365/enterprise/urls-and-ip-address-ranges-21vianet"
url5 = "https://docs.microsoft.com/en-us/office365/enterprise/office-365-germany-endpoints"
url = [url1, url2, url3, url4, url5]
create_time = time.strftime('%Y-%m-%d')
# print(url)
for case in url:
# print(case)
html = request.urlopen(case).read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser')
# Grab every <td> cell from the page's tables
select = soup.find_all('td')
# for s in select:
# print(s)
a = []
for i in range(len(select)):
if i == 0 or i == 1:
continue
elif case == url3 and i == 2:
continue
else:
a.append(select[i].text)
b = [a[i:i + 5] for i in range(0, len(a), 5)]
# application, IP, port, protocol quadruples
office365_category = ["Exchange Online", "SharePoint Online and OneDrive for Business",
"Skype for Business Online and Microsoft Teams", "Microsoft 365 Common and Office Online"]
app_category_index = 0
try:
for item in b:
# print(item)
if len(item) != 5:
continue
item[3] = item[3].replace(",", ",")
ip = item[3].split(",")
item[4] = item[4].replace(":", ":")
protocol_and_port = item[4].split(":")
if len(protocol_and_port) < 2:
continue
if case == url1:
if item[0] == "9":
app_category_index += 1
elif item[0] == "7":
app_category_index += 1
elif item[0] == "11":
app_category_index += 1
elif case == url2:
if item[0] == "9":
app_category_index += 1
elif item[0] == "7":
app_category_index += 1
elif item[0] == "11":
app_category_index += 1
elif case == url3:
if item[0] == "31":
app_category_index += 1
elif item[0] == "11":
app_category_index += 1
elif item[0] == "40":
app_category_index += 1
elif case == url4:
if item[0] == "4":
app_category_index += 1
elif item[0] == "3":
app_category_index += 1
elif item[0] == "6":
app_category_index += 1
else:
if item[0] == "8":
app_category_index += 1
elif item[0] == "6":
app_category_index += 1
elif item[0] == "18":
app_category_index += 1
protocol_tcp = protocol_and_port[0]
port_tcp = protocol_and_port[1]
protocol_udp = ""
port_udp = ""
if port_tcp.find("UDP") != -1:
protocol_udp = "UDP"
port_tcp = port_tcp.replace("UDP", "")
port_udp = protocol_and_port[2]
for i in ip:
if i.find("us") != -1 and i.find("/") != -1:
continue
if i.find(".com") != -1 and i.find("/") != -1:
continue
if len(protocol_udp) != 0:
print("OFFICE365-" + office365_category[
app_category_index] + "-" + i + "-" + protocol_udp + "-" + port_udp)
cursor.execute(
"INSERT INTO application_spider(application,category,ip,protocol,port,create_time) VALUES(%s,%s,%s,%s,%s,%s);",
('OFFICE365', office365_category[app_category_index], i, protocol_udp, port_udp,
"2020-07-22"))
print("OFFICE365-" + office365_category[
app_category_index] + "-" + i + "-" + protocol_tcp + "-" + port_tcp)
cursor.execute(
"INSERT INTO application_spider(application,category,ip,protocol,port,create_time) VALUES(%s,%s,%s,%s,%s,%s);",
('OFFICE365', office365_category[app_category_index], i, protocol_tcp, port_tcp, create_time))
conn.commit()
except Exception as e:
print(e)
logging.error("office365_spider()爬取发生异常,请检查页面是否更新")
conn.rollback()
def zoom_spider():
url = "https://support.zoom.us/hc/en-us/articles/201362683-Network-firewall-or-proxy-server-settings-for-Zoom"
print(url)
create_time = time.strftime('%Y-%m-%d')
html = request.urlopen(url).read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser')
# Grab every <td> cell from the page's tables
select = soup.find_all('td')
index = [4, 8, 12, 20, 24, 28, 32, 40]
a = []
for i in range(len(select)):
if not (i in index):
continue
else:
for k in range(0, 4):
a.append(select[i].text)
i += 1
b = [a[i:i + 4] for i in range(0, len(a), 4)]
# application, IP, port, protocol quadruples
k = 0
zoom_category = ["Network firewall or web security gateway",
"Zoom Phone", "Zoom website"]
app_category_index = 0
for item in b:
if len(item) != 4:
continue
k += 1
# print(item)
item[1] = item[1].replace("\n", "").replace("\xa0-", ",").replace(" (see note)", "")
item[3] = item[3].replace("\n", "").replace("IPv4:", " ").replace("IPv6:", " ")
if k == 4:
app_category_index += 1
elif k == 8:
app_category_index += 1
if k == 5:
item[3] = item[3].replace("32", "32 ")
protocol = item[0]
port = item[1]
ip_list = item[3].split(" ")
# print(ip_list)
# print(item)
# print("--------------------------------------")
try:
for ip in ip_list:
if len(ip) == 0:
continue
if ip.count("/") > 1:
continue
print("ZOOM-" + zoom_category[app_category_index] + "-" + ip + "-" + protocol + "-" + port)
cursor.execute(
"INSERT INTO application_spider(application,category,ip,protocol,port,create_time) VALUES(%s,%s,%s,%s,%s,%s);",
('ZOOM', zoom_category[app_category_index], ip, protocol, port, create_time))
conn.commit()
except Exception as e:
print(e)
logging.error("zoom_spider()爬取发生异常,请检查页面是否更新")
conn.rollback()
def salesforce_spider():
url = "https://help.salesforce.com/articleView?id=000321501&type=1&mode=1"
create_time = time.strftime('%Y-%m-%d')
# Options for a local headless Chrome (kept for reference; the remote Selenium container is used instead)
#chrome_options = webdriver.ChromeOptions()
#chrome_options.add_argument('--no-sandbox')
#chrome_options.add_argument('--window-size=1420,1080')
#chrome_options.add_argument('--headless')
#chrome_options.add_argument('--disable-gpu')
#chrome_options.add_argument('--disable-dev-shm-usage')
#driver = webdriver.Chrome(options=chrome_options)
driver = webdriver.Remote(
command_executor="http://selenium:4444/wd/hub",
desired_capabilities=DesiredCapabilities.CHROME
)
driver.get(url)
driver.implicitly_wait(10)
time.sleep(2)
source = driver.page_source
soup = BeautifulSoup(source, 'html.parser')
select = soup.find_all('td')
index = 0
salesforce_category = ["Salesforce", "Community Cloud", "Email Security Filter", "Trialforce organization email",
"Chatter and Community mail", "System mail", "Email forwarding"]
app_category_index = 0
try:
for s in select:
index += 1
if index == 63:
app_category_index += 1
elif index == 82:
app_category_index += 1
elif index == 163:
app_category_index += 1
elif index == 206:
app_category_index += 1
elif index == 289:
app_category_index += 1
elif index == 333:
app_category_index += 1
if index < 5 or index > 374:
continue
if s.text.find(".") < 3:
continue
ip = s.text
print("SALESFORCE- " + salesforce_category[app_category_index] + "-" + ip + "- " + "TCP/UDP" + "- " + "ANY")
cursor.execute(
"INSERT INTO application_spider(application,category,ip,protocol,port,create_time) VALUES(%s,%s,%s,%s,%s,%s);",
('SALESFORCE', salesforce_category[app_category_index], ip, "TCP/UDP", "ANY", create_time))
conn.commit()
except Exception as e:
logging.error("salesforce_spider()爬取发生异常,请检查页面是否更新")
conn.rollback()
driver.quit()
def mysql_close():
cursor.close()
conn.close()
def main():
log_init("spider.log")
#mysql init
mysql_init()
# spider 3 application
office365_spider()
zoom_spider()
salesforce_spider()
#time.sleep(3600)
# close cursor and connection
mysql_close()
if __name__ == '__main__':
main()
- The detection script (detect_spider.py)
from datetime import timedelta, date, datetime
import pymysql
import logging
def log_init(logFilename):
""" Output log to file and console """
# Define a Handler and set a format which output to file
logging.basicConfig(
level=logging.INFO, # log level for messages written to the file
format='%(asctime)s %(levelname)s\t :%(message)s', # log message format
datefmt='%Y-%m-%d %A %H:%M:%S', # timestamp format
filename=logFilename, # log file name
filemode='a') # write mode: "w" (overwrite) or "a" (append)
# Define a Handler and set a format which output to console
console = logging.StreamHandler() # console handler
console.setLevel(logging.INFO) # log level for this handler
formatter = logging.Formatter('%(asctime)s %(levelname)s\t :%(message)s') # format for this handler
console.setFormatter(formatter)
# Create an instance
logging.getLogger().addHandler(console) # attach the handler to the root logger
def mysql_init():
global conn, cursor
# Create the database connection; note the extra charset and cursorclass parameters
conn = pymysql.connect(
host="spider-mysql",
user="root",
password="123456",
database="spider",
charset='utf8',
cursorclass=pymysql.cursors.DictCursor)
# Get a cursor
cursor = conn.cursor()
def get_last_week_date(_today):
return (_today - timedelta(days=7)).strftime('%Y-%m-%d')
def get_last_2week_date(_today):
return (_today - timedelta(days=14)).strftime('%Y-%m-%d')
def check_office365():
# get today time 2020-08-12
today = datetime.today()
today_date = today.strftime('%Y-%m-%d')
# get last week time
last_week_date = get_last_week_date(today)
last_2week_date = get_last_2week_date(today)
today_count = 0
last_week_count = 0
try:
cursor.execute("select count(*) from application_spider where create_time=%s and application=%s ",
(today_date, "OFFICE365"))
# cursor.execute("select count(*) from application_spider where create_time<=%s ", last_2week_date)
today_count = cursor.fetchall()
today_count = int(str(today_count).split(":")[1].replace("}", "").replace("]", ""))
cursor.execute("select count(*) from application_spider where create_time=%s and application=%s", (last_week_date, "OFFICE365"))
last_week_count = cursor.fetchall()
last_week_count = int(str(last_week_count).split(":")[1].replace("}", "").replace("]", ""))
except Exception as e:
logging.error(e)
logging.error("检测OFFICE365数据发生异常,请检查页面是否更新")
logging.info("上周日期:" + str(last_week_date) + " OFFICE365爬取条数为:" + str(last_week_count))
logging.info("本周日期:" + str(today_date) + " OFFICE365爬取条数为:" + str(today_count))
if today_count != last_week_count:
logging.error("OFFICE365页面结果已更新,请检查页面并重新爬取")
else:
logging.info("OFFICE365本次爬取结果正确,网页未更新")
def check_zoom():
# get today time
today = datetime.today()
today_date = today.strftime('%Y-%m-%d')
# get last week time
last_week_date = get_last_week_date(today)
last_2week_date = get_last_2week_date(today)
today_count = 0
last_week_count = 0
try:
cursor.execute("select count(*) from application_spider where create_time=%s and application=%s ",
(today_date, "ZOOM"))
# cursor.execute("select count(*) from application_spider where create_time<=%s ", last_2week_date)
today_count = cursor.fetchall()
today_count = int(str(today_count).split(":")[1].replace("}", "").replace("]", ""))
cursor.execute("select count(*) from application_spider where create_time=%s and application=%s", (last_week_date, "ZOOM"))
last_week_count = cursor.fetchall()
last_week_count = int(str(last_week_count).split(":")[1].replace("}", "").replace("]", ""))
except Exception as e:
logging.error(e)
logging.error("检测ZOOM数据发生异常,请检查页面是否更新")
logging.info("上周日期:" + str(last_week_date) + " ZOOM爬取条数为:" + str(last_week_count))
logging.info("本周日期:" + str(today_date) + " ZOOM爬取条数为:" + str(today_count))
if today_count != last_week_count:
logging.error("ZOOM页面结果已更新,请检查页面并重新爬取")
else:
logging.info("ZOOM本次爬取结果正确,网页未更新")
def check_salesforce():
# get today time
today = datetime.today()
today_date = today.strftime('%Y-%m-%d')
# get last week time
last_week_date = get_last_week_date(today)
last_2week_date = get_last_2week_date(today)
today_count = 0
last_week_count = 0
try:
cursor.execute("select count(*) from application_spider where create_time=%s and application=%s ",
(today_date, "SALESFORCE"))
# cursor.execute("select count(*) from application_spider where create_time<=%s ", last_2week_date)
today_count = cursor.fetchall()
today_count = int(str(today_count).split(":")[1].replace("}", "").replace("]", ""))
cursor.execute("select count(*) from application_spider where create_time=%s and application=%s", (last_week_date, "SALESFORCE"))
last_week_count = cursor.fetchall()
last_week_count = int(str(last_week_count).split(":")[1].replace("}", "").replace("]", ""))
except Exception as e:
logging.error(e)
logging.error("检测ZOOM数据发生异常,请检查页面是否更新")
logging.info("上周日期:" + str(last_week_date) + " SALESFORCE爬取条数为:" + str(last_week_count))
logging.info("本周日期:" + str(today_date) + " SALESFORCE爬取条数为:" + str(today_count))
if today_count != last_week_count:
logging.error("SALESFORCE页面结果已更新,请检查页面并重新爬取")
else:
logging.info("SALESFORCE本次爬取结果正确,网页未更新")
def mysql_close():
cursor.close()
conn.close()
if __name__ == '__main__':
#start log
log_init("spider.log")
#start mysql
mysql_init()
#start check
check_office365()
check_zoom()
check_salesforce()
#close mysql
mysql_close()
- The crawler log
2020-08-18 Tuesday 09:19:26 DEBUG :POST http://172.17.0.3:4444/wd/hub/session {"capabilities": {"firstMatch": [{}], "alwaysMatch": {"browserName": "chrome", "platformName": "any"}}, "desiredCapabilities": {"browserName": "chrome", "version": "", "platform": "ANY"}}
2020-08-18 Tuesday 09:19:26 DEBUG :Starting new HTTP connection (1): 172.17.0.3:4444
2020-08-18 Tuesday 09:19:27 DEBUG :http://172.17.0.3:4444 "POST /wd/hub/session HTTP/1.1" 200 1074
2020-08-18 Tuesday 09:19:27 DEBUG :Finished Request
2020-08-18 Tuesday 09:19:27 DEBUG :POST http://172.17.0.3:4444/wd/hub/session/d1d772ae19cbbdd684b658123b676d32/url {"url": "https://help.salesforce.com/articleView?id=000321501&type=1&mode=1"}
2020-08-18 Tuesday 09:19:27 DEBUG :Starting new HTTP connection (1): 172.17.0.3:4444
2020-08-18 Tuesday 09:19:31 DEBUG :http://172.17.0.3:4444 "POST /wd/hub/session/d1d772ae19cbbdd684b658123b676d32/url HTTP/1.1" 200 14
2020-08-18 Tuesday 09:19:31 DEBUG :Finished Request
2020-08-18 Tuesday 09:19:31 DEBUG :POST http://172.17.0.3:4444/wd/hub/session/d1d772ae19cbbdd684b658123b676d32/timeouts {"implicit": 10000}
2020-08-18 Tuesday 09:19:31 DEBUG :Starting new HTTP connection (1): 172.17.0.3:4444
2020-08-18 Tuesday 09:19:31 DEBUG :http://172.17.0.3:4444 "POST /wd/hub/session/d1d772ae19cbbdd684b658123b676d32/timeouts HTTP/1.1" 200 14
2020-08-18 Tuesday 09:19:31 DEBUG :Finished Request
2020-08-18 Tuesday 09:19:33 DEBUG :GET http://172.17.0.3:4444/wd/hub/session/d1d772ae19cbbdd684b658123b676d32/source {}
2020-08-18 Tuesday 09:19:33 DEBUG :Starting new HTTP connection (1): 172.17.0.3:4444
2020-08-18 Tuesday 09:19:34 DEBUG :http://172.17.0.3:4444 "GET /wd/hub/session/d1d772ae19cbbdd684b658123b676d32/source HTTP/1.1" 200 400765
2020-08-18 Tuesday 09:19:34 DEBUG :Finished Request
2020-08-18 Tuesday 09:19:35 DEBUG :DELETE http://172.17.0.3:4444/wd/hub/session/d1d772ae19cbbdd684b658123b676d32 {}
2020-08-18 Tuesday 09:19:35 DEBUG :Starting new HTTP connection (1): 172.17.0.3:4444
2020-08-18 Tuesday 09:19:35 DEBUG :http://172.17.0.3:4444 "DELETE /wd/hub/session/d1d772ae19cbbdd684b658123b676d32 HTTP/1.1" 200 14
2020-08-18 Tuesday 09:19:35 DEBUG :Finished Request
2020-08-18 Tuesday 09:19:36 INFO :This week (2020-08-18): OFFICE365 rows scraped: 12038
2020-08-18 Tuesday 09:19:36 ERROR :The OFFICE365 page has changed; check the page and re-crawl
2020-08-18 Tuesday 09:19:36 ERROR :The ZOOM page has changed; check the page and re-crawl
2020-08-18 Tuesday 09:19:36 INFO :This week (2020-08-18): SALESFORCE rows scraped: 1233
2020-08-18 Tuesday 09:19:36 ERROR :The SALESFORCE page has changed; check the page and re-crawl