Because so many websites now deploy anti-crawling measures, analyzing each site's JavaScript and other defenses one by one would dramatically increase research and development effort. To get data collection up and running quickly, the Selenium automated testing tool can be used to simulate a real user's browser and, combined with crawler proxy IPs, collect data easily and quickly.
1. Advantages of collecting data with Selenium:
(1) Low research and development investment, and the code is easy to maintain
(2) Convenient for regression testing
(3) Good scalability
(4) Good collection results
2. Setting up the Selenium environment
Installation
pip3 install -r requirements.txt
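The article does not list the contents of requirements.txt; judging from the components used below (Selenium for the browser, Redis for storage, and a small web API on port 5000), it contains at least something along these lines:
selenium
redis
flask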
Install Chrome and download ChromeDriver
Download Chrome: www.google.com/chrome/
Download the ChromeDriver version that matches your Chrome: chromedriver.chromium.org/downloads
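After installing both, a quick smoke test confirms that Chrome and ChromeDriver versions match. This sketch assumes the Selenium 3-style API used in the demo later in this article and that the chromedriver binary sits in the current directory:

from selenium import webdriver

# Point this at the downloaded chromedriver binary
driver = webdriver.Chrome('./chromedriver')
driver.get('https://www.google.com/chrome/')
print(driver.title)
driver.quit()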
Basic configuration
Basic interface configuration (config.py):
# Redis host
REDIS_HOST = 'localhost'
# Redis port
REDIS_PORT = 6379
# Redis password (None if not set)
REDIS_PASSWORD = None
# Browser type used to generate Cookies
BROWSER_TYPE = 'Chrome'
# Generator class; if extending to other sites, configure it here
GENERATOR_MAP = {'weibo': 'WeiboCookiesGenerator'}
# Tester class; if extending to other sites, configure it here
TESTER_MAP = {'weibo': 'WeiboValidTester'}
# URL used to test whether Cookies are valid
TEST_URL_MAP = {'weibo': 'https://m.weibo.cn/api/container/getIndex?uid=1804544030&type=uid&page=1&containerid=1076031804544030'}
# Generator and validator cycle, in seconds
CYCLE = 120
# API service host and port
API_HOST = '0.0.0.0'
API_PORT = 5000
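These Redis settings are where the pool keeps accounts and Cookies. As a quick sanity check, they can be reused directly with the redis-py client; the snippet below is only an illustration, and the keys it prints are simply whatever happens to be stored:

import redis

# Reuse the values from config.py shown above
db = redis.StrictRedis(host='localhost', port=6379, password=None, decode_responses=True)
# List whatever keys the pool has created so far (accounts, cookies, ...)
print(db.keys('*'))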
Process switches
Modify these in config.py:
# Generator switch
GENERATOR_PROCESS = True
# Validator switch: cyclically check whether the Cookies in the database are usable, and delete those that are not
VALID_PROCESS = False
# API interface service switch
API_PROCESS = False
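run.py itself is not reproduced in this article; the standalone sketch below only illustrates how the three switches could gate three processes with multiprocessing (the worker functions are placeholders, not the project's real code):

from multiprocessing import Process

# These mirror the switches in config.py
GENERATOR_PROCESS = True
VALID_PROCESS = False
API_PROCESS = False

def generate_cookies():
    print('Cookies generation process started')

def valid_cookies():
    print('Cookies detection process started')

def api():
    print('API interface started')

if __name__ == '__main__':
    # Each switch decides whether the corresponding process is launched
    for enabled, target in [(GENERATOR_PROCESS, generate_cookies),
                            (VALID_PROCESS, valid_cookies),
                            (API_PROCESS, api)]:
        if enabled:
            Process(target=target).start()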
Import accounts
python3 importer.py
Please enter account----password pairs; enter exit to stop reading.
180000000----16yun
Account 180000000 Password 16yun entered successfully
exit
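importer.py is not reproduced either; a minimal sketch of the interactive loop implied by the output above, splitting each line on the ---- separator (here the pairs go into a dict, whereas the real tool writes them to Redis), could look like this:

accounts = {}

def scan():
    print('Please enter account----password pairs; enter exit to stop reading.')
    while True:
        line = input().strip()
        if line == 'exit':
            break
        account, password = line.split('----', 1)
        # In the real importer this pair would go into Redis; a dict stands in here
        accounts[account] = password
        print('Account', account, 'Password', password, 'entered successfully')

if __name__ == '__main__':
    scan()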
Run
Import some accounts first, then run the following command:
python3 run.py
Running effect
All three processes are enabled:
API interface started
 * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
Cookies generation process started
Cookies detection process started
Generating Cookies for account 180000000, password 16yun
Testing Cookies for user 180000000
Cookies for 180000000 are valid
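With the API process running on API_PORT, the pool can be queried over HTTP. The article does not list the routes, so the /weibo/random path in this sketch is only an assumption about how such a pool is commonly exposed:

import requests

# Hypothetical route; adjust it to whatever the API process actually registers
resp = requests.get('http://127.0.0.1:5000/weibo/random')
print(resp.status_code, resp.text)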
The following demo shows how Selenium uses a proxy IP:
import os
import random
import time
import zipfile
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class GenCookies(object):
    # User-Agent candidates loaded from file; one is picked at random per session
    USER_AGENT = open('useragents.txt').readlines()

    # Proxy server (product website: www.16yun.cn)
    PROXY_HOST = 't.16yun.cn'  # proxy host
    PROXY_PORT = 31111         # port
    PROXY_USER = 'USERNAME'    # username
    PROXY_PASS = 'PASSWORD'    # password

    @classmethod
    def get_chromedriver(cls, use_proxy=False, user_agent=None):
        manifest_json = """
        {
            "version": "1.0.0",
            "manifest_version": 2,
            "name": "Chrome Proxy",
            "permissions": [
                "proxy",
                "tabs",
                "unlimitedStorage",
                "storage",
                "<all_urls>",
                "webRequest",
                "webRequestBlocking"
            ],
            "background": {
                "scripts": ["background.js"]
            },
            "minimum_chrome_version": "22.0.0"
        }
        """
        background_js = """
        var config = {
            mode: "fixed_servers",
            rules: {
                singleProxy: {
                    scheme: "http",
                    host: "%s",
                    port: parseInt(%s)
                },
                bypassList: ["localhost"]
            }
        };
        chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});
        function callbackFn(details) {
            return {
                authCredentials: {
                    username: "%s",
                    password: "%s"
                }
            };
        }
        chrome.webRequest.onAuthRequired.addListener(
            callbackFn,
            {urls: ["<all_urls>"]},
            ['blocking']
        );
        """ % (cls.PROXY_HOST, cls.PROXY_PORT, cls.PROXY_USER, cls.PROXY_PASS)
        path = os.path.dirname(os.path.abspath(__file__))
        chrome_options = webdriver.ChromeOptions()
        # Turn off some WebDriver automation flags
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])

        if use_proxy:
            # Pack the proxy settings and credentials into a Chrome extension and load it
            pluginfile = 'proxy_auth_plugin.zip'
            with zipfile.ZipFile(pluginfile, 'w') as zp:
                zp.writestr("manifest.json", manifest_json)
                zp.writestr("background.js", background_js)
            chrome_options.add_extension(pluginfile)
        if user_agent:
            chrome_options.add_argument('--user-agent=%s' % user_agent)

        driver = webdriver.Chrome(
            os.path.join(path, 'chromedriver'),
            chrome_options=chrome_options)

        # Override the navigator.webdriver getter before any page script runs
        script = '''
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
        '''
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": script})
        return driver
    def __init__(self, username, password):
        # Example site login page
        self.url = 'passport.example.cn/signin/logi…'
        self.browser = self.get_chromedriver(
            use_proxy=True,
            # Pick one User-Agent at random from the loaded list
            user_agent=random.choice(self.USER_AGENT).strip())
        self.wait = WebDriverWait(self.browser, 20)
        self.username = username
        self.password = password
    def open(self):
        """
        Open the page, enter the username and password, and click submit.
        :return: None
        """
        self.browser.delete_all_cookies()
        self.browser.get(self.url)
        username = self.wait.until(EC.presence_of_element_located((By.ID, 'loginName')))
        password = self.wait.until(EC.presence_of_element_located((By.ID, 'loginPassword')))
        submit = self.wait.until(EC.element_to_be_clickable((By.ID, 'loginAction')))
        username.send_keys(self.username)
        password.send_keys(self.password)
        time.sleep(1)
        submit.click()
    def password_error(self):
        """
        Check whether the username or password is wrong.
        :return:
        """
        try:
            return WebDriverWait(self.browser, 5).until(
                EC.text_to_be_present_in_element((By.ID, 'errorMsg'), 'username or password error'))
        except TimeoutException:
            return False
    def get_cookies(self):
        """
        Get the Cookies from the browser session.
        :return:
        """
        return self.browser.get_cookies()
    def main(self):
        """
        Entry point: log in and return the Cookies.
        :return:
        """
        self.open()
        if self.password_error():
            return {
                'status': 2,
                'content': 'wrong username or password'
            }
        cookies = self.get_cookies()
        return {
            'status': 1,
            'content': cookies
        }
if __name__ == '__main__':
    result = GenCookies(
        username='180000000',
        password='16yun',
    ).main()
    print(result)
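Beyond the original demo, the returned cookies can also be reused outside Selenium, for example with the requests library against the test URL from the configuration above:

import requests

TEST_URL = ('https://m.weibo.cn/api/container/getIndex?'
            'uid=1804544030&type=uid&page=1&containerid=1076031804544030')

result = GenCookies(username='180000000', password='16yun').main()
if result['status'] == 1:
    # Selenium returns a list of cookie dicts; convert it to name -> value pairs
    jar = {c['name']: c['value'] for c in result['content']}
    resp = requests.get(TEST_URL, cookies=jar)
    print(resp.status_code)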