Because many websites deploy anti-crawling measures, analyzing each site's JavaScript and other protection strategies would dramatically increase the research and development workload. To start collecting project data quickly, the Selenium automated testing tool can be used to simulate a real user's browser and, combined with crawler proxy IPs, collect data quickly and easily.

1. Advantages of Selenium crawler collection:

(1) Low research and development investment; the code is easy to maintain

(2) Convenient regression testing

(3) Good scalability

(4) Good collection effect

2. Setting up Selenium environment

Installation

pip3 install -r requirements.txt

Install Chrome and download ChromeDriver

Download Chrome: www.google.com/chrome/

Download the matching ChromeDriver version: chromedriver.chromium.org/downloads

Basic configuration

Basic Interface Configuration

# Basic interface configuration.
#
# NOTE(review): this listing was collapsed onto a single line in the source
# text, which turned everything after the first "#" into a comment.  It has
# been split back into one setting per line.  Two values were lost in the
# source and are flagged below instead of being guessed.

# Redis host
REDIS_HOST = 'localhost'

# Redis port
REDIS_PORT = 6379

# Redis password
# NOTE(review): the password assignment is missing from the source text --
# restore it here before use.

# Browser type (presumably the browser driven by the generator -- the original
# comment was lost; confirm)
BROWSER_TYPE = 'Chrome'

# Generator class for each site; if extending to other sites, configure them here
GENERATOR_MAP = {'weibo': 'WeiboCookiesGenerator'}

# Tester class for each site; if extending to other sites, configure them here
# NOTE(review): the TESTER_MAP value was truncated in the source text
# ("TESTER_MAP = {'weibo':") -- restore the tester class name before use.

# Test URL for each site
TEST_URL_MAP = {
    'weibo': 'https://m.weibo.cn/api/container/getIndex?uid=1804544030&type=uid&page=1&containerid=1076031804544030',
}

# Generator and validator cycle
CYCLE = 120

# API host and port
API_HOST = '0.0.0.0'
API_PORT = 5000

Process switch

Modify the following switches in config.py:

# Process switches.
#
# NOTE(review): in the source text this listing was collapsed into a single
# comment line, so none of the switches was actually defined.

# Generator switch (the original description was lost in the source text)
GENERATOR_PROCESS = True

# Validator switch: cyclically check whether the Cookies in the database are
# still usable and delete the ones that are not
VALID_PROCESS = False

# API interface service switch
API_PROCESS = False

Import account

python3 importer.py

Please enter account/password pairs; enter "exit" to stop reading.
180000000----16yun
Account 180000000 Password 16yun
Recorded successfully
exit

Run

Import some accounts before running, then run the following command:

python3 run.py

Running effect

All three processes are enabled:

API interface started
 * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
Cookies generation process started
Cookies detection process started
Generating Cookies for account 180000000 password 16yun
Testing Cookies for user name 180000000
Cookies valid 180000000

The following demo shows how to use a proxy IP with Selenium:

import os
import random
import time
import zipfile

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

class GenCookies(object):
    """Log in to a site through a proxied Chrome session and return its cookies.

    ``main()`` drives the whole flow: it opens the login page, submits the
    credentials and returns either the browser cookies or an error status.

    The Chrome instance created in ``__init__`` is never closed by this class;
    call ``self.browser.quit()`` when you are done with it.
    """

    # Candidate User-Agent strings, one per line of useragents.txt.  The raw
    # lines are kept as read (trailing newlines included); they are cleaned up
    # when one of them is picked in _pick_user_agent().  If the file does not
    # exist the list is empty and Chrome keeps its default User-Agent.
    try:
        with open('useragents.txt') as _ua_file:
            USER_AGENT = _ua_file.readlines()
        del _ua_file
    except FileNotFoundError:
        USER_AGENT = []

    # Proxy server (vendor site: www.16yun.cn)
    PROXY_HOST = 't.16yun.cn'  # proxy host name or IP
    PROXY_PORT = 31111  # port
    PROXY_USER = 'USERNAME'  # username
    PROXY_PASS = 'PASSWORD'  # password

    # manifest.json of the throw-away Chrome extension that configures the
    # proxy.  It must be valid JSON with lower-case keys.
    _MANIFEST_JSON = """
    {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "Chrome Proxy",
        "permissions": [
            "proxy",
            "tabs",
            "unlimitedStorage",
            "storage",
            "<all_urls>",
            "webRequest",
            "webRequestBlocking"
        ],
        "background": {
            "scripts": ["background.js"]
        },
        "minimum_chrome_version": "22.0.0"
    }
    """

    # background.js template for the same extension; the four %s placeholders
    # are, in order: proxy host, proxy port, proxy username, proxy password.
    _BACKGROUND_JS_TEMPLATE = """
    var config = {
        mode: "fixed_servers",
        rules: {
            singleProxy: {
                scheme: "http",
                host: "%s",
                port: parseInt(%s)
            },
            bypassList: ["localhost"]
        }
    };

    chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

    function callbackFn(details) {
        return {
            authCredentials: {
                username: "%s",
                password: "%s"
            }
        };
    }

    chrome.webRequest.onAuthRequired.addListener(
        callbackFn,
        {urls: ["<all_urls>"]},
        ['blocking']
    );
    """

    @classmethod
    def _proxy_extension_files(cls):
        """Return ``{file name: content}`` for the proxy-auth Chrome extension."""
        background_js = cls._BACKGROUND_JS_TEMPLATE % (
            cls.PROXY_HOST, cls.PROXY_PORT, cls.PROXY_USER, cls.PROXY_PASS)
        return {
            'manifest.json': cls._MANIFEST_JSON,
            'background.js': background_js,
        }

    @staticmethod
    def _pick_user_agent(user_agent):
        """Reduce *user_agent* to a single clean User-Agent string.

        :param user_agent: ``None``, one User-Agent string, or an iterable of
            candidate strings (for example the lines of useragents.txt).
        :return: the stripped string, a randomly chosen non-blank candidate,
            or ``None`` when nothing usable was supplied.
        """
        if not user_agent:
            return None
        if isinstance(user_agent, str):
            return user_agent.strip() or None
        candidates = [line.strip() for line in user_agent if line.strip()]
        return random.choice(candidates) if candidates else None

    @classmethod
    def get_chromedriver(cls, use_proxy=False, user_agent=None):
        """Create a Chrome WebDriver, optionally proxied and with a custom User-Agent.

        :param use_proxy: when true, write ``proxy_auth_plugin.zip`` into the
            current working directory and load it as an extension so that
            Chrome uses ``PROXY_HOST``/``PROXY_PORT`` with the configured
            credentials.
        :param user_agent: a User-Agent string, or a list of candidates from
            which one is chosen at random; falsy keeps Chrome's default.
        :return: the started ``webdriver.Chrome`` instance.
        """
        driver_dir = os.path.dirname(os.path.abspath(__file__))
        chrome_options = webdriver.ChromeOptions()

        # Turn off the "enable-automation" switch that WebDriver normally sets.
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])

        if use_proxy:
            pluginfile = 'proxy_auth_plugin.zip'
            with zipfile.ZipFile(pluginfile, 'w') as zp:
                for file_name, content in cls._proxy_extension_files().items():
                    zp.writestr(file_name, content)
            chrome_options.add_extension(pluginfile)

        # Formatting a whole list into the switch would produce a garbage
        # User-Agent, so reduce it to exactly one clean string first.
        chosen_user_agent = cls._pick_user_agent(user_agent)
        if chosen_user_agent:
            chrome_options.add_argument('--user-agent=%s' % chosen_user_agent)

        # chromedriver is expected to sit next to this file.  ``options=`` is
        # used instead of the deprecated ``chrome_options=`` keyword.
        driver = webdriver.Chrome(
            os.path.join(driver_dir, 'chromedriver'),
            options=chrome_options)

        # Make navigator.webdriver read as undefined on every new document.
        script = '''
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
        '''
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": script})
        return driver

    def __init__(self, username, password):
        """Start a proxied browser and remember the credentials to log in with.

        :param username: account name typed into the login form.
        :param password: password typed into the login form.
        """
        # Login page of the target site.
        # NOTE(review): the URL was truncated ("passport.example.cn/signin/logi...")
        # and had no scheme in the source text; restore the full login URL
        # (including any query string) before use.
        self.url = 'https://passport.example.cn/signin/login'
        self.browser = self.get_chromedriver(use_proxy=True, user_agent=self.USER_AGENT)
        self.wait = WebDriverWait(self.browser, 20)
        self.username = username
        self.password = password

    def open(self):
        """
        Open the login page, enter the username and password and click submit.

        :return: None
        """
        self.browser.delete_all_cookies()
        self.browser.get(self.url)
        username = self.wait.until(EC.presence_of_element_located((By.ID, 'loginName')))
        password = self.wait.until(EC.presence_of_element_located((By.ID, 'loginPassword')))
        submit = self.wait.until(EC.element_to_be_clickable((By.ID, 'loginAction')))
        username.send_keys(self.username)
        password.send_keys(self.password)
        time.sleep(1)
        submit.click()

    def password_error(self):
        """
        Check whether the page reports a wrong username or password.

        :return: a truthy value if the error message shows up in the
            ``errorMsg`` element within 5 seconds, otherwise False
        """
        try:
            return WebDriverWait(self.browser, 5).until(
                EC.text_to_be_present_in_element((By.ID, 'errorMsg'), 'username or password error'))
        except TimeoutException:
            return False

    def get_cookies(self):
        """
        Get the cookies of the current browser session.

        :return: whatever ``self.browser.get_cookies()`` returns
        """
        return self.browser.get_cookies()

    def main(self):
        """
        Entry point: log in and report the outcome.

        :return: ``{'status': 2, 'content': <error text>}`` when the login is
            rejected, otherwise ``{'status': 1, 'content': <cookies>}``
        """
        self.open()
        if self.password_error():
            return {
                'status': 2,
                'content': 'Wrong username or password'
            }
        cookies = self.get_cookies()
        return {
            'status': 1,
            'content': cookies
        }

# Script entry point: log in with the sample account and print the result
# dictionary returned by GenCookies.main().
if __name__ == '__main__':
    result = GenCookies(
        username='180000000',
        password='16yun',
    ).main()
    print(result)