It is well known that crawlers must use dynamic proxies when collecting data from websites, in order to avoid being restricted for visiting too frequently. But when crawling a specific site, 403, 503, or 429 anti-crawler errors can still appear even when a dynamic proxy is used. Why? In our experience, this is generally caused by one of the following:

1. Dynamic user-agent modification

When a crawler collects a website, its HTTP requests should carry a properly set User-Agent (UA) header, because the UA is the browser identifier. If an HTTP request has no UA, or the crawler even identifies itself as a collection tool in the UA, the target website is highly likely to refuse the request.

2. Control the request frequency of a single proxy IP address

Although the crawler uses dynamic proxies, if the program's multi-threading is not controlled well, a single proxy IP will issue a large number of requests in a short time, and the target site will restrict that IP for accessing it too frequently.

3. IP address validity time management

When a dynamic proxy IP address is used, you must check that it is still alive. Once a proxy IP address with high latency or low bandwidth is found, you must actively discard it to avoid timeouts.

If you think the above work is too cumbersome, we recommend using the enhanced automatic-forwarding crawler proxy. This product automatically assigns a different proxy IP to forward each HTTP request and manages the IP pool across multiple threads at the same time, ensuring a request connection rate above 99% and a latency below 300 ms, so that you can quickly start collecting websites. The demo below can be run after configuring the proxy parameters (proxyHost, proxyPort, proxyUser, proxyPass) and targetUrl.

import java.io.BufferedReader; import java.io.InputStreamReader; import java.io.IOException; import java.net.URI; import java.util.Arrays; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import org.apache.http.Header; import org.apache.http.HeaderElement; import org.apache.http.HttpHost; import org.apache.http.auth.AuthScope; import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.client.AuthCache; import org.apache.http.client.CredentialsProvider; import org.apache.http.client.HttpRequestRetryHandler; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.config.AuthSchemes; import org.apache.http.client.entity.GzipDecompressingEntity; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.client.methods.HttpRequestBase; import org.apache.http.client.protocol.HttpClientContext; import org.apache.http.config.Registry; import org.apache.http.config.RegistryBuilder; import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.LayeredConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.impl.auth.BasicScheme; import org.apache.http.impl.client.BasicAuthCache; import org.apache.http.impl.client.BasicCredentialsProvider; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.client.ProxyAuthenticationStrategy; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.message.BasicHeader; import org.apache.http.message.BasicNameValuePair; import org.apache.http.NameValuePair; import org.apache.http.util.EntityUtils; Public class 
Demo {final static String proxyHost = "T.16yun.cn "; final static Integer proxyPort = 31000; Final static String proxyUser = "username"; final static String proxyPass = "password"; private static PoolingHttpClientConnectionManager cm = null; private static HttpRequestRetryHandler httpRequestRetryHandler = null; private static HttpHost proxy = null; private static CredentialsProvider credsProvider = null; private static RequestConfig reqConfig = null; static { ConnectionSocketFactory plainsf = PlainConnectionSocketFactory.getSocketFactory(); LayeredConnectionSocketFactory sslsf = SSLConnectionSocketFactory.getSocketFactory(); Registry registry = RegistryBuilder.create() .register("http", plainsf) .register("https", sslsf) .build(); cm = new PoolingHttpClientConnectionManager(registry); cm.setMaxTotal(20); cm.setDefaultMaxPerRoute(5); proxy = new HttpHost(proxyHost, proxyPort, "http"); credsProvider = new BasicCredentialsProvider(); credsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(proxyUser, proxyPass)); reqConfig = RequestConfig.custom() .setConnectionRequestTimeout(5000) .setConnectTimeout(5000) .setSocketTimeout(5000) .setExpectContinueEnabled(false) .setProxy(new HttpHost(proxyHost, proxyPort)) .build(); } public static void doRequest(HttpRequestBase httpReq) { CloseableHttpResponse httpResp = null; try { setHeaders(httpReq); httpReq.setConfig(reqConfig); CloseableHttpClient httpClient = HttpClients.custom() .setConnectionManager(cm) .setDefaultCredentialsProvider(credsProvider) .build(); AuthCache authCache = new BasicAuthCache(); authCache.put(proxy, new BasicScheme()); HttpClientContext localContext = HttpClientContext.create(); localContext.setAuthCache(authCache); httpResp = httpClient.execute(httpReq, localContext); int statusCode = httpResp.getStatusLine().getStatusCode(); System.out.println(statusCode); BufferedReader rd = new BufferedReader(new InputStreamReader(httpResp.getEntity().getContent())); String line = ""; while((line 
= rd.readLine()) ! = null) { System.out.println(line); } } catch (Exception e) { e.printStackTrace(); } finally { try { if (httpResp ! = null) { httpResp.close(); } } catch (IOException e) { e.printStackTrace(); }}} /** * Set request header ** @param httpReq */ private static void setHeaders(HttpRequestBase httpReq) {// Set proxy-tunnel // Random random = new Random(); // int tunnel = random.nextInt(10000); // httpReq.setHeader("Proxy-Tunnel", String.valueOf(tunnel)); httpReq.setHeader("Accept-Encoding", null); } public static void doGetRequest() {// Target page to access String targetUrl = "https://httpbin.org/ip"; try { HttpGet httpGet = new HttpGet(targetUrl); doRequest(httpGet); } catch (Exception e) { e.printStackTrace(); } } public static void main(String[] args) { doGetRequest(); }}Copy the code